2.13.4-1
Optimize CUDA graph launch; avoid launching a CPU callback for intra-node operations.
Simplify kernel common code to improve the latency of send/recv operations.
Strengthen CUDA streams semantics.
Change NET API to v6, to add dmabuf support.
Add ncclGetLastError() function.
Add ncclRemoteError code and use it for remote network errors.
Support the use of a different NCCL_NET parameter per communicator.
Add support for SHM and P2P transfers using cudaMemcpy.
This commit is contained in: parent 7aa1c46fd5, commit 19ab67d172
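Among the user-visible changes listed above are the new ncclGetLastError() call and the ncclRemoteError result code. The sketch below is not part of this commit's diff; it is a minimal example, assuming an already-initialized communicator and stream, of how a caller might combine the two. ncclGetLastError() returns the last human-readable warning recorded by the library, which is what the new ncclLastError buffer in src/debug.cc feeds.

```c
#include <stdio.h>
#include <nccl.h>

// Minimal sketch (not part of this commit's diff): reacting to the new
// ncclRemoteError code and retrieving the last human-readable error string.
// `comm` and `stream` are assumed to be initialized elsewhere.
static ncclResult_t allreduceWithErrorReport(const void* sendbuf, void* recvbuf,
                                             size_t count, ncclComm_t comm,
                                             cudaStream_t stream) {
  ncclResult_t res = ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
  if (res == ncclRemoteError) {
    // New in 2.13: the failure was reported by a remote peer or the network.
    fprintf(stderr, "NCCL remote error: %s\n", ncclGetLastError(comm));
  } else if (res != ncclSuccess) {
    fprintf(stderr, "NCCL error %s: %s\n", ncclGetErrorString(res), ncclGetLastError(comm));
  }
  return res;
}
```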
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 12
NCCL_PATCH := 12
NCCL_MINOR := 13
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
@@ -10,7 +10,8 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -105,6 +105,7 @@ static void *bootstrapRoot(void* args) {
do {
struct ncclSocket sock;
sock.abortFlag = NULL;
/* bootstrap root thread always uses blocking ncclSocketAccept. */
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
close(sock.fd);
@@ -139,6 +140,7 @@ static void *bootstrapRoot(void* args) {
int next = (r+1) % nranks;
struct ncclSocket sock;
sock.abortFlag = NULL;
sock.asyncFlag = 0;
memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
@@ -316,6 +318,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
sock.abortFlag = state->abortFlag;
sock.asyncFlag = 0;
memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketConnect(&sock));
NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
@@ -8,75 +8,54 @@
#include "param.h"
#include "gdrwrap.h"

// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);

ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
struct ncclChannel* channel = &comm->channels[channelId];
if (channel->id != -1) return ncclSuccess;
channel->id = channelid;

// Ring index to user rank table.
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
int nRanks = comm->nRanks;
channel->id = channelId;
channel->workFifoSent = 0;

// Communication structures with peers.
NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
for (size_t i=0; i<comm->nRanks+1; ++i) {
for (int b=0; b<NCCL_MAX_CONNS; b++) {
channel->peers[i].send[b].comm = comm;
channel->peers[i].recv[b].comm = comm;
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));

// The extra on nRanks+1 is for collnet root (i.e. network)
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.stream));
ncclCommPushCudaFree(comm, channel->devPeers);

channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream));
ncclCommPushCudaFree(comm, channel->devRingUserRanks);

NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));

for (int r=0; r < nRanks+1; ++r) {
for (int b=0; b < NCCL_MAX_CONNS; b++) {
channel->peers[r].send[b].comm = comm;
channel->peers[r].recv[b].comm = comm;
}
}

// Per-channel operation list.
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
// GDRCOPY support
// We allocate a workFifo in GDR mapped CUDA memory
// But we still allocate the Host workFifo so that we
// can copy the work elements to CUDA memory on kernel launch
NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
} else {
// The device workFifo is the Host one
channel->workFifoDev = channel->workFifo;
}

return ncclSuccess;
}

ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (channel->id == -1) return ncclSuccess;
// Operation list
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
if (channel->gdrMemDesc) {
// GDRCOPY support
NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc));
}

// Free Ring index to rank tables
free(channel->ring.userRanks);
CUDACHECK(cudaFree(channel->ring.devUserRanks));

// Free transport proxy resources
// Note: free all send resources first due to CollNet arrangement
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
}

// Free the peer structures.
CUDACHECK(cudaFree(channel->devPeers));
free(channel->peers);

return ncclSuccess;
}
@@ -12,11 +12,11 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->devUserRanks;
const int *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
@@ -12,7 +12,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
@@ -97,7 +97,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
@@ -169,7 +169,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
@@ -290,7 +290,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS);
const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -12,7 +12,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
@@ -20,8 +20,8 @@ namespace {
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
const int root = args->root;

T *inputBuf = (T*)args->sendbuff;
@@ -19,90 +19,6 @@

#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree

__device__ inline bool barrierReduceAny(int bit) {
uint32_t popc;
asm ("{"
".reg .pred barr_pred;"
"setp.eq.u32 barr_pred, %1, 1;"
"bar.red.popc.u32 %0, 2, barr_pred;"
"}" : "=r"(popc) : "r"(bit));
return popc != 0;
}

// Copy src to dst and fill extra size with zeroes
template<typename Tdst, typename Tsrc>
__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) {
static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
"copyToShmem needs sizes which are multiple of 16B");
static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle");
uint64_t *d = reinterpret_cast<uint64_t*>(dst);
uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
uint64_t *shmemPtr = shmemCvtPtr(d);
int offset = 2*tid;
uint64_t v0, v1;
if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) {
v0 = v1 = 0ULL;
} else {
v0 = s[offset] ; v1 = s[offset+1];
}
if (offset < sizeof(Tdst)/sizeof(uint64_t)) storeShmem128(shmemPtr+offset, v0, v1);
}

template<typename T>
__device__ int copyToShmem(T *dst, T const *src, int turn=0) {
static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
uint64_t *d = reinterpret_cast<uint64_t*>(dst);
uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
int t = threadIdx.x - turn;
if (t < 0) t += blockDim.x;
int n = sizeof(T)/sizeof(uint64_t);

int delta = (n + WARP_SIZE-1) & -WARP_SIZE; // round up to warp lane 0
if (delta < blockDim.x) {
turn += delta;
if (turn >= blockDim.x) turn -= blockDim.x;
}
else
turn = 0;

n -= t;
d += t;
s += t;
#pragma unroll
for (int i=0; i < divUp(sizeof(T), WARP_SIZE*sizeof(uint64_t)); i++) {
if (n > 0) {
*d = *s;
d += blockDim.x;
s += blockDim.x;
n -= blockDim.x;
}
}
return turn;
}

template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkElement {
__device__ void run(ncclWorkElem*) {
// Put NOT IMPLEMENTED behavior here.
}
};

template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
int wid = threadIdx.x / WARP_SIZE;
int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1;
#pragma unroll 1
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) {
if (wid < w->header.nWarps)
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
}
}
};

typedef void(*ncclKern_t)();
extern __device__ ncclKern_t ncclFuncs[];
@@ -120,15 +36,62 @@ struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
};
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
struct ncclDevComm comm;
struct ncclChannel channel;
uint64_t pad;
struct ncclWork work;
int channelId;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
alignas(16) struct ncclWork work;
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");

extern __shared__ ncclShmemData ncclShmem;

__device__ inline bool barrierReduceAny(int bit) {
uint32_t popc;
asm ("{"
".reg .pred barr_pred;"
"setp.eq.u32 barr_pred, %1, 1;"
"bar.red.popc.u32 %0, 2, barr_pred;"
"}" : "=r"(popc) : "r"(bit));
return popc != 0;
}

// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads.
inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) {
int offset = 16*tid;
if (offset < bytes) {
uint64_t a=0, b=0;
asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b));
}
}

template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkElement {
__device__ void run(ncclWorkElem*) {
// Put NOT IMPLEMENTED behavior here.
}
};

template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
int wid = threadIdx.x / WARP_SIZE;
ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
#pragma unroll 1
while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
if (wid < we->nWarps) {
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
}
we = (ncclWorkElem*)((char*)we + stride);
}
}
};

static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) {
if (we->isUsed && we->redOpArgIsPtr) {
/* redOpArg is a pointer to the scalar value, so we'll dereference it
* here so that redOpArg holds the bits of the scalar going forward.
* The tricky thing is we don't know its type T since that's encoded in
@@ -148,48 +111,69 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
}
}

extern __shared__ ncclShmemData ncclShmem;

template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex>
__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
__device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
) {
int tid = threadIdx.x;
int nthreads = blockDim.x;
int bid = blockIdx.x;

int turn = copyToShmem(&ncclShmem.comm, comm);
// get address of channel without incurring indirect load from ncclDevCom::channels
ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
turn = copyToShmem(&ncclShmem.channel, channel, turn);
// To map blockId to channelId, we need the n'th set bit of channelMask which
// is the inverse of counting the number of set bits among the the first n.
if (tid < WARP_SIZE) {
int x = tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) ncclShmem.channelId = x;
}
if (32 < MAXCHANNELS) {
x = 32 + tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) ncclShmem.channelId = x;
}
}
}
__syncthreads(); // publish ncclShmem.channelId
int channelId = ncclShmem.channelId;

// To optimize for latency, (only) the first operation is passed as argument.
if (bid == 0 && first.header.type != ncclWorkTypeUnused) {
// Copy first elem to work and zero out the rest
copyToShmem(&ncclShmem.work, &first, tid, nthreads);
if (true) {
void *dst, *src;
int bytes;
// Use first 3 warps to load comm, channel, and work into ncclShmem
switch (tid/WARP_SIZE) {
case 0:
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
break;
case 2:
dst = &ncclShmem.work;
src = workHead + blockIdx.x;
bytes = sizeof(ncclWork);
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
break;
default:
bytes = 0;
break;
}
copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
__syncthreads(); // publish ncclShmem

ncclWork *workFifoHost = ncclShmem.channel.workFifo;
ncclWork *workFifoDev = ncclShmem.channel.workFifoDev;
int workFifoIx = ncclShmem.channel.index;

if (bid == 0 && first.header.type != ncclWorkTypeUnused)
goto SkipLoadWork;

while (true) {
copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx], tid, nthreads);
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (barrierReduceAny(aborted)) // publish ncclShmem.work
break;
if (tid == 0)
workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
// Notify host that all fifo reads are complete.
if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
*ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
}

SkipLoadWork:
workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
if (tid == 0)
channel->index = workFifoIx; // write back to real channel, not shmem shadow

__syncwarp();
if (ncclShmem.work.header.type == ncclWorkTypeColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
@@ -198,21 +182,34 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
}
__syncthreads();

if (ncclShmem.work.header.funcIndex == FnIndex)
if (ncclShmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
else
} else {
ncclFuncs[ncclShmem.work.header.funcIndex]();
}

if (ncclShmem.work.header.isLast) break;
int workIxNext = ncclShmem.work.header.workNext;
__syncthreads();
if (ncclShmem.work.header.isLast) break;

copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));

{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (barrierReduceAny(aborted)) // publish ncclShmem.work
break;
}
}
}

// Only generate kernels for SUM
#if NCCL_OP == 0
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem first) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(comm, first); \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \
) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \
(comm, channelMask, workHead); \
}
#else
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded)
@@ -16,7 +16,7 @@ namespace {
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
ncclWorkElem *we = &w->elems[e];
intptr_t eltN = we->count;
int bid = we->bid;
@@ -326,11 +326,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
@@ -364,11 +364,11 @@ public:
auto *channel = &ncclShmem.channel;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
@@ -303,9 +303,9 @@ class Primitives<
}
}

__device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex].conn;
auto *conn = &peer->recv[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
@@ -343,9 +343,9 @@ class Primitives<
}
}

__device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex].conn;
auto *conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostSend) {
@@ -428,8 +428,8 @@ class Primitives<
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];

loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e);
loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);

setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
@@ -12,7 +12,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
@@ -22,7 +22,7 @@ namespace {
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int prevRank = ring->userRanks[nranks-1];
const int root = args->root;

Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
@@ -12,11 +12,11 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->devUserRanks;
int const *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
@@ -11,21 +11,23 @@
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
if (args->buff != recvArgs->buff) {
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count);
}
} else {
using Proto = ProtoSimple<1, 1>;
ssize_t const count = args->count;
int const chunkSize = args->chunkSize/sizeof(T);
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
(tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
ssize_t offset = 0;
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
size_t offset = 0;
do {
int nelem = min(chunkSize, count-offset);
int nelem = min(size_t(chunkSize), count-offset);
prims.directSend(offset, offset, nelem);
offset += nelem;
} while(offset < count);
@@ -35,14 +37,15 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
if (args->peer != ncclShmem.comm.rank) {
using Proto = ProtoSimple<1, 1>;
ssize_t const count = args->count;
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
int const chunkSize = args->chunkSize/sizeof(T);
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
ssize_t offset = 0;
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
size_t offset = 0;
do {
int nelem = min(chunkSize, count-offset);
int nelem = min(size_t(chunkSize), count-offset);
prims.directRecv(offset, nelem);
offset += nelem;
} while(offset < count);
@@ -61,11 +64,11 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
args += group;
if (args->header.type == ncclWorkTypeUnused) return;

tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
group |= 1<<16; // Used to select connIndex 1

if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
if ((group%2) == 0) {
runRecv(tid, nthreads, group, args);
src/debug.cc
@@ -8,30 +8,24 @@
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
#include <sys/syscall.h>

int ncclDebugLevel = -1;
static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
std::chrono::steady_clock::time_point ncclEpoch;

static __thread int tid = -1;

void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) {
ncclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
ncclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
ncclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
ncclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
ncclDebugLevel = NCCL_LOG_TRACE;
}

/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
@@ -64,6 +58,8 @@ void ncclDebugInit() {
mask = NCCL_ENV;
} else if (strcasecmp(subsys, "ALLOC") == 0) {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
@@ -75,6 +71,10 @@ void ncclDebugInit() {
free(ncclDebugSubsys);
}

// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();

/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
@@ -94,12 +94,10 @@ void ncclDebugInit() {
*dfn++ = '%';
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024, '.');
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
@@ -110,15 +108,30 @@ void ncclDebugInit() {
*dfn = '\0';
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != NULL) {
if (file != nullptr) {
setbuf(file, nullptr); // disable buffering
ncclDebugFile = file;
}
}
}

#ifdef ENABLE_TRACE
ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}

ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock);
}
@@ -127,45 +140,53 @@ void ncclDebugInit() {
* they can share the debugging mechanisms and output files
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }

// Save the last error (WARN) as a human readable string
if (level == NCCL_LOG_WARN) {
pthread_mutex_lock(&ncclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&ncclDebugLock);
}
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;

// Gather the rank information. This can take > 1us so we want to make sure
// we only do it when needed.
char hostname[1024];
getHostName(hostname, 1024, '.');
if (tid == -1) {
tid = syscall(SYS_gettid);
}

int cudaDev;
cudaGetDevice(&cudaDev);
int pid = getpid();
int tid = syscall(SYS_gettid);
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
cudaGetDevice(&cudaDev);
}

char buffer[1024];
size_t len = 0;
pthread_mutex_lock(&ncclDebugLock);
if (level == NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
else if (level == NCCL_LOG_INFO)
len = snprintf(buffer, sizeof(buffer),
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
#ifdef ENABLE_TRACE
else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
hostname, pid, tid, cudaDev, filefunc, line);
} else if (level == NCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
} else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer),
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line);
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
#endif

if (len) {
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
fprintf(ncclDebugFile,"%s\n", buffer);
fflush(ncclDebugFile);
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
pthread_mutex_unlock(&ncclDebugLock);
}

NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
src/enqueue.cc (file diff suppressed because it is too large)
@@ -428,10 +428,10 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0);

// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
// remote proxies without risking deadlocks
int ncclPxnDisable() {
int ncclPxnDisable(struct ncclComm* comm) {
static int pxnDisable = -1;
if (pxnDisable == -1) {
if (ncclNetVersion() == 4) {
if (comm && ncclNetVersion(comm) == 4) {
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
@@ -470,7 +470,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
return ncclSuccess;
}

ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
// Precompute paths between GPUs/NICs.

// Remove everything in case we're re-computing
@@ -498,16 +498,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
}
}

if (peerInfos == NULL) continue;
if (comm == NULL) continue;
// Remove GPUs we can't talk to because of containers.
struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank;
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank;
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
int shm;
NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if (shm == 0 && p2p == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
@@ -523,7 +523,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
for (int g=0; g<system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) {
int pxnGpu = -1;

for (int p=0; p<system->nodes[GPU].count; p++) {
@@ -670,7 +670,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
// We want to spread channels used when there aren't many and progressively
// fill the whole space of nChannels. To do so we mirror the bits in the
// nChannels space.
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
for (int c=0; c<comm->p2pnChannels; c++) {
int mirror = 0;
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror;
@@ -958,10 +958,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
*proxyRank = rank;

int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
// See whether we can use the remote rank preferred device.
if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
int netDev = comm->peerInfo[peerRank].netDev;
// Find local NIC number close to local cudaDev
int cudaDev = comm->peerInfo[peerRank].cudaDev;
int localRank;
if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
int netDev = comm->peerInfo[localRank].netDev;
int n;
// Check that device exists on our node
if (ncclParamCrossNic() == 0) {
@@ -626,11 +626,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (collNetSupport()) {
NCCLCHECK(collNetDevices(&netDevCount));
if (collNetSupport(comm)) {
NCCLCHECK(collNetDevices(comm, &netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(n, &props));
NCCLCHECK(collNetGetProperties(comm, n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -639,16 +639,18 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
}
}
if (netDevCount == 0) {
NCCLCHECK(ncclNetDevices(&netDevCount));
NCCLCHECK(ncclNetDevices(comm, &netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(n, &props));
NCCLCHECK(ncclNetGetProperties(comm, n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -658,7 +660,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
}

// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
@@ -181,6 +181,17 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
return ncclInternalError;
}

static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) {
*rank = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
*rank = system->nodes[GPU].nodes[i].gpu.rank;
return ncclSuccess;
}
}
return ncclInternalError;
}

// Returns NVLink speed in GB/s
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
return
src/group.cc
@ -10,399 +10,259 @@
|
||||
#include "transport.h"
|
||||
#include "channel.h"
|
||||
|
||||
#define MAX_ASYNC_OPS 128
|
||||
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
|
||||
thread_local int ncclGroupIndex = 0;
|
||||
thread_local int ncclGroupMode = 0;
|
||||
thread_local ncclResult_t ncclGroupError = ncclSuccess;
|
||||
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
|
||||
__thread ncclResult_t ncclGroupError = ncclSuccess;
|
||||
__thread struct ncclComm* ncclGroupCommHead = nullptr;
|
||||
__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr;
|
||||
__thread struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> ncclAsyncJobs;
|
||||
|
||||
bool ncclAsyncMode() {
|
||||
return ncclGroupMode > 0;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ncclInitArgs {
|
||||
ncclInitFunc_t func;
|
||||
int cudaDev;
|
||||
ncclComm_t* newcomm;
|
||||
int ndev;
|
||||
ncclUniqueId commId;
|
||||
int myrank;
|
||||
};
|
||||
struct ncclCollArgs {
|
||||
ncclComm_t comm;
|
||||
};
|
||||
|
||||
enum ncclAsyncFuncType {
|
||||
ASYNC_FUNC_INVALID = 0,
|
||||
ASYNC_FUNC_INIT = 1,
|
||||
ASYNC_FUNC_COLL = 2,
|
||||
};
|
||||
struct ncclAsyncArgs {
|
||||
ncclResult_t ret;
|
||||
enum ncclAsyncFuncType funcType;
|
||||
union {
|
||||
ncclCollArgs coll;
|
||||
ncclInitArgs init;
|
||||
};
|
||||
};
|
||||
|
||||
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
|
||||
|
||||
void* ncclAsyncThreadMain(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
|
||||
return args;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) {
|
||||
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
|
||||
return ncclAsyncErrCheck(ncclInvalidUsage);
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
struct ncclAsyncJob* job,
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*),
|
||||
void(*undo)(struct ncclAsyncJob*),
|
||||
void(*destructor)(void*)
|
||||
) {
|
||||
if (0 == ncclGroupDepth) {
|
||||
ncclResult_t res = func(job);
|
||||
if (res != ncclSuccess && undo) undo(job);
|
||||
if (destructor) destructor(job);
|
||||
return res;
|
||||
} else {
|
||||
job->func = func;
|
||||
job->undo = undo;
|
||||
job->destructor = destructor;
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
|
||||
return ncclSuccess;
|
||||
}
|
||||
int index = ncclGroupIndex++;
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+index;
|
||||
args->funcType = ASYNC_FUNC_INIT;
|
||||
args->init.func = func;
|
||||
args->init.cudaDev = cudaDev;
|
||||
args->init.newcomm = newcomm;
|
||||
args->init.ndev = ndev;
|
||||
memcpy(&args->init.commId, &commId, sizeof(commId));
|
||||
args->init.myrank = myrank;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncColl(ncclComm_t comm) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs;
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
if (args->coll.comm == comm) return ncclSuccess;
|
||||
args++;
|
||||
void* ncclAsyncJobMain(void* arg) {
|
||||
struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg;
|
||||
job->result = job->func(job);
|
||||
if (job->result != ncclSuccess) {
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result);
|
||||
}
|
||||
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
|
||||
return ncclAsyncErrCheck(ncclInvalidUsage);
|
||||
}
|
||||
ncclGroupIndex++;
|
||||
args->funcType = ASYNC_FUNC_COLL;
|
||||
args->coll.comm = comm;
|
||||
return ncclSuccess;
|
||||
return arg;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
ncclResult_t ncclGroupStart() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
|
||||
}
|
||||
ncclGroupMode++;
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
TRACE_CALL("ncclGroupStart()");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) {
|
||||
struct ncclInfo info = { ncclFuncSend, "Send",
|
||||
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
int channelId;
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
|
||||
info.channelId = channelId;
|
||||
NCCLCHECK(ncclSetupP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) {
|
||||
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
||||
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
int channelId;
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
|
||||
info.channelId = channelId;
|
||||
NCCLCHECK(ncclSetupP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* ncclAsyncThreadPreconnect(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev));
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 1));
|
||||
return args;
|
||||
}
|
||||
|
||||
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
|
||||
size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
|
||||
int nChannels = minChannels;
|
||||
while (size > maxSize && nChannels <= maxChannels/2) {
|
||||
nChannels *= 2;
|
||||
size = DIVUP(totalSize, nChannels);
|
||||
}
|
||||
ALIGN_SIZE(size, minSize);
|
||||
return size;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
TRACE_CALL("ncclGroupEnd()");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclPreconnectJob {
|
||||
struct ncclAsyncJob base;
|
||||
struct ncclComm* comm;
|
||||
};
|
||||
ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
|
||||
struct ncclComm* comm = job->comm;
|
||||
CUDACHECK(cudaSetDevice(comm->cudaDev));
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t doLaunches(struct ncclComm* head) {
ncclResult_t result = ncclSuccess;
struct ncclComm* cliqueComm0 = head->intraComm0;
struct ncclComm* cliqueHead = head;
struct ncclComm* cliqueNextHead;
bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
// This outer loop iterates over cliques of comms which are siblings of the
// same global entity. We calculate a clique as all comms which have the same
// `intraComm0` value.
do {
struct ncclComm* comm = cliqueHead;
bool capturingYes = false, capturingNo = false;
do {
(ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
comm = comm->groupNext;
} while (comm != nullptr && comm->intraComm0 == cliqueComm0);
cliqueNextHead = comm;

if (capturingYes && capturingNo) {
// We have entered barriers but are aborting without leaving them. Thus
// these comms are permanently trashed. We need a good mechanism for
// tracking and reporting that.
WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured.");
result = ncclInvalidUsage;
goto failure;
}

while (true) { // Iterate rounds of launches for clique.
bool moreRounds;
comm = cliqueHead;
do { // Iterate clique members.
struct ncclComm* next = comm->groupNext;
if (useBarrier) {
// Barrier reduction result tells us if this was the final round.
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
} else {
moreRounds = comm->unlaunchedPlansHead != nullptr;
}
if (moreRounds) {
// Pop next unlaunched kernel
struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
if (plan != nullptr) {
comm->unlaunchedPlansHead = plan->next;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
}
// Barrier reduction input indicates if we require further rounds.
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
if (plan != nullptr) {
NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
}
} else { // Final round.
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure);
}
comm = next;
} while (comm != cliqueNextHead);
if (!moreRounds) break;
}
cliqueHead = cliqueNextHead;
} while (cliqueHead != nullptr);
failure:
return result;
}
|
||||
|
||||
ncclResult_t ncclGroupEndInternal() {
if (ncclGroupDepth == 0) {
WARN("ncclGroupEnd: not in a group call.");
return ncclInvalidUsage;
}
ncclGroupMode--;
if (ncclGroupMode > 0) return ncclSuccess;
ncclGroupDepth--;
if (ncclGroupDepth > 0) return ncclSuccess;

int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
int activeThreads = 0;
int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;

ncclResult_t ret = ncclGroupError;
int usingCudaGraphAll = -1;
cudaGraph_t* graphs = NULL;
if (ret != ncclSuccess) goto group_cleanup;
bool jobsDone = false;
if (ret != ncclSuccess) goto failure;
|
||||
|
||||
/* Launch async ncclCommInitRank */
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
|
||||
activeThreads++;
|
||||
doneArray[i] = 0;
|
||||
}
|
||||
}
|
||||
/* For init, since we use threads, we just wait for threads to complete */
|
||||
while (activeThreads) {
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
|
||||
if (err == EBUSY) continue;
|
||||
if (err != 0) ret = ncclSystemError;
|
||||
if (args->ret != ncclSuccess) ret = args->ret;
|
||||
doneArray[i] = 1;
|
||||
activeThreads--;
|
||||
}
|
||||
}
|
||||
if (ncclGroupCommPreconnectHead != nullptr) {
|
||||
struct ncclComm* comm = ncclGroupCommPreconnectHead;
|
||||
do {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECK(ncclCalloc(&job, 1));
|
||||
job->base.func = ncclPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base);
|
||||
|
||||
struct ncclComm* next = comm->preconnectNext;
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm = next;
|
||||
} while (comm != nullptr);
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
do {
|
||||
pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job);
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
do {
|
||||
int err = pthread_join(job->thread, nullptr);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect = 0;
|
||||
}
|
||||
if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result;
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
jobsDone = true;
|
||||
if (ret != ncclSuccess) goto failure;
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
int node = comm->node;
|
||||
int nNodes = comm->nNodes;
|
||||
int localRank = comm->localRank;
|
||||
if (ncclGroupCommHead != nullptr) {
|
||||
NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure);
|
||||
do {
|
||||
struct ncclComm* comm = ncclGroupCommHead;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
ncclGroupCommLeave(comm);
|
||||
ncclGroupCommHead = next;
|
||||
} while (ncclGroupCommHead != nullptr);
|
||||
}
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Try to use all channels
|
||||
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
||||
int nChannelsMin = nChannelsMax;
|
||||
// Try to use all channels, but one channel per operation.
|
||||
while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
|
||||
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
|
||||
|
||||
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d<=nNodes/4; d++) {
|
||||
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
uint32_t recvNode = (node+nNodes-delta)%nNodes;
|
||||
uint32_t sendNode = (node+delta)%nNodes;
|
||||
int steps = comm->maxLocalRanks;
|
||||
for (int s=0; s<steps; s++) {
|
||||
int recvIndex = (localRank-s+steps)%steps;
|
||||
int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
|
||||
int sendIndex = (localRank+s)%steps;
|
||||
int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
|
||||
struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
|
||||
struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
|
||||
if (recv != NULL || send != NULL) {
|
||||
ssize_t totRecvBytes = -1, totSendBytes = -1;
|
||||
if (recv != NULL) totRecvBytes = recv->nbytes;
|
||||
if (send != NULL) totSendBytes = send->nbytes;
|
||||
if (recv) comm->p2pRecvCount--;
|
||||
if (send) comm->p2pSendCount--;
|
||||
if (recvPeer == comm->rank) { // Check self send/recv
|
||||
if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
|
||||
if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
|
||||
if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
|
||||
}
|
||||
void* recvBuff = recv ? recv->buff : NULL;
|
||||
void* sendBuff = send ? send->buff : NULL;
|
||||
// After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
|
||||
if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
|
||||
if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
|
||||
|
||||
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int sendRemaining = 1, recvRemaining = 1;
|
||||
int chunk = 0;
|
||||
do {
|
||||
// Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
|
||||
// to use multiple channels to guarantee progress on all ranks from the same node.
|
||||
ssize_t recvbytes = totRecvBytes-recvOffset;
|
||||
ssize_t sendbytes = totSendBytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
|
||||
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
|
||||
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
|
||||
// (total size == 0), otherwise set size to -1.
|
||||
if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
|
||||
if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
|
||||
if (recv) {
|
||||
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup);
|
||||
}
|
||||
if (send) {
|
||||
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
} while (sendRemaining || recvRemaining);
|
||||
if (false) {
|
||||
failure:
|
||||
struct ncclComm* comm = ncclGroupCommHead;
|
||||
while (comm != nullptr) {
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
ncclGroupCommLeave(comm); // overwrites comm->groupNext
|
||||
// We don't know if preconnect succeeded or happened at all, so clear
|
||||
// the flags that let `taskAppend()` skip over checking if preconnect
|
||||
// is needed.
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i=0; i < comm->nRanks; i++) {
|
||||
comm->tasks.peers[i].sendSeen = false;
|
||||
comm->tasks.peers[i].recvSeen = false;
|
||||
comm->connectSend[i] = 0;
|
||||
comm->connectRecv[i] = 0;
|
||||
}
|
||||
comm->unlaunchedPlansHead = nullptr;
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
}
|
||||
index++;
|
||||
if (index == 1 && deltas[1] == deltas[0]) index++;
|
||||
if (index == 2 && deltas[2] == deltas[0]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[2]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[1]) index++;
|
||||
if (index < 4) {
|
||||
delta = deltas[index];
|
||||
goto sched_delta;
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
}
|
||||
}
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
comm->tasks.collBytesTotal = 0;
|
||||
for (int i=0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
}
|
||||
comm = next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Collectives are done in three steps :
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
* 2. Barrier Wait. No CUDA call is permitted
* 3. Enqueue Events. CUDA event wait/enqueue.
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
* cudaFree happens between 1 and 3, it could block that CUDA call and
* prevent some ranks from launching their network threads, which would
* prevent the NCCL call from completing, blocking the cudaFree call.
*/
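For orientation, here is a hedged sketch of how the legacy (pre-kernel-plan) launch path strings these steps together, built from the entry points declared near the end of this diff (ncclLaunchBarrier, ncclLaunchKernel, ncclRecordEvents, ncclLaunchReset); the wrapper function name is hypothetical and error unwinding is omitted.

// Hedged sketch only: hypothetical wrapper showing the step ordering the
// comment above describes, using the legacy enqueue.h entry points.
static ncclResult_t launchOneComm(struct ncclComm* comm) {
  NCCLCHECK(ncclLaunchBarrier(comm));   // step 1: barrier check-in; last caller may launch cooperatively
  // step 2: while peers are still checking in, no CUDA primitive may be issued here
  NCCLCHECK(ncclLaunchKernel(comm));    // step 3: launch on the group stream...
  NCCLCHECK(ncclRecordEvents(comm));    // ...then enqueue the event waits/records
  NCCLCHECK(ncclLaunchReset(comm));     // reset per-call launch state
  return ncclSuccess;
}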
|
||||
|
||||
// Check whether we are in cuda graph mode
|
||||
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
ncclComm_t comm = args->coll.comm;
|
||||
NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup);
|
||||
if (usingCudaGraphAll == -1) {
|
||||
usingCudaGraphAll = comm->usingCudaGraph;
|
||||
} else if (usingCudaGraphAll != comm->usingCudaGraph) {
|
||||
WARN("Illegal to have some communicators in graph mode while others not");
|
||||
ret = ncclInvalidUsage;
|
||||
goto group_cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
ncclComm_t comm = args->coll.comm;
|
||||
NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
if (args->coll.comm->userStream == cudaStreamDefault ||
|
||||
args->coll.comm->userStream == cudaStreamPerThread ||
|
||||
args->coll.comm->userStream == cudaStreamLegacy)
|
||||
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
if (usingCudaGraphAll == 1) {
|
||||
NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end);
|
||||
} else {
|
||||
ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
if (args->coll.comm->userStream == cudaStreamDefault ||
|
||||
args->coll.comm->userStream == cudaStreamPerThread ||
|
||||
args->coll.comm->userStream == cudaStreamLegacy)
|
||||
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end);
|
||||
NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end);
|
||||
}
|
||||
while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs);
|
||||
if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
goto end;
|
||||
group_cleanup:
|
||||
if (ret != ncclSuccess) {
|
||||
// At least one call in the group failed. Since we want to make that group
|
||||
// an atomic operation, we need to cancel all operations.
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm);
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
// Reset aggregation counters
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
// Dequeue p2p lists
|
||||
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
for (int peer=0; peer<comm->nRanks; peer++) {
|
||||
if (comm->p2pSends[peer]) comm->p2pSends[peer]->recycle();
|
||||
if (comm->p2pRecvs[peer]) comm->p2pRecvs[peer]->recycle();
|
||||
}
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
}
|
||||
ncclLaunchReset(comm);
|
||||
}
|
||||
}
|
||||
}
|
||||
end:
ncclGroupError = ncclSuccess;
ncclGroupIndex = 0;
ncclGroupCommHead = nullptr;
ncclGroupCommPreconnectHead = nullptr;
CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
if (graphs) free(graphs);
return ret;
}
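For context, the user-facing shape of the group semantics implemented above is the standard NCCL pattern below; comm, stream, peer and the buffers are assumed to have been created elsewhere.

// Minimal user-side sketch of a grouped send/recv pair; every operation queued
// between the group calls is scheduled and launched inside ncclGroupEnd().
NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclSend(sendbuff, count, ncclFloat, peer, comm, stream));
NCCLCHECK(ncclRecv(recvbuff, count, ncclFloat, peer, comm, stream));
NCCLCHECK(ncclGroupEnd());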
@ -10,27 +10,39 @@
|
||||
#include "nccl.h"
|
||||
#include "checks.h"
|
||||
#include "align.h"
|
||||
#include "utils.h"
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
|
||||
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time = 0;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
time = clockNano();
|
||||
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
|
||||
time = clockNano() - time;
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return ncclSuccess;
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: cudaHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
CUDACHECK(cudaFreeHost(ptr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
void* p = malloc(nelem*sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
@ -44,7 +56,7 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc,
|
||||
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
if (nelem < oldNelem) return ncclInternalError;
|
||||
if (nelem == oldNelem) return ncclSuccess;
|
||||
|
||||
@ -63,29 +75,105 @@ static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
// Need async stream for P2P pre-connect + CUDA Graph
|
||||
ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
uint64_t time = clockNano();
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time = clockNano() - time;
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time0=0, time1=0, time2=0;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
time0 = clockNano();
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
CUDACHECK(cudaStreamDestroy(stream));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return ncclSuccess;
|
||||
time1 = clockNano();
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time2 = clockNano();
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaStreamCreateWithFlags=%g cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
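All of the allocators above share the same capture-safety pattern: flip the calling thread into relaxed stream-capture mode so cudaMalloc/cudaHostAlloc stay legal while a CUDA graph is being captured, and run any memset on a private non-blocking stream. A condensed sketch of that pattern, with the CUDACHECK/goto error handling trimmed:

// Condensed sketch of the capture-safe allocation pattern used by
// ncclCudaCallocDebug above (NCCL's error handling and timing omitted).
template <typename T>
cudaError_t captureSafeCalloc(T** ptr, size_t nelem) {
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  cudaThreadExchangeStreamCaptureMode(&mode);            // enter relaxed mode, remember old mode
  cudaStream_t s;
  cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);  // side stream so we never touch a captured stream
  cudaMalloc(ptr, nelem*sizeof(T));
  cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), s);
  cudaStreamSynchronize(s);
  cudaStreamDestroy(s);
  cudaThreadExchangeStreamCaptureMode(&mode);            // restore the previous capture mode
  return cudaGetLastError();
}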
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
|
||||
return ncclSuccess;
|
||||
ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time = 0;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
time = clockNano();
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time = clockNano() - time;
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaFree(T* ptr) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaFree(ptr), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
||||
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
||||
size_t page_size = sysconf(_SC_PAGESIZE);
|
||||
void* p;
|
||||
int size_aligned = ROUNDUP(size, page_size);
|
||||
|
@ -31,7 +31,8 @@ static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int
}

static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
//*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
*channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
return ncclSuccess;
}
|
||||
|
||||
|
@ -9,7 +9,7 @@
#include "debug.h"

// Check CUDA calls
// Check CUDA RT calls
#define CUDACHECK(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
@ -142,9 +142,9 @@
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
} while (!(cond));

#define NCCLCHECKTHREAD(a) do { \
if ((args->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
return args; \
} \
} while(0)
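The new two-argument form makes the per-thread args struct explicit instead of relying on a variable named args being in scope. A hedged sketch of a thread entry point using it; myAsyncArgs and doSetup are placeholders, not NCCL symbols.

// Hedged sketch: how a pthread entry point would use NCCLCHECKTHREAD(a, args).
struct myAsyncArgs { ncclResult_t ret; /* ... */ };
ncclResult_t doSetup(struct myAsyncArgs* args);  // placeholder work function

void* asyncThreadMain(void* opaque) {
  struct myAsyncArgs* args = (struct myAsyncArgs*)opaque;
  NCCLCHECKTHREAD(doSetup(args), args);  // on failure: records args->ret and returns args
  args->ret = ncclSuccess;
  return args;
}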
|
||||
|
@ -10,25 +10,26 @@
|
||||
#include "nccl.h"
|
||||
#include "nccl_net.h"
|
||||
|
||||
extern ncclCollNet_t* ncclCollNet;
|
||||
typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
// Translation to external API
|
||||
static const char* collNetName() { return ncclCollNet->name; }
|
||||
static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
|
||||
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
/* DMA-BUF support */
|
||||
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
|
||||
static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
|
||||
|
||||
#endif
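Routing every call through comm->ncclCollNet is what lets each communicator carry its own network backend. A hedged sketch of how a caller might probe CollNet through these wrappers; setupCollNet itself is hypothetical.

// Hedged sketch: probing per-communicator CollNet support through the
// wrappers above; setupCollNet is a hypothetical caller.
static ncclResult_t setupCollNet(struct ncclComm* comm) {
  if (!collNetSupport(comm)) return ncclSuccess;  // this comm has no CollNet plugin
  int ndev = 0;
  NCCLCHECK(collNetDevices(comm, &ndev));
  for (int dev = 0; dev < ndev; dev++) {
    ncclNetProperties_t props;
    NCCLCHECK(collNetGetProperties(comm, dev, &props));
    INFO(NCCL_INIT, "CollNet device %d: %s", dev, props.name);
  }
  return ncclSuccess;
}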
|
||||
|
@ -36,7 +36,7 @@ struct ncclDevRedOpFull {
|
||||
/* Declare all collective operations */
|
||||
#define DECL5(func, algo, proto, devredop, type) \
|
||||
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
|
||||
|
||||
#define CONCAT(a,b) a##b
|
||||
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
|
||||
|
@ -10,6 +10,8 @@
|
||||
#include "transport.h"
|
||||
#include "p2p.h"
|
||||
#include "collectives.h"
|
||||
#include "proxy.h"
|
||||
#include "strongstream.h"
|
||||
|
||||
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
@ -58,8 +60,6 @@ struct ncclRecvMem {
|
||||
};
|
||||
};
|
||||
|
||||
typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
|
||||
|
||||
enum helperThreadState {ThreadStart, ThreadStop};
|
||||
|
||||
#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
|
||||
@ -85,15 +85,87 @@ struct ncclNodeRanks {
|
||||
int* localRankToRank;
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
struct ncclDestructor {
|
||||
struct ncclDestructor* next;
|
||||
void* obj;
|
||||
ncclResult_t(*fn)(struct ncclDestructor* me);
|
||||
};
|
||||
|
||||
struct ncclCommCallback {
|
||||
struct ncclCommCallback* next;
|
||||
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
|
||||
};
|
||||
|
||||
struct ncclChannel {
|
||||
struct ncclChannelPeer* peers;
|
||||
struct ncclDevChannelPeer* devPeers;
|
||||
struct ncclRing ring;
|
||||
int* devRingUserRanks;
|
||||
struct ncclTree tree;
|
||||
struct ncclDirect collTree;
|
||||
int id; // index of this channel
|
||||
uint32_t workFifoSent; // last used work index+1
|
||||
uint64_t p2pOpCount;
|
||||
};
|
||||
|
||||
struct ncclWorkList {
|
||||
struct ncclWorkList* next;
|
||||
struct ncclWork work;
|
||||
};
|
||||
|
||||
struct ncclPointerList {
|
||||
struct ncclPointerList* next;
|
||||
void *ptr;
|
||||
};
|
||||
|
||||
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
struct ncclCommCallback reclaimer;
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup

struct ncclComm* comm;
struct ncclKernelPlan* next;

bool persistent; // aka captured in a graph
void *kernelFn;
int channelUbound; // only channels c < channelUbound are present
int channelCount; // number of channels present
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
int threadPerBlock;
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
struct ncclWork* workHead;

int collOpCount; // zero based for this plan

struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;

struct Channel {
int nWork;
union {
int nWorkElem; // used for coll and reg coll
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
};
size_t collBytes;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
} channels[MAXCHANNELS];
};
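Because reclaimer is the first member, a ncclCommCallback* pulled off comm->callbackQueue can be cast straight back to the enclosing plan. A hedged sketch of what such a reclaim callback could look like; the function name is illustrative, and the real callback in enqueue.cc also returns the plan's proxy-op memory.

// Hedged sketch of the "callback embedded as first member" idiom described above.
static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* cb) {
  // Valid only because ncclKernelPlan::reclaimer is the first member.
  struct ncclKernelPlan* plan = (struct ncclKernelPlan*)cb;
  ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
  return ncclSuccess;
}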
|
||||
|
||||
struct ncclComm {
|
||||
struct ncclMemoryStack memPermanent, memScoped;
|
||||
// List of destructors to run when comm is destructed
|
||||
struct ncclDestructor* destructorHead;
|
||||
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
struct ncclTopoSystem* topo;
|
||||
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
void* bootstrap;
|
||||
// Bitmasks for ncclTransportP2pSetup
|
||||
int connect;
|
||||
uint32_t* connectSend;
|
||||
uint32_t* connectRecv;
|
||||
|
||||
@ -114,12 +186,8 @@ struct ncclComm {
|
||||
// localRanks and localRanktoRank for all nodes
|
||||
struct ncclNodeRanks* nodeRanks;
|
||||
|
||||
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
|
||||
cudaStream_t userStream;
|
||||
bool userStreamSet;
|
||||
cudaEvent_t doneEvent;
|
||||
cudaEvent_t intDoneEvent;
|
||||
bool checkPointers;
|
||||
bool dmaBufSupport;
|
||||
|
||||
// Counter for tracking CUDA launches (P2P and collectives included)
|
||||
uint64_t opCount;
|
||||
@ -142,36 +210,37 @@ struct ncclComm {
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||
int groupCudaStream;
|
||||
cudaStream_t groupStream;
|
||||
|
||||
// Whether there has been a fatal error in this communicator.
|
||||
ncclResult_t fatalError;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
|
||||
// Device side of the communicator
|
||||
struct ncclDevComm *devComm;
|
||||
// Host copy of the devComm (to free CUDA allocs)
|
||||
struct ncclDevComm hostDevComm;
|
||||
// Device side of the communicator (for cudaFree's)
|
||||
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
|
||||
|
||||
// Operation pool.
|
||||
int workFifoDepth; // size of workFifoHeap[], power of 2
|
||||
struct ncclWork* workFifoHeap;
|
||||
struct ncclWork* devWorkFifoHeap;
|
||||
void* workFifoHeapGdrHandle;
|
||||
|
||||
// Work completion notificaion
|
||||
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
|
||||
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
|
||||
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
|
||||
|
||||
// Intra-process sync
|
||||
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
||||
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
|
||||
int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
|
||||
int intraRank;
|
||||
int intraRanks;
|
||||
int* intraBarrier;
|
||||
int intraPhase;
|
||||
|
||||
// Storage for deferred intra-process launch
|
||||
struct cudaLaunchParams * intraParams;
|
||||
struct cudaLaunchParams *myParams;
|
||||
pthread_t* intraThreads;
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclWorkElem args;
|
||||
void* argsptrs[2];
|
||||
uint32_t intraBarrierPhase;
|
||||
char intraPad1[64 - sizeof(uint64_t)];
|
||||
uint64_t intraBarrierCounter; // only used if this is intraComm0
|
||||
char intraPad2[64 - sizeof(uint64_t)];
|
||||
uint64_t intraBarrierGate; // only used if this is intraComm0
|
||||
|
||||
struct ncclProxyState proxyState;
|
||||
|
||||
@ -179,39 +248,98 @@ struct ncclComm {
|
||||
int collNetSupport;
|
||||
int intraHighestTransportType;
|
||||
|
||||
// Store info of async operations
|
||||
struct ncclInfo* asyncOps;
|
||||
int asyncOpCount;
|
||||
size_t asyncTotalSize;
|
||||
ssize_t channelSize;
|
||||
int lastChannel;
|
||||
enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode;
|
||||
size_t channelSize; // User requested work size (bytes) for channel partitions
|
||||
|
||||
//list of async p2p operation queued in a group semantics
|
||||
ncclP2Plist** p2pSends;
|
||||
ncclP2Plist** p2pRecvs;
|
||||
int p2pSendCount;
|
||||
int p2pRecvCount;
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
|
||||
// Store info for cudaGraph
|
||||
int usingCudaGraph; // Only use it during capture time, not launch time
|
||||
struct ncclQueueInfo* enqueueInfo;
|
||||
int nQueueInfoCreated;
|
||||
int nQueueInfoDestroyed;
|
||||
cudaGraphNode_t lastSetupNode;
|
||||
unsigned long long lastCudaGraphId;
|
||||
int driverVersion;
|
||||
pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange;
|
||||
pthread_t graphHelperThread;
|
||||
struct ncclGraphHelperResources* graphHelperResources;
|
||||
int disableGraphHelper;
|
||||
int graphRegister;
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
struct ncclMemoryPool memPool_ncclPointerList;
|
||||
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
|
||||
// this comm is not yet in a group.
|
||||
struct ncclComm* groupNext;
|
||||
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
struct ncclComm* preconnectNext;
|
||||
int persistentRefs; // number of persistent plan-lists capturing this comm
|
||||
struct ncclTasks tasks;
|
||||
|
||||
// user-created reduction ops
|
||||
int userRedOpCapacity, userRedOpFreeHead;
|
||||
ncclUserRedOp *userRedOps;
|
||||
|
||||
// Queue of things for the main thread to do
|
||||
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
|
||||
|
||||
// List of kernel plans built form tasks.
|
||||
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
|
||||
// First of the unlaunched kernels in `planQueue`
|
||||
struct ncclKernelPlan* unlaunchedPlansHead;
|
||||
};
|
||||
|
||||
// Set to true during an `atexit()` handler. We use this to intentionally leak
// unfreed CUDA resources when cleaning up after return of `main()` to avoid
// CUDA calls after CUDA runtime teardown.
extern bool ncclMainExited;

enum ncclLaunchMode {
ncclLaunchModeInvalid=0,
ncclLaunchModeParallel,
ncclLaunchModeGroup
};
extern enum ncclLaunchMode ncclParamLaunchMode;

void ncclCommPushFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);

inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) {
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false);
while (cb != nullptr) {
struct ncclCommCallback* next = cb->next;
NCCLCHECK(cb->fn(comm, cb)); // may reclaim memory of cb
cb = next;
}
return ncclSuccess;
}
|
||||
|
||||
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
int phase = comm->intraBarrierPhase;
if (comm->intraRanks == 1) {
// Release everyone (just me).
comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
} else {
struct ncclComm* comm0 = comm->intraComm0;
uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
if (uint32_t(count) == uint32_t(comm->intraRanks)) {
// Reset.
__atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
// Release everyone.
__atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
}
}
}

// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x)
inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
struct ncclComm* comm0 = comm->intraComm0;
comm->intraBarrierPhase ^= 1;
uint32_t phase = comm->intraBarrierPhase;
uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
if ((gate & 1) != phase) {
uint64_t t0 = clockNano();
do {
// Spin vigorously for first 5us.
if (clockNano()-t0 >= 5*1000) sched_yield();
gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
} while ((gate & 1) != phase);
}
if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
return gate>>32;
}
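A hedged sketch of the in/out pairing as doLaunches() uses it: each intra-process rank votes whether it still has unlaunched plans, and the reduced sum of the votes decides whether the clique runs another launch round.

// Hedged sketch of the barrier pairing used by doLaunches(): contribute this
// rank's vote, then read back the sum of all ranks' votes.
inline bool anotherRoundNeeded(struct ncclComm* comm, bool iHaveMoreWork) {
  ncclCommIntraBarrierIn(comm, iHaveMoreWork ? 1 : 0);  // this rank's vote
  uint32_t votes = ncclCommIntraBarrierOut(comm);       // sum over the intra-process group
  return votes != 0;
}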
|
||||
|
||||
// Scrambles the bits of non-builtin values of ncclRedOp_t according to the
|
||||
// communicator memory address. Used to catch bugs so that integer handles
|
||||
// associated with this communicator won't collide with handles of other
|
||||
|
@ -55,6 +55,7 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "cudawrap.h"
|
||||
#include "alloc.h"
|
||||
#include "utils.h"
|
||||
#include "param.h"
|
||||
|
88
src/include/cudawrap.h
Normal file
@ -0,0 +1,88 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_CUDAWRAP_H_
|
||||
#define NCCL_CUDAWRAP_H_
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
#else
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion);
|
||||
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
|
||||
#endif
|
||||
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
|
||||
// Check CUDA PFN driver calls
|
||||
#define CUCHECK(cmd) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
(void) pfn_cuGetErrorString(err, &errStr); \
|
||||
WARN("Cuda failure '%s'", errStr); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUCHECKGOTO(cmd, res, label) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
(void) pfn_cuGetErrorString(err, &errStr); \
|
||||
WARN("Cuda failure '%s'", errStr); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Report failure but clear error and continue
|
||||
#define CUCHECKIGNORE(cmd) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
(void) pfn_cuGetErrorString(err, &errStr); \
|
||||
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUCHECKTHREAD(cmd, args) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if (err != CUDA_SUCCESS) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
|
||||
args->ret = ncclUnhandledCudaError; \
|
||||
return args; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuInit);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress);
|
||||
|
||||
|
||||
ncclResult_t cudaLibraryInit(void);
|
||||
|
||||
#endif
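A hedged usage sketch for the header above: after cudaLibraryInit() has resolved the pfn_* pointers, driver-API calls go through CUCHECK(), which expands the call to the corresponding pfn_ entry point.

// Hedged usage sketch: resolve the driver entry points once, then call them
// through the pfn_* pointers via CUCHECK().
ncclResult_t queryDriver(int* driverVersion) {
  NCCLCHECK(cudaLibraryInit());                // loads pfn_cuInit, pfn_cuDriverGetVersion, ...
  CUCHECK(cuDriverGetVersion(driverVersion));  // expands to pfn_cuDriverGetVersion(driverVersion)
  return ncclSuccess;
}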
|
@ -10,8 +10,8 @@
|
||||
#include "nccl_net.h"
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
#include <type_traits>
|
||||
|
||||
#include <sys/syscall.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
@ -21,7 +21,7 @@
|
||||
|
||||
extern int ncclDebugLevel;
|
||||
extern uint64_t ncclDebugMask;
|
||||
extern pthread_mutex_t ncclDebugOutputLock;
|
||||
extern pthread_mutex_t ncclDebugLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
|
||||
@ -29,13 +29,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
|
||||
// Let code temporarily downgrade WARN into INFO
|
||||
extern thread_local int ncclDebugNoWarn;
|
||||
extern char ncclLastError[];
|
||||
|
||||
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
extern std::chrono::steady_clock::time_point ncclEpoch;
|
||||
#else
|
||||
#define TRACE(...)
|
||||
#endif
|
||||
|
@ -121,7 +121,6 @@ struct ncclRing {
|
||||
// since we need to know how the user expects data to be ordered across
|
||||
// devices. Ordered from current device.
|
||||
int* userRanks;
|
||||
int* devUserRanks;
|
||||
|
||||
int index; // This rank's index in the ring
|
||||
};
|
||||
@ -146,7 +145,7 @@ struct ncclDirect {
|
||||
};
|
||||
|
||||
#define NCCL_MAX_CONNS 2
|
||||
struct ncclPeer {
|
||||
struct ncclChannelPeer {
|
||||
struct ncclConnector send[NCCL_MAX_CONNS];
|
||||
struct ncclConnector recv[NCCL_MAX_CONNS];
|
||||
};
|
||||
@ -158,30 +157,38 @@ struct ncclDevComm;
|
||||
/* Make sure to adjust padding at the end of ncclWorkElem. */
|
||||
#define NCCL_WORK_SIZE 512
|
||||
|
||||
enum ncclWorkElemType : uint8_t {
|
||||
enum ncclWorkType : uint8_t {
|
||||
ncclWorkTypeUnused=0,
|
||||
ncclWorkTypeColl=1,
|
||||
ncclWorkTypeP2p=2,
|
||||
ncclWorkTypeRegColl=3
|
||||
};
|
||||
enum ncclWorkElemSubType : uint8_t {
|
||||
ncclWorkSubTypeUnused =0,
|
||||
ncclWorkSubTypeSend,
|
||||
ncclWorkSubTypeRecv
|
||||
enum ncclWorkP2PType : uint8_t {
|
||||
ncclWorkP2pTypeUnused=0,
|
||||
ncclWorkP2pTypeSend,
|
||||
ncclWorkP2pTypeRecv
|
||||
};
|
||||
|
||||
struct ncclWorkElemHeader {
|
||||
struct ncclWorkHeader {
|
||||
union {
|
||||
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
|
||||
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
|
||||
};
|
||||
uint16_t funcIndex;
|
||||
enum ncclWorkElemType type;
|
||||
unsigned nWarps:5;
|
||||
unsigned isLast:1;
|
||||
uint8_t isLast:1; // last work for this kernel
|
||||
uint8_t inFifo:1; // is this work in the fifo
|
||||
enum ncclWorkType type;
|
||||
};
|
||||
|
||||
struct ncclWorkElem {
|
||||
struct ncclWorkElemHeader header;
|
||||
uint8_t regUsed;
|
||||
union {
|
||||
uint8_t flagBits;
|
||||
struct {
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1;
|
||||
};
|
||||
};
|
||||
uint8_t nWarps;
|
||||
uint8_t direct;
|
||||
uint8_t redOpArgIsPtr;
|
||||
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
@ -192,22 +199,29 @@ struct ncclWorkElem {
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint64_t redOpArg;
|
||||
uint64_t pad;
|
||||
};
|
||||
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size");
|
||||
|
||||
#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem))
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9");
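Working the formula through, assuming sizeof(ncclWorkHeader) aligns up to 8 bytes and sizeof(ncclWorkElem) is 56 (neither value is spelled out in this diff, so treat both as assumptions): (512 - 8) / 56 = 9, which is exactly what the static_assert pins down.

// Illustrative arithmetic only; the 8 and 56 are assumed sizes, not taken from this diff.
static_assert((512 - 8) / 56 == 9, "example evaluation of NCCL_MAX_WORK_ELEMENTS");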
|
||||
|
||||
struct ncclWorkElemP2p {
|
||||
struct ncclWorkElemHeader header;
|
||||
int32_t peer;
|
||||
void* buff;
|
||||
size_t count;
|
||||
int chunkSize;
|
||||
uint8_t ngroups;
|
||||
uint8_t warpStart;
|
||||
enum ncclWorkP2PType p2pType;
|
||||
uint8_t nWarps;
|
||||
enum ncclWorkElemSubType subType;
|
||||
uint8_t warpStart;
|
||||
uint8_t ngroups;
|
||||
// Important not to use any fields with greater than 4-byte alignment since
|
||||
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
|
||||
// there were 8-byte fields.
|
||||
//void* buff;
|
||||
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
|
||||
//size_t count;
|
||||
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
|
||||
int chunkSize;
|
||||
};
|
||||
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size");
|
||||
|
||||
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16");
|
||||
#define NCCL_MAX_WORK_ELEMENTS_P2P 16
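Splitting buff and count into 32-bit halves keeps every field of ncclWorkElemP2p at 4-byte alignment so its size stays at 28 bytes; the device side can reassemble the originals as sketched below (the helper name is illustrative, and 64-bit pointers are assumed).

// Hedged sketch: reassembling the pointer and count that ncclWorkElemP2p
// stores as 32-bit halves to preserve 4-byte alignment.
__device__ inline void unpackP2pElem(const struct ncclWorkElemP2p* e, void** buff, size_t* count) {
  *buff  = reinterpret_cast<void*>((uint64_t(e->buffHi32)  << 32) | e->buffLo32);
  *count = size_t((uint64_t(e->countHi32) << 32) | e->countLo32);
}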
|
||||
|
||||
struct ncclWorkElemReg {
|
||||
struct ncclWorkElem elem;
|
||||
@ -215,72 +229,59 @@ struct ncclWorkElemReg {
|
||||
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
};
|
||||
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
|
||||
static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");
|
||||
|
||||
#define NCCL_MAX_WORK_ELEMENTS (NCCL_WORK_SIZE/sizeof(struct ncclWorkElem))
|
||||
#define NCCL_MAX_WORK_ELEMENTS_P2P (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemP2p))
|
||||
#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
|
||||
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2");
|
||||
|
||||
// Number of named barriers supported by CUDA
|
||||
#define NCCL_MAX_GROUPS 16
|
||||
|
||||
struct ncclWork {
|
||||
struct ncclWorkHeader header;
|
||||
union {
|
||||
char pad[NCCL_WORK_SIZE];
|
||||
struct ncclWorkElemHeader header;
|
||||
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
|
||||
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
|
||||
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
|
||||
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
|
||||
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
|
||||
|
||||
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
|
||||
|
||||
struct ncclChannel {
|
||||
union {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclDirect collTree;
|
||||
|
||||
int id;
|
||||
|
||||
// Communication structures
|
||||
struct ncclPeer* peers;
|
||||
struct ncclPeer* devPeers;
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclWork* workFifo;
|
||||
int workCount;
|
||||
size_t totalSize;
|
||||
uint64_t workFifoTail; // Only used by CPU
|
||||
uint16_t index; // Only used by GPU
|
||||
|
||||
// GDRCOPY support
|
||||
struct ncclWork* workFifoGdr;
|
||||
struct ncclWork* workFifoDev;
|
||||
void* gdrMemDesc;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
struct ncclDevChannelPeer {
|
||||
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
|
||||
// instead of the full ncclConnector.
|
||||
struct ncclConnInfo send[NCCL_MAX_CONNS];
|
||||
struct ncclConnInfo recv[NCCL_MAX_CONNS];
|
||||
};
|
||||
|
||||
struct alignas(16) ncclDevChannel {
|
||||
struct ncclDevChannelPeer *peers;
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclDirect collTree;
|
||||
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
|
||||
};
|
||||
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
||||
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// Operation list for aggregation
|
||||
int workFifoDepth;
|
||||
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile uint32_t* abortFlag;
|
||||
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
|
||||
};
|
||||
|
||||
struct ncclDevCommAndChannels {
|
||||
ncclDevComm comm;
|
||||
ncclChannel channels[MAXCHANNELS];
|
||||
struct alignas(16) ncclDevCommAndChannels {
|
||||
struct ncclDevComm comm;
|
||||
struct ncclDevChannel channels[MAXCHANNELS];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "comm.h"
|
||||
#include "group.h"
|
||||
#include "collectives.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
@ -17,117 +18,10 @@
|
||||
size_t ncclKernMaxLocalSize();
|
||||
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
|
||||
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
|
||||
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchReset(ncclComm_t comm);
|
||||
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
|
||||
template<int USING_CUDA_GRAPH>
|
||||
void CUDART_CB ncclEnqueueHostSetup(void* arg);
|
||||
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
|
||||
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
|
||||
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
|
||||
|
||||
struct ncclBuffRegInfo {
|
||||
void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS];
|
||||
void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS];
|
||||
void* sendbuffs[NCCL_MAX_LOCAL_RANKS];
|
||||
void* recvbuffs[NCCL_MAX_LOCAL_RANKS];
|
||||
int nBuffs;
|
||||
};
|
||||
|
||||
// Enqueue information (for kernel and proxy) for each operation
|
||||
struct ncclQueueElem {
|
||||
struct ncclWork work;
|
||||
struct ncclProxyOp proxyOp;
|
||||
struct ncclBuffRegInfo buffRegInfo;
|
||||
};
|
||||
|
||||
typedef ncclRecyclableList<struct ncclQueueElem> ncclQueueElemList;
|
||||
|
||||
// Structure passed to CUDA graph
|
||||
struct ncclQueueInfo {
|
||||
ncclComm_t comm;
|
||||
int maxChannels; // Dynamic version of gridDim
|
||||
ncclResult_t ret; // Return value of host setup call
|
||||
int nRegBuffs;
|
||||
ncclQueueElemList* elemList;
|
||||
};
|
||||
|
||||
static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) {
|
||||
NCCLCHECK(ncclCalloc(eqInfo, 1));
|
||||
(*eqInfo)->comm = comm;
|
||||
(*eqInfo)->elemList = new ncclQueueElemList();
|
||||
(*eqInfo)->comm->nQueueInfoCreated++;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Reset element queue
|
||||
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
|
||||
if (eqInfo == NULL) return ncclInternalError;
|
||||
eqInfo->maxChannels = 0;
|
||||
eqInfo->ret = ncclSuccess;
|
||||
eqInfo->nRegBuffs = 0;
|
||||
eqInfo->elemList->recycle();
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Destroy enqueue info space
|
||||
// used by both CUDA graph and non CUDA graph
|
||||
static void ncclDestroyQueueInfo(void* ptr) {
|
||||
if (ptr == NULL) return;
|
||||
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
|
||||
struct ncclComm* comm = eqInfo->comm;
|
||||
// Close IPC mem handles for registered buffers
|
||||
struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
|
||||
#if 0
|
||||
// Ideally, the deregistration should happen here
|
||||
// but currently the destroy function of CUDA objects does not allow CUDA API calls
|
||||
while (eqElem != NULL) {
|
||||
for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
|
||||
if (i == eqInfo->comm->localRank) continue;
|
||||
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
|
||||
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
|
||||
}
|
||||
eqElem = eqInfo->elemList->getNext();
|
||||
}
|
||||
#else
|
||||
// Instead, we push these pointers to a pool owned by ncclComm
|
||||
// and asks a helper thread to close mem handles
|
||||
struct ncclGraphHelperResources* res = comm->graphHelperResources;
|
||||
int ipcTailOld = 0;
|
||||
if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip;
|
||||
|
||||
pthread_mutex_lock(&res->threadLock);
|
||||
ipcTailOld = res->ipcTail;
|
||||
while (eqElem != NULL) {
|
||||
for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
|
||||
if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) {
|
||||
res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i];
|
||||
res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
|
||||
}
|
||||
if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) {
|
||||
res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i];
|
||||
res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
|
||||
}
|
||||
}
|
||||
eqElem = eqInfo->elemList->getNext();
|
||||
}
|
||||
if (res->ipcTail != ipcTailOld) {
|
||||
res->threadState = ThreadStart;
|
||||
TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld);
|
||||
pthread_cond_signal(&res->threadCond);
|
||||
}
|
||||
pthread_mutex_unlock(&res->threadLock);
|
||||
#endif
|
||||
|
||||
skip:
|
||||
delete eqInfo->elemList;
|
||||
free(eqInfo);
|
||||
comm->nQueueInfoDestroyed++;
|
||||
return;
|
||||
}
|
||||
#endif // End include guard
|
||||
|
@ -23,7 +23,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
|
||||
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
@ -33,7 +33,7 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
int ncclPxnDisable();
|
||||
int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
|
||||
|
||||
|
@ -10,15 +10,82 @@
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
|
||||
bool ncclAsyncMode();
|
||||
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
|
||||
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm);
|
||||
void ncclGroupCommPreconnect(struct ncclComm* comm);
|
||||
void ncclGroupCommLeave(struct ncclComm* comm);
|
||||
|
||||
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
||||
|
||||
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
struct ncclAsyncJob {
|
||||
struct ncclAsyncJob* next;
|
||||
pthread_t thread;
|
||||
ncclResult_t result;
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*);
|
||||
void(*undo)(struct ncclAsyncJob*);
|
||||
void(*destructor)(void*);
|
||||
};
|
||||
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
struct ncclAsyncJob* job,
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*),
|
||||
void(*undo)(struct ncclAsyncJob*),
|
||||
void(*destructor)(void*)
|
||||
);
|
||||
|
||||
ncclResult_t ncclGroupStartInternal();
|
||||
ncclResult_t ncclGroupEndInternal();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
|
||||
extern __thread ncclResult_t ncclGroupError;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead;
|
||||
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
||||
|
||||
inline ncclResult_t ncclGroupStartInternal() {
|
||||
ncclGroupDepth++;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupDepth > 0) {
|
||||
if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Add comm to this thread's group
|
||||
inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
|
||||
// the users program order yet insures siblings occur consecutively. This
|
||||
// is required by doLaunches() in "group.cc".
|
||||
struct ncclComm** pp = &ncclGroupCommHead;
|
||||
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
|
||||
pp = &(*pp)->groupNext;
|
||||
comm->groupNext = *pp;
|
||||
*pp = comm;
|
||||
// Comms gets a new memory stack scope upon joining. Each task batched for
|
||||
// this comm is allocated there.
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
}
|
||||
}
|
||||
|
||||
// Add comm to this thread's group needing preconnect
|
||||
inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
|
||||
if (comm->preconnectNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
comm->preconnectNext = ncclGroupCommPreconnectHead;
|
||||
ncclGroupCommPreconnectHead = comm;
|
||||
}
|
||||
}
|
||||
|
||||
// Comm has left group
|
||||
inline void ncclGroupCommLeave(struct ncclComm* comm) {
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
ncclMemoryStackPop(&comm->memScoped);
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncColl(ncclComm_t comm);
|
||||
#endif
|
||||
|
@ -1067,6 +1067,9 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
|
||||
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
|
||||
ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
|
||||
ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
|
||||
|
@ -10,6 +10,9 @@
|
||||
#include "nccl.h"
|
||||
#include "devcomm.h"
|
||||
#include "collectives.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "strongstream.h"
|
||||
|
||||
typedef enum : uint8_t {
|
||||
ncclPatternRing,
|
||||
@ -54,4 +57,62 @@ struct ncclInfo {
|
||||
int channelId;
|
||||
};
|
||||
|
||||
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
|
||||
info->nBytes = info->count * ncclTypeSize(info->datatype);
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
|
||||
info->count = info->nBytes;
|
||||
info->datatype = ncclInt8;
|
||||
}
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank
|
||||
return ncclSuccess;
|
||||
}
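
A short worked sketch of what the derivation above computes may help; the values below are assumptions chosen for illustration and are not taken from this commit.

// Illustrative only: an allgather of 1024 floats per rank on 8 ranks becomes a
// byte-level operation, and nBytes is then scaled by nRanks since count is per rank.
void infoSetDerivedExample() {
  struct ncclInfo info;
  info.coll = ncclFuncAllGather;
  info.count = 1024;            // elements contributed per rank
  info.datatype = ncclFloat32;  // 4 bytes each
  ncclInfoSetDerived(&info, /*nRanks=*/8);
  // Now: info.datatype == ncclInt8, info.count == 4096 (bytes per rank),
  // and info.nBytes == 4096 * 8 == 32768 (total bytes across the communicator).
}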

struct ncclTaskColl {
  struct ncclTaskColl* next;
  ncclFunc_t func;
  void const* sendbuff;
  void* recvbuff;
  size_t count;
  int root;
  ncclDataType_t datatype;
  ncclDevRedOpFull op;
  int chunkSteps, sliceSteps;
};
struct ncclTaskP2p {
  ncclTaskP2p *next;
  void *buff;
  size_t bytes;
  // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
  // of where it left off.
  int chunk;
};

struct ncclCudaStreamList {
  struct ncclCudaStreamList *next;
  cudaStream_t stream;
};

struct ncclTasks {
  struct Peer {
    bool sendSeen, recvSeen;
    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
  };
  struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
  size_t collBytesTotal;
  struct Peer* peers/*[nRanks]*/;
  int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
  int nTasksColl, nTasksP2p;

  // The list of user streams aggregated over all tasks present.
  struct ncclCudaStreamList* streams;
  // The most recent user stream. Ignored if streams==nullptr
  cudaStream_t streamRecent;
  // The graph capturing all user streams, or invalid if none. Thus we require
  // that all streams be captured in the same graph or not captured at all.
  // Technically we could probably relax this, but that would mean collecting a
  // different `ncclTasks` per graph and one for non-graph.
  struct ncclCudaGraph capturingGraph;
};

#endif

@ -14,12 +14,13 @@

#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_PTR_DMABUF 0x4

// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

@ -28,15 +29,15 @@ typedef struct {
  char* pciPath; // Path to the PCI device in /sys.
  uint64_t guid; // Unique identifier for the NIC chip. Important for
                 // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed; // Port speed in Mbps.
  int port; // Port number.
  float latency; // Network latency
  int maxComms; // Maximum number of comms we can create
  int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v5_t;
}ncclNetProperties_v6_t;

typedef ncclNetProperties_v5_t ncclNetProperties_t;
typedef ncclNetProperties_v6_t ncclNetProperties_t;

typedef struct {
  // Name of the network (mainly for logs)
@ -46,7 +47,103 @@ typedef struct {
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;

typedef ncclNet_v6_t ncclNet_t;

#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
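
For orientation, a bare-bones sketch of how an external network plugin might populate the v6 struct above and export it under the plugin symbol. All helper names below (exampleInit, exampleDevices, exampleGetProperties, exampleRegister) are invented for this sketch and are not part of NCCL or of this commit; a real plugin must implement every entry point of ncclNet_v6_t.

// Illustrative skeleton only; transport entry points are omitted.
static ncclResult_t exampleInit(ncclDebugLogger_t logFunction) { (void)logFunction; return ncclSuccess; }
static ncclResult_t exampleDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
static ncclResult_t exampleGetProperties(int dev, ncclNetProperties_v6_t* props) {
  (void)dev;
  props->pciPath = nullptr;
  props->guid = 0;
  props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA | NCCL_PTR_DMABUF; // advertise DMA-BUF registration
  props->speed = 100000; // Mbps
  props->port = 0;
  props->latency = 0.0f;
  props->maxComms = 1024;
  props->maxRecvs = 1;
  return ncclSuccess;
}

// NCCL looks the plugin up by this symbol name (ncclNetPlugin_v6) after dlopen.
extern "C" ncclNet_v6_t NCCL_PLUGIN_SYMBOL;
ncclNet_v6_t NCCL_PLUGIN_SYMBOL = {};
__attribute__((constructor)) static void exampleRegister() {
  NCCL_PLUGIN_SYMBOL.name = "example";
  NCCL_PLUGIN_SYMBOL.init = exampleInit;
  NCCL_PLUGIN_SYMBOL.devices = exampleDevices;
  NCCL_PLUGIN_SYMBOL.getProperties = exampleGetProperties;
  // listen/connect/accept/regMr/regMrDmaBuf/deregMr/isend/irecv/iflush/test/
  // closeSend/closeRecv/closeListen would be assigned here as well.
}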

typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
  // Initialize the collective network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters capable of doing collective operations.
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Create a group for collective operations. handles have been created
  // using listen() above. rank indicates caller's rank in the collective network.
  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
  // Returns whether a reduction operation on a data type is supported.
  // 1 for supported, 0 otherwise.
  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
  // Performs an asynchronous allreduce operation on the collective group.
  // May return request == NULL if the call cannot be performed (or would block).
  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
  // Close and free collective comm objects
  ncclResult_t (*closeColl)(void* collComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v6_t;

typedef ncclCollNet_v6_t ncclCollNet_t;

#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6

// v5 struct for backwards compatibility
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
@ -83,10 +180,7 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v5_t;

typedef ncclNet_v5_t ncclNet_t;

#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5

// v5 struct for backwards compatibility
typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
@ -96,7 +190,7 @@ typedef struct {
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
@ -125,10 +219,7 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v5_t;

typedef ncclCollNet_v5_t ncclCollNet_t;

#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5

// v4 struct for backwards compatibility
typedef struct {
  char* name; // Used mostly for logging.
  char* pciPath; // Path to the PCI device in /sys.
@ -140,6 +231,7 @@ typedef struct {
  int maxComms; // Maximum number of comms we can create
} ncclNetProperties_v4_t;

// v4 struct for backwards compatibility
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@ -179,6 +271,7 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;

// v4 struct for backwards compatibility
typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;

@ -9,33 +9,36 @@

#include "nccl.h"
#include "nccl_net.h"
#include "comm.h"
#include "checks.h"

extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

ncclResult_t ncclNetInit();
int ncclNetVersion();
ncclResult_t ncclNetPluginInit();
ncclResult_t ncclNetInit(struct ncclComm* comm);
int ncclNetVersion(struct ncclComm* comm);

// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }

// Test whether the current GPU supports GPU Direct RDMA.
ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);

extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;

@ -9,19 +9,4 @@
#ifndef NCCL_P2P_H_
#define NCCL_P2P_H_

struct ncclP2Pinfo {
  void* buff;
  ssize_t nbytes;
};

typedef ncclRecyclableList<struct ncclP2Pinfo> ncclP2Plist;

static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) {
  if (p2p == NULL) p2p = new ncclP2Plist();
  struct ncclP2Pinfo* next;
  NCCLCHECK(p2p->getNewElem(&next));
  next->buff = buff;
  next->nbytes = nBytes;
  return ncclSuccess;
}
#endif

@ -32,11 +32,16 @@ struct ncclProxyOp {
  int sliceSteps;
  int chunkSteps;
  int chunkSize;
  ncclDataType_t dtype;
  ncclRedOp_t redOp;
  ncclPattern_t pattern; // uint8_t
  uint8_t /*ncclDataType_t*/ dtype;
  uint8_t /*ncclDevRedOp_t*/ redOp;
  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t protocol;
  uint16_t pad;

  union {
    uint64_t unused;
    // For use by enqueue.cc
    struct ncclProxyOp *enqNext;
  };
};
static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");

@ -68,9 +73,9 @@ struct ncclProxyArgs {
  int sliceSteps;
  int chunkSteps;
  int chunkSize;
  ncclDataType_t dtype;
  ncclRedOp_t redOp;
  ncclPattern_t pattern;
  uint8_t /*ncclDataType_t*/ dtype;
  uint8_t /*ncclDevRedOp_t*/ redOp;
  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t protocol;
  int state;
  char* sharedBuff[NCCL_STEPS];
@ -158,6 +163,7 @@ struct ncclProxyState {
  pthread_t thread;
  struct ncclSocket* listenSock;
  int stop;
  CUcontext cudaCtx;

  // Used by main thread
  union ncclSocketAddress* peerAddresses;
@ -187,9 +193,8 @@ enum proxyMode {
  proxyTo = 2
};

ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);

src/include/strongstream.h (new file, 142 lines)
@ -0,0 +1,142 @@
/*************************************************************************
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_STRONGSTREAM_H_
#define NCCL_STRONGSTREAM_H_

#include "nccl.h"
#include "checks.h"

#include <stdint.h>

/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
 * easily.
 */
struct ncclCudaGraph {
#if CUDART_VERSION >= 11030
  cudaGraph_t graph;
  uint64_t graphId;
#endif
};

inline struct ncclCudaGraph ncclCudaGraphNull() {
  struct ncclCudaGraph tmp;
#if CUDART_VERSION >= 11030
  tmp.graph = nullptr;
  tmp.graphId = ULLONG_MAX;
#endif
  return tmp;
}

inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
#if CUDART_VERSION >= 11030
  return graph.graph != nullptr;
#else
  return false;
#endif
}

inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) {
#if CUDART_VERSION >= 11030
  return a.graphId == b.graphId;
#else
  return true;
#endif
}

ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream);
ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg);


/* ncclStrongStream: An abstraction over CUDA streams that do not lose their
 * identity while being captured. Regular streams have the deficiency that the
 * captured form of a stream in one graph launch has no relation to the
 * uncaptured stream or to the captured form in other graph launches. This makes
 * streams unfit for serializing access to a persistent resource.
 * Strong streams have been introduced to address this need.
 *
 * Constraints of using strong streams:
 *
 * - Operations that enqueue work to the strong stream need to be enclosed by
 *   ncclStrongStream[Acquire/Release] pairs. Acquire/release act like fences;
 *   the strong stream is not stateful, so there is no harm in redundant acquires
 *   or releases.
 *
 * - An {Acquire; ...; Release} sequence must not be concurrent with any
 *   other operations against the strong stream including graph launches which
 *   reference this stream.
 *
 * - All strong stream functions take a "graph" parameter which must reference
 *   the currently capturing graph, or null if none.
 */
struct ncclStrongStream;

ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);

// Has this strong stream ever been captured in a graph.
bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss);

// Acquire-fence the strong stream.
ncclResult_t ncclStrongStreamAcquire(
  struct ncclCudaGraph graph, struct ncclStrongStream* ss
);

// Acquire-fence the strong stream assuming no graph is capturing. This permits
// the caller to enqueue directly to the `ss->stream` member using native CUDA
// calls. Strong stream must be released via:
//   ncclStrongStreamRelease(ncclCudaGraphNull(), graphRefs, ss);
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);

// Release-fence of the strong stream.
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);

// Add a host launch to the stream.
ncclResult_t ncclStrongStreamLaunchHost(
  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
  cudaHostFn_t fn, void* arg
);
// Add a kernel launch to the stream.
ncclResult_t ncclStrongStreamLaunchKernel(
  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
  void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
);
// Cause `a` to wait for the current state of `b`. Both `a` and `b` must be acquired.
ncclResult_t ncclStrongStreamWaitStream(
  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
);
// `b` must be capturing within `graph`.
ncclResult_t ncclStrongStreamWaitStream(
  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
);
// `a` must be capturing within `graph`.
ncclResult_t ncclStrongStreamWaitStream(
  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
);

// Synchronization does not need the strong stream to be acquired.
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);

////////////////////////////////////////////////////////////////////////////////

struct ncclStrongStream {
  cudaStream_t stream;
  cudaEvent_t event;
#if CUDART_VERSION >= 11030
  cudaGraphNode_t node; // null if never captured, otherwise never null again
  uint64_t graphId:63, eventIsLagging:1;
#endif
};

inline bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss) {
#if CUDART_VERSION >= 11030
  return ss->node != nullptr;
#else
  return false;
#endif
}

#endif
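
To make the acquire/release discipline above concrete, here is a minimal sketch of how a caller might wrap one kernel launch in a strong stream. The wrapper name, kernel pointer, grid/block shapes and argument array are assumptions for illustration, not code from this commit.

// Illustrative only: enqueue one kernel onto a strong stream, honoring the
// Acquire/Release fences and the capturing-graph rule described above.
ncclResult_t launchOnStrongStream(struct ncclStrongStream* ss, cudaStream_t userStream,
                                  void* kernelFn, void** kernelArgs) {
  struct ncclCudaGraph graph;
  // Must reference the graph currently capturing userStream, or null if none.
  NCCLCHECK(ncclCudaGetCapturingGraph(&graph, userStream));
  NCCLCHECK(ncclStrongStreamAcquire(graph, ss));
  // Order the strong stream behind work already enqueued on the user stream.
  NCCLCHECK(ncclStrongStreamWaitStream(graph, ss, userStream));
  NCCLCHECK(ncclStrongStreamLaunchKernel(graph, ss, kernelFn, dim3(1), dim3(256), kernelArgs, 0));
  // Make the user stream wait for the strong stream before control returns.
  NCCLCHECK(ncclStrongStreamWaitStream(graph, userStream, ss));
  NCCLCHECK(ncclStrongStreamRelease(graph, ss));
  return ncclSuccess;
}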

@ -20,7 +20,12 @@

#include "proxy.h"

extern struct ncclTransport ncclTransports[];
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
extern struct ncclTransport netTransport;
extern struct ncclTransport collNetTransport;

extern struct ncclTransport* ncclTransports[];

// Forward declarations
struct ncclRing;
@ -63,7 +68,7 @@ struct ncclTransport {
  struct ncclTransportComm recv;
};

ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);

enum { collNetRecv=0, collNetSend=1 };

@ -8,8 +8,12 @@
#define NCCL_UTILS_H_

#include "nccl.h"
#include "alloc.h"
#include "checks.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <new>

int ncclCudaCompCap();

@ -38,81 +42,446 @@ static long log2i(long n) {
  return l;
}

// Recyclable list that avoids frequent malloc/free
inline uint64_t clockNano() {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
}

////////////////////////////////////////////////////////////////////////////////

template<typename Int>
inline void ncclAtomicRefCountIncrement(Int* refs) {
  __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
}

template<typename Int>
inline Int ncclAtomicRefCountDecrement(Int* refs) {
  return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
}

////////////////////////////////////////////////////////////////////////////////
/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
 * granularity of LIFO is not per object, instead frames containing many objects
 * are pushed and popped. Therefore deallocation is extremely cheap since it is
 * done at the frame granularity.
 *
 * The initial state of the stack is with one frame, the "nil" frame, which
 * cannot be popped. Therefore objects allocated in the nil frame cannot be
 * deallocated sooner than stack destruction.
 */
struct ncclMemoryStack;

void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
void ncclMemoryStackPush(struct ncclMemoryStack* me);
void ncclMemoryStackPop(struct ncclMemoryStack* me);
template<typename T>
struct ncclListElem {
  T data;
  struct ncclListElem* next;
T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);

////////////////////////////////////////////////////////////////////////////////
/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for
 * a pool instance to ever hold objects whose types have differing
 * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
 * a backing `ncclMemoryStack` passed during Alloc(). If memory
 * backing any currently held object is deallocated then it is an error to do
 * anything other than reconstruct it, after which it is a valid empty pool.
 */
struct ncclMemoryPool;

// Equivalent to zero-initialization
void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
template<typename T>
T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
template<typename T>
void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);

////////////////////////////////////////////////////////////////////////////////
/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
 * field is given via the `next` template argument.
 *
 * Example:
 *   struct Foo {
 *     struct Foo *next1, *next2; // can be a member of two lists at once
 *   };
 *   ncclIntruQueue<Foo, &Foo::next1> list1;
 *   ncclIntruQueue<Foo, &Foo::next2> list2;
 */
template<typename T, T *T::*next>
struct ncclIntruQueue;

template<typename T, T *T::*next>
void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
template<typename T, T *T::*next>
T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);

////////////////////////////////////////////////////////////////////////////////
/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
 * and "cond" fields are part of the public interface.
 */
struct ncclThreadSignal {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
};

template<typename T>
class ncclRecyclableList {
 private:
  struct ncclListElem<T>* head;
  struct ncclListElem<T>* tail;
  struct ncclListElem<T>* cursor;
  int n;
// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();

 public:
  ncclRecyclableList() {
    tail = cursor = head = NULL;
    n = 0;
  }
void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
void ncclThreadSignalDestruct(struct ncclThreadSignal* me);

  int count() const { return n; }
// A convenience instance per-thread.
extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;

  // Get a new element from the list and return pointer
  ncclResult_t getNewElem(T** dataOut) {
    if (tail != NULL) {
      *dataOut = &tail->data;
      memset(*dataOut, 0, sizeof(T));
    } else {
      NCCLCHECK(ncclCalloc(&tail, 1));
      *dataOut = &tail->data;
      cursor = head = tail;
    }
    if (tail->next == NULL) {
      NCCLCHECK(ncclCalloc(&tail->next, 1));
    }
    tail = tail->next;
    n += 1;
    return ncclSuccess;
  }
////////////////////////////////////////////////////////////////////////////////

  T* begin() {
    if (head == NULL || head == tail) return NULL;
    cursor = head->next;
    return &head->data;
  }
template<typename T, T *T::*next>
struct ncclIntruQueueMpsc;

  // Get next element from the list during an iteration
  T* getNext() {
    // tail always points to the next element to be enqueued
    // hence does not contain valid data
    if (cursor == NULL || cursor == tail) return NULL;
    T* rv = &cursor->data;
    cursor = cursor->next;
    return rv;
  }
template<typename T, T *T::*next>
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me);
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me);
// Enqueue element. Returns true if queue is not abandoned. Even if the queue is
// abandoned the element is enqueued, so the caller needs to make arrangements for
// the queue to be tended.
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T,next>* me, T* x);
// Dequeue all elements at a glance. If there aren't any and `waitSome` is
// true then this call will wait until it can return a non-empty list.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T,next>* me, bool waitSome);
// Dequeue all elements and set queue to abandoned state.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T,next>* me);

  T* peakNext() {
    if (cursor == NULL || cursor == tail) return NULL;
    return &cursor->data;
  }
////////////////////////////////////////////////////////////////////////////////

  // Recycle the list without freeing the space
  void recycle() {
    tail = cursor = head;
    n = 0;
  }
struct ncclMemoryStack {
  struct Hunk {
    struct Hunk* above; // reverse stack pointer
    size_t size; // size of this allocation (including this header struct)
  };
  struct Unhunk { // proxy header for objects allocated out-of-hunk
    struct Unhunk* next;
    void* obj;
  };
  struct Frame {
    struct Hunk* hunk; // top of non-empty hunks
    uintptr_t bumper, end; // points into top hunk
    struct Unhunk* unhunks;
    struct Frame* below;
  };

  ~ncclRecyclableList() {
    while (head != NULL) {
      struct ncclListElem<T>* temp = head;
      head = head->next;
      free(temp);
    }
  }
  static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
  static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);

  struct Hunk stub;
  struct Frame topFrame;
};

inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
  me->stub.above = nullptr;
  me->stub.size = 0;
  me->topFrame.hunk = &me->stub;
  me->topFrame.bumper = 0;
  me->topFrame.end = 0;
  me->topFrame.unhunks = nullptr;
  me->topFrame.below = nullptr;
}

inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
  uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align);
  void* obj;
  if (__builtin_expect(o + size <= me->topFrame.end, true)) {
    me->topFrame.bumper = o + size;
    obj = reinterpret_cast<void*>(o);
  } else {
    obj = allocateSpilled(me, size, align);
  }
  return obj;
}

template<typename T>
inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
  void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T));
  memset(obj, 0, n*sizeof(T));
  return (T*)obj;
}

inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
  using Frame = ncclMemoryStack::Frame;
  Frame tmp = me->topFrame;
  Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
  *snapshot = tmp; // C++ struct assignment
  me->topFrame.unhunks = nullptr;
  me->topFrame.below = snapshot;
}

inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
  ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
  while (un != nullptr) {
    free(un->obj);
    un = un->next;
  }
  me->topFrame = *me->topFrame.below; // C++ struct assignment
}
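
A quick usage sketch of the frame-granular LIFO described above; the function name, types and sizes are made up for illustration and are not part of this commit.

// Illustrative only: objects allocated after a Push are reclaimed, all at once,
// by the matching Pop.
void memoryStackExample() {
  struct ncclMemoryStack stack;
  ncclMemoryStackConstruct(&stack);
  int* persistent = ncclMemoryStackAlloc<int>(&stack, 16); // nil frame: lives until destruct
  ncclMemoryStackPush(&stack);                             // open a scope
  double* scratch = ncclMemoryStackAlloc<double>(&stack, 1024);
  // ... use persistent and scratch ...
  (void)persistent; (void)scratch;
  ncclMemoryStackPop(&stack);                              // frees everything since the Push
  ncclMemoryStackDestruct(&stack);
}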

////////////////////////////////////////////////////////////////////////////////

struct ncclMemoryPool {
  struct Cell {
    Cell *next;
  };
  template<int Size, int Align>
  union CellSized {
    Cell cell;
    alignas(Align) char space[Size];
  };
  struct Cell* head;
  struct Cell* tail; // meaningful only when head != nullptr
};

inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
  me->head = nullptr;
}

template<typename T>
inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
  using Cell = ncclMemoryPool::Cell;
  using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
  Cell* cell;
  if (__builtin_expect(me->head != nullptr, true)) {
    cell = me->head;
    me->head = cell->next;
  } else {
    // Use the internal allocate() since it doesn't memset to 0 yet.
    cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
  }
  memset(cell, 0, sizeof(T));
  return reinterpret_cast<T*>(cell);
}

template<typename T>
inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
  using Cell = ncclMemoryPool::Cell;
  Cell* cell = reinterpret_cast<Cell*>(obj);
  cell->next = me->head;
  if (me->head == nullptr) me->tail = cell;
  me->head = cell;
}

inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
  if (from->head != nullptr) {
    from->tail->next = me->head;
    if (me->head == nullptr) me->tail = from->tail;
    me->head = from->head;
    from->head = nullptr;
  }
}

////////////////////////////////////////////////////////////////////////////////

template<typename T, T *T::*next>
struct ncclIntruQueue {
  T *head, *tail;
};

template<typename T, T *T::*next>
inline void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me) {
  me->head = nullptr;
  me->tail = nullptr;
}

template<typename T, T *T::*next>
inline bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me) {
  return me->head == nullptr;
}

template<typename T, T *T::*next>
inline T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me) {
  return me->head;
}

template<typename T, T *T::*next>
inline T* ncclIntruQueueTail(ncclIntruQueue<T,next> *me) {
  return me->tail;
}

template<typename T, T *T::*next>
inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
  x->*next = nullptr;
  (me->head ? me->tail->*next : me->head) = x;
  me->tail = x;
}

template<typename T, T *T::*next>
inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
  T *ans = me->head;
  me->head = ans->*next;
  if (me->head == nullptr) me->tail = nullptr;
  return ans;
}

template<typename T, T *T::*next>
inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
  T *ans = me->head;
  if (ans != nullptr) {
    me->head = ans->*next;
    if (me->head == nullptr) me->tail = nullptr;
  }
  return ans;
}

template<typename T, T *T::*next>
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
  T *head = me->head;
  me->head = nullptr;
  me->tail = nullptr;
  while (head != nullptr) {
    T *tmp = head->*next;
    ncclMemoryPoolFree(pool, tmp);
    head = tmp;
  }
}
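
A usage sketch of the intrusive queue above, with its memory coming from the pool/stack pair defined earlier; the `Job` type and the function name are invented for the example.

// Illustrative only: a small type threaded through an intrusive queue via its
// embedded next pointer.
struct Job {
  int id;
  struct Job* next;   // link used by the queue below
};

void intruQueueExample(struct ncclMemoryStack* backing, struct ncclMemoryPool* pool) {
  ncclIntruQueue<Job, &Job::next> q;
  ncclIntruQueueConstruct(&q);
  Job* j = ncclMemoryPoolAlloc<Job>(pool, backing); // returns a zeroed cell
  j->id = 42;
  ncclIntruQueueEnqueue(&q, j);
  while (!ncclIntruQueueEmpty(&q)) {
    Job* got = ncclIntruQueueDequeue(&q);
    // ... process got->id ...
    ncclMemoryPoolFree(pool, got);
  }
}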

////////////////////////////////////////////////////////////////////////////////

constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
  return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER};
}

inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
  pthread_mutex_init(&me->mutex, nullptr);
  pthread_cond_init(&me->cond, nullptr);
}

inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
  pthread_mutex_destroy(&me->mutex);
  pthread_cond_destroy(&me->cond);
}

////////////////////////////////////////////////////////////////////////////////

template<typename T, T *T::*next>
struct ncclIntruQueueMpsc {
  T* head;
  uintptr_t tail;
  struct ncclThreadSignal* waiting;
};

template<typename T, T *T::*next>
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me) {
  me->head = nullptr;
  me->tail = 0x0;
  me->waiting = nullptr;
}

template<typename T, T *T::*next>
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me) {
  return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
}

template<typename T, T *T::*next>
bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T,next>* me, T* x) {
  __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
  uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
  T* prev = reinterpret_cast<T*>(utail);
  T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
  __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
  if (utail == 0x1) { // waiting
    __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
    // This lock/unlock is essential to ensure we don't race ahead of the consumer
    // and signal the cond before they begin waiting on it.
    struct ncclThreadSignal* waiting = me->waiting;
    pthread_mutex_lock(&waiting->mutex);
    pthread_mutex_unlock(&waiting->mutex);
    pthread_cond_broadcast(&waiting->cond);
  }
  return utail != 0x2; // not abandoned
}

template<typename T, T *T::*next>
T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T,next>* me, bool waitSome) {
  T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
  if (head == nullptr) {
    if (!waitSome) return nullptr;
    uint64_t t0 = clockNano();
    bool sleeping = false;
    do {
      if (clockNano()-t0 >= 10*1000) { // spin for first 10us
        struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
        pthread_mutex_lock(&waitSignal->mutex);
        uintptr_t expected = sleeping ? 0x1 : 0x0;
        uintptr_t desired = 0x1;
        me->waiting = waitSignal; // release done by successful compare exchange
        if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
          sleeping = true;
          pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
        }
        pthread_mutex_unlock(&waitSignal->mutex);
      }
      head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
    } while (head == nullptr);
  }

  __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
  uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
  T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
  T *x = head;
  while (x != tail) {
    T *x1;
    int spins = 0;
    while (true) {
      x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
      if (x1 != nullptr) break;
      if (++spins == 1024) { spins = 1024-1; sched_yield(); }
    }
    x = x1;
  }
  return head;
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T,next>* me) {
|
||||
uintptr_t expected = 0x0;
|
||||
if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
|
||||
return nullptr;
|
||||
} else {
|
||||
int spins = 0;
|
||||
T* head;
|
||||
while (true) {
|
||||
head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
|
||||
if (head != nullptr) break;
|
||||
if (++spins == 1024) { spins = 1024-1; sched_yield(); }
|
||||
}
|
||||
__atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
|
||||
uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
|
||||
T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
|
||||
T *x = head;
|
||||
while (x != tail) {
|
||||
T *x1;
|
||||
spins = 0;
|
||||
while (true) {
|
||||
x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
|
||||
if (x1 != nullptr) break;
|
||||
if (++spins == 1024) { spins = 1024-1; sched_yield(); }
|
||||
}
|
||||
x = x1;
|
||||
}
|
||||
return head;
|
||||
}
|
||||
}
|
||||
#endif
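For reference, a minimal usage sketch of the new intrusive MPSC queue defined above (illustrative only, not part of the commit; the Item type and the functions around it are hypothetical, and the header is assumed to be in scope). It mirrors how the communicator callback queue is drained later in this change:

struct Item {
  struct Item* next;  // intrusive link owned by the queue
  int payload;
};

static ncclIntruQueueMpsc<Item, &Item::next> itemQueue;

void itemQueueInit() {
  ncclIntruQueueMpscConstruct(&itemQueue);
}

// Any number of producer threads may enqueue concurrently.
void itemProduce(Item* it) {
  // Returns false if the consumer has already abandoned the queue.
  (void)ncclIntruQueueMpscEnqueue(&itemQueue, it);
}

// Single consumer: take the whole batch, sleeping if nothing is pending.
void itemConsume() {
  Item* head = ncclIntruQueueMpscDequeueAll(&itemQueue, /*waitSome=*/true);
  while (head != nullptr) {
    Item* next = head->next;  // read the link before the element is reclaimed
    // ... process head->payload ...
    head = next;
  }
}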
600  src/init.cc
@ -28,10 +28,6 @@
|
||||
#define STR2(v) #v
|
||||
#define STR(v) STR2(v)
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 9020
|
||||
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
|
||||
#else
|
||||
@ -46,6 +42,17 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||
|
||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||
|
||||
static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
||||
char const *bytes = (char const*)&id;
|
||||
uint64_t h = 0xdeadbeef;
|
||||
for(int i=0; i < (int)sizeof(ncclUniqueId); i++) {
|
||||
h ^= h >> 32;
|
||||
h *= 0x8db3db47fa2994ad;
|
||||
h += bytes[i];
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
// GDRCOPY support: Off by default
|
||||
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
|
||||
|
||||
@ -65,18 +72,28 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
|
||||
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static bool initialized = false;
|
||||
static size_t maxLocalSizeBytes = 0;
|
||||
|
||||
bool ncclMainExited = false;
|
||||
|
||||
static void atexitHandler() {
|
||||
ncclMainExited = true;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclInit() {
|
||||
if (initialized) return ncclSuccess;
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
|
||||
pthread_mutex_lock(&initLock);
|
||||
if (!initialized) {
|
||||
atexit(atexitHandler);
|
||||
initEnv();
|
||||
initGdrCopy();
|
||||
maxLocalSizeBytes = ncclKernMaxLocalSize();
|
||||
int carveout = ncclParamL1SharedMemoryCarveout();
|
||||
if (carveout) ncclKernSetSharedMemoryCarveout(carveout);
|
||||
NCCLCHECK(ncclNetInit());
|
||||
INFO(NCCL_INIT, "Using network %s", ncclNetName());
|
||||
initialized = true;
|
||||
// Always initialize bootstrap network
|
||||
NCCLCHECK(bootstrapNetInit());
|
||||
NCCLCHECK(ncclNetPluginInit());
|
||||
|
||||
__atomic_store_n(&initialized, true, __ATOMIC_RELEASE);
|
||||
}
|
||||
pthread_mutex_unlock(&initLock);
|
||||
return ncclSuccess;
|
||||
@ -93,7 +110,9 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
|
||||
NCCLCHECK(ncclInit());
|
||||
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
|
||||
return bootstrapGetUniqueId(out);
|
||||
ncclResult_t res = bootstrapGetUniqueId(out);
|
||||
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
|
||||
return res;
|
||||
}
|
||||
|
||||
// Prevent compiler from optimizing out these operations
|
||||
@ -104,11 +123,96 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
|
||||
#endif
|
||||
|
||||
void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
|
||||
// Important that this does not trash intraComm0 & intraRefs.
|
||||
comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1;
|
||||
}
|
||||
|
||||
#undef NCCL_NO_OPTIMIZE
|
||||
|
||||
|
||||
static ncclResult_t ncclDestructorFnFree(struct ncclDestructor* dtor) {
|
||||
free(dtor->obj);
|
||||
return ncclSuccess;
|
||||
}
|
||||
void ncclCommPushFree(struct ncclComm* comm, void* obj) {
|
||||
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
|
||||
dtor->fn = ncclDestructorFnFree;
|
||||
dtor->obj = obj;
|
||||
dtor->next = comm->destructorHead;
|
||||
comm->destructorHead = dtor;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) {
|
||||
CUDACHECK(cudaFree(dtor->obj));
|
||||
return ncclSuccess;
|
||||
}
|
||||
void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
|
||||
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
|
||||
dtor->fn = ncclDestructorFnCudaFree;
|
||||
dtor->obj = obj;
|
||||
dtor->next = comm->destructorHead;
|
||||
comm->destructorHead = dtor;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) {
|
||||
CUDACHECK(cudaFreeHost(dtor->obj));
|
||||
return ncclSuccess;
|
||||
}
|
||||
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) {
|
||||
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
|
||||
dtor->fn = ncclDestructorFnCudaHostFree;
|
||||
dtor->obj = obj;
|
||||
dtor->next = comm->destructorHead;
|
||||
comm->destructorHead = dtor;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclDestructorFnCudaGdrFree(struct ncclDestructor* dtor) {
|
||||
NCCLCHECK(ncclGdrCudaFree(dtor->obj));
|
||||
return ncclSuccess;
|
||||
}
|
||||
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) {
|
||||
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
|
||||
dtor->fn = ncclDestructorFnCudaGdrFree;
|
||||
dtor->obj = handle;
|
||||
dtor->next = comm->destructorHead;
|
||||
comm->destructorHead = dtor;
|
||||
}
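The ncclCommPush*Free helpers above register cleanup actions that run when the destructor list is walked at communicator teardown. A brief illustrative sketch of the calling pattern (scratch and scratchBytes are placeholders):

void* scratch = nullptr;
CUDACHECK(cudaMalloc(&scratch, scratchBytes));
ncclCommPushCudaFree(comm, scratch);  // cudaFree'd automatically when the comm is freed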
|
||||
|
||||
void commZombieCleanup(struct ncclComm* comm) {
|
||||
ncclMemoryStackDestruct(&comm->memScoped);
|
||||
ncclMemoryStackDestruct(&comm->memPermanent);
|
||||
|
||||
struct ncclComm* intraComm0 = comm->intraComm0;
|
||||
if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) {
|
||||
// Wait for all service threads to be done. We could not
// do it earlier because it could have blocked and prevented
// other ranks in the process from calling ncclCommDestroy
|
||||
comm = intraComm0;
|
||||
while (comm != nullptr) {
|
||||
if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
|
||||
struct ncclComm* next = comm->intraNext;
|
||||
free(comm);
|
||||
comm = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void* commZombieMain(void* arg) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclComm* comm = (struct ncclComm*)arg;
|
||||
while (comm->persistentRefs != 0) {
|
||||
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/true);
|
||||
while (cb != nullptr) {
|
||||
struct ncclCommCallback* next = cb->next;
|
||||
NCCLCHECKGOTO(cb->fn(comm, cb), result, ignore); // may reclaim memory of cb
|
||||
ignore:
|
||||
cb = next;
|
||||
}
|
||||
}
|
||||
commZombieCleanup(comm);
|
||||
return arg;
|
||||
}
|
||||
|
||||
static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
@ -120,13 +224,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
|
||||
free(comm->connectSend);
|
||||
free(comm->connectRecv);
|
||||
for (int peer=0; peer<comm->nRanks; peer++) {
|
||||
delete comm->p2pSends[peer];
|
||||
delete comm->p2pRecvs[peer];
|
||||
}
|
||||
free(comm->p2pSends);
|
||||
free(comm->p2pRecvs);
|
||||
free(comm->asyncOps);
|
||||
|
||||
free(comm->peerInfo);
|
||||
ncclTopoFree(comm->topo);
|
||||
@ -138,51 +235,60 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm->bootstrap)
|
||||
NCCLCHECK(bootstrapClose(comm->bootstrap));
|
||||
|
||||
CUDACHECK(cudaFree((ncclDevCommAndChannels*)comm->devComm));
|
||||
|
||||
for (int channel=0; channel<MAXCHANNELS; channel++)
|
||||
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
|
||||
|
||||
if (comm->doneEvent != NULL)
|
||||
CUDACHECK(cudaEventDestroy(comm->doneEvent));
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
|
||||
|
||||
if (comm->intDoneEvent != NULL)
|
||||
CUDACHECK(cudaEventDestroy(comm->intDoneEvent));
|
||||
|
||||
if (comm->launchMode == ncclComm::GROUP) {
|
||||
CUDACHECK(cudaStreamDestroy(comm->groupStream));
|
||||
}
|
||||
|
||||
// Last rank frees shared resources between threads
|
||||
int isLast;
|
||||
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
|
||||
if (isLast) {
|
||||
// Wait for all service threads to be done. We could not
|
||||
// do it earlier because it could have blocked and prevented
|
||||
// other ranks in the process to call ncclCommDestroy
|
||||
for (int i=0; i<comm->intraRanks; i++) {
|
||||
void* ret;
|
||||
if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret);
|
||||
}
|
||||
free(comm->intraBarrier);
|
||||
free(comm->intraParams);
|
||||
free(comm->intraThreads);
|
||||
free(comm->intraCudaDevs);
|
||||
free(comm->intraCGMode);
|
||||
free(comm->intraCC);
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
|
||||
|
||||
// Poison comm to try and catch a double free
|
||||
commPoison(comm);
|
||||
struct ncclDestructor* dtor = comm->destructorHead;
|
||||
while (dtor != nullptr) {
|
||||
NCCLCHECK(dtor->fn(dtor));
|
||||
dtor = dtor->next;
|
||||
}
|
||||
|
||||
free(comm);
|
||||
commPoison(comm); // Important that this does not interfere with anything used below.
|
||||
|
||||
if (comm->persistentRefs == 0) {
|
||||
commZombieCleanup(comm);
|
||||
} else {
|
||||
// Spawn a thread to listen for remaining messages from graph cleanup.
|
||||
pthread_t zombie;
|
||||
pthread_create(&zombie, nullptr, commZombieMain, comm);
|
||||
pthread_detach(zombie);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(AggChannelSize, "AGG_CHANNEL_SIZE", -2);
|
||||
NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0);
|
||||
NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0);
|
||||
// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
|
||||
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
|
||||
NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10);
|
||||
enum ncclLaunchMode ncclParamLaunchMode;
|
||||
|
||||
NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1);
|
||||
|
||||
// Detect DMA-BUF support
|
||||
static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
|
||||
if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL) return ncclInternalError;
|
||||
#if CUDA_VERSION >= 11070
|
||||
int flag = 0;
|
||||
CUdevice dev;
|
||||
int cudaDriverVersion;
|
||||
CUCHECK(cuDriverGetVersion(&cudaDriverVersion));
|
||||
if (cudaDriverVersion < 11070) return ncclInternalError;
|
||||
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
|
||||
// Query device to see if DMA-BUF support is available
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
|
||||
if (flag == 0) return ncclInternalError;
|
||||
INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev);
|
||||
return ncclSuccess;
|
||||
#endif
|
||||
return ncclInternalError;
|
||||
}
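Downstream, this probe gates DMA-BUF registration of network buffers. A hedged sketch of how a dmabuf file descriptor is typically obtained for a GPU memory range once the probe succeeds (illustrative; addr and size are placeholders, and the exact call site in the transports may differ):

if (comm->dmaBufSupport) {
  int dmabuf_fd = -1;
  // Requires CUDA 11.7+: ask the driver for a dmabuf fd covering [addr, addr+size).
  CUCHECK(cuMemGetHandleForAddressRange(&dmabuf_fd, (CUdeviceptr)addr, size,
                                        CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
  // The fd is then passed to the network plugin's regMrDmaBuf() added by NET API v6.
}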
|
||||
|
||||
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
if (ndev < 1) {
|
||||
@ -194,100 +300,114 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
// Try to create a CUDA object right away. If there is something wrong with
|
||||
// the device we're on (failure cause #1) , better know it early.
|
||||
cudaEvent_t doneEvent;
|
||||
CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
|
||||
cudaEvent_t intDoneEvent;
|
||||
CUDACHECK(cudaEventCreateWithFlags(&intDoneEvent, cudaEventDisableTiming));
|
||||
|
||||
struct ncclComm* comm;
|
||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||
ncclMemoryStackConstruct(&comm->memPermanent);
|
||||
ncclMemoryStackConstruct(&comm->memScoped);
|
||||
comm->destructorHead = nullptr;
|
||||
comm->rank = rank;
|
||||
comm->nRanks = ndev;
|
||||
|
||||
NCCLCHECK(ncclNetInit(comm));
|
||||
INFO(NCCL_INIT, "Using network %s", ncclNetName(comm));
|
||||
|
||||
// Try to create a CUDA object right away. If there is something wrong with
|
||||
// the device we're on (failure cause #1) , better know it early.
|
||||
NCCLCHECK(ncclStrongStreamConstruct(&comm->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamConstruct(&comm->hostStream));
|
||||
|
||||
comm->rank = comm->hostDevComm.rank = rank;
|
||||
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
||||
cudaGetDevice(&comm->cudaDev);
|
||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
|
||||
|
||||
comm->doneEvent = doneEvent;
|
||||
comm->intDoneEvent = intDoneEvent;
|
||||
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
||||
#if CUDART_VERSION >= 9020
|
||||
comm->groupCudaStream = ncclParamGroupCudaStream();
|
||||
#else
|
||||
// Don't allow the user to overload the default setting in older CUDA builds
|
||||
comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
|
||||
#endif
|
||||
comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
|
||||
comm->fatalError = ncclSuccess;
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
|
||||
comm->hostDevComm.abortFlag = comm->abortFlag;
|
||||
*comm->abortFlag = 0;
|
||||
|
||||
comm->argsptrs[0] = &comm->devComm;
|
||||
comm->argsptrs[1] = &comm->args;
|
||||
comm->collNetSupport = 0;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList);
|
||||
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->channelSize = ncclParamAggChannelSize();
|
||||
comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE;
|
||||
char* str = getenv("NCCL_AGG_ALLOC_MODE");
|
||||
if (str) INFO(NCCL_ENV, "NCCL_AGG_ALLOC_MODE set by environment to %s", str);
|
||||
if (str && strcmp(str, "ROUND_ROBIN") == 0) {
|
||||
comm->asyncAllocMode = ncclComm::ROUND_ROBIN;
|
||||
}
|
||||
|
||||
CUDACHECK(cudaDriverGetVersion(&comm->driverVersion));
|
||||
|
||||
NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm));
|
||||
comm->lastSetupNode = NULL;
|
||||
comm->lastCudaGraphId = -1;
|
||||
comm->disableGraphHelper = ncclParamDisableGraphHelper();
|
||||
comm->graphRegister = ncclParamGraphRegister();
|
||||
#if CUDART_VERSION >= 11030
|
||||
NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
|
||||
comm->graphHelperResources->comm = comm;
|
||||
if (comm->driverVersion >= 11030)
|
||||
// cudaGetDriverEntryPoint requires R465 or above (enhanced compat need)
|
||||
CUDACHECK(cudaGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, cudaEnableDefault));
|
||||
#endif
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
|
||||
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
|
||||
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
|
||||
for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
|
||||
|
||||
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
|
||||
|
||||
*comret = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
ncclDevCommAndChannels *devCommAndChans;
|
||||
NCCLCHECK(ncclCudaCalloc(&devCommAndChans, 1));
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
|
||||
|
||||
int nRanks = comm->nRanks;
|
||||
struct ncclDevCommAndChannels *devCommAndChans, tmpCommAndChans;
|
||||
NCCLCHECK(ncclCudaCallocAsync(&devCommAndChans, 1, comm->deviceStream.stream));
|
||||
ncclCommPushCudaFree(comm, devCommAndChans);
|
||||
comm->devComm = &devCommAndChans->comm;
|
||||
comm->hostDevComm.channels = devCommAndChans->channels;
|
||||
tmpCommAndChans.comm.rank = comm->rank;
|
||||
tmpCommAndChans.comm.nRanks = nRanks;
|
||||
tmpCommAndChans.comm.abortFlag = comm->abortFlag;
|
||||
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
|
||||
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
|
||||
}
|
||||
tmpCommAndChans.comm.channels = &devCommAndChans->channels[0];
|
||||
|
||||
// Duplicate the channels on the device
|
||||
int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels));
|
||||
comm->workFifoDepth = ncclParamWorkFifoDepth();
|
||||
if (0 != (comm->workFifoDepth & (comm->workFifoDepth-1))) {
|
||||
WARN("NCCL_WORK_FIFO_DEPTH=%d is being ignored because it is not a power of 2.", comm->workFifoDepth);
|
||||
comm->workFifoDepth = 64<<10;
|
||||
}
|
||||
tmpCommAndChans.comm.workFifoDepth = comm->workFifoDepth;
|
||||
|
||||
// Copy userRanks and peers
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
|
||||
// The workFifoHeap lives in GDR mapped CUDA memory.
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&comm->workFifoHeap, &comm->devWorkFifoHeap, comm->workFifoDepth, &comm->workFifoHeapGdrHandle));
|
||||
ncclCommPushCudaGdrFree(comm, comm->workFifoHeapGdrHandle);
|
||||
} else {
|
||||
// The workFifoHeap lives in cudaHost memory.
|
||||
comm->workFifoHeapGdrHandle = nullptr;
|
||||
NCCLCHECK(ncclCudaHostCalloc(&comm->workFifoHeap, comm->workFifoDepth));
|
||||
ncclCommPushCudaHostFree(comm, comm->workFifoHeap);
|
||||
comm->devWorkFifoHeap = comm->workFifoHeap;
|
||||
}
|
||||
tmpCommAndChans.comm.workFifoHeap = comm->devWorkFifoHeap;
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&comm->workFifoDone, MAXCHANNELS));
|
||||
ncclCommPushCudaHostFree(comm, comm->workFifoDone);
|
||||
comm->workFifoSent = 0;
|
||||
comm->workFifoAckdMin = 0;
|
||||
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers;
|
||||
tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
|
||||
tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks;
|
||||
tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
|
||||
tmpCommAndChans.channels[c].collTree = comm->channels[c].collTree;
|
||||
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
|
||||
|
||||
if (comm->channels[c].ring.userRanks != nullptr) {
|
||||
NCCLCHECK(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->deviceStream.stream));
|
||||
}
|
||||
}
|
||||
|
||||
// Duplicate the dev comm on the device
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
|
||||
NCCLCHECK(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.stream));
|
||||
CUDACHECK(cudaStreamSynchronize(comm->deviceStream.stream));
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -319,7 +439,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
|
||||
info->busId = comm->busId;
|
||||
|
||||
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
|
||||
NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport));
|
||||
info->comm = comm;
|
||||
info->cudaCompCap = ncclCudaCompCap();
|
||||
return ncclSuccess;
|
||||
@ -343,84 +463,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* waitForNonNullPtr(void* p) {
|
||||
volatile void** ptr = (volatile void**) p;
|
||||
while (*ptr == NULL) sched_yield();
|
||||
return (void*)*ptr;
|
||||
}
|
||||
|
||||
ncclResult_t initParams(struct ncclComm* comm) {
|
||||
struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
|
||||
params->args = comm->argsptrs;
|
||||
params->stream = NULL;
|
||||
params->sharedMem = 0;
|
||||
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
|
||||
params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Allocate/Set Intra Process Structures and set CG options
|
||||
ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
|
||||
comm->intraRank = rank;
|
||||
comm->intraRanks = ranks;
|
||||
comm->intraPhase = 0;
|
||||
|
||||
// Alloc shared structures
|
||||
if (rank == 0) {
|
||||
assert(comm == comm0);
|
||||
int* bar;
|
||||
NCCLCHECK(ncclCalloc(&bar, 2));
|
||||
bar[0] = bar[1] = 0;
|
||||
comm->intraBarrier = bar;
|
||||
NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
|
||||
int* CGMode;
|
||||
NCCLCHECK(ncclCalloc(&CGMode, 1));
|
||||
*CGMode = 0x11;
|
||||
comm->intraCGMode = CGMode;
|
||||
int* CC;
|
||||
NCCLCHECK(ncclCalloc(&CC, 1));
|
||||
*CC = ncclCudaCompCap();
|
||||
comm->intraCC = CC;
|
||||
} else {
|
||||
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
|
||||
comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
|
||||
comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads);
|
||||
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
|
||||
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
|
||||
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
|
||||
}
|
||||
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
|
||||
comm->intraThreads[comm->intraRank] = comm->proxyState.thread;
|
||||
NCCLCHECK(initParams(comm));
|
||||
|
||||
int cgMdLaunch = 0;
|
||||
|
||||
// Set CG Mode
|
||||
comm->launchMode = ncclComm::PARALLEL;
|
||||
char* str = getenv("NCCL_LAUNCH_MODE");
|
||||
if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str);
|
||||
if (str && strcmp(str, "GROUP") == 0) {
|
||||
comm->launchMode = ncclComm::GROUP;
|
||||
}
|
||||
if (comm->launchMode == ncclComm::GROUP) {
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
|
||||
#if CUDART_VERSION >= 9000
|
||||
if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) {
|
||||
// Check whether the GPU supports Cooperative Group Multi Device Launch
|
||||
(void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Disable cgMdLaunch if any rank does not support it
|
||||
if (cgMdLaunch == 0) {
|
||||
*comm->intraCGMode = 0x10;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
|
||||
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
|
||||
#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
|
||||
@ -439,7 +481,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -476,11 +518,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
// Topo detection / System graph creation
|
||||
NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
|
||||
// Recompute paths after trimming
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
|
||||
// Init search
|
||||
NCCLCHECK(ncclTopoSearchInit(comm->topo));
|
||||
// Print final topology
|
||||
@ -532,7 +574,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
}
|
||||
|
||||
// Determine local CollNet support before all-gather
|
||||
if (collNetSupport()) {
|
||||
if (collNetSupport(comm)) {
|
||||
char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
|
||||
if (collNetEnable != NULL) {
|
||||
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
|
||||
@ -564,6 +606,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
} *allGather3Data;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
|
||||
|
||||
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
|
||||
allGather3Data[rank].tree.pattern = treeGraph.pattern;
|
||||
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
|
||||
@ -725,7 +768,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
|
||||
free(rings);
|
||||
@ -735,8 +778,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
|
||||
INFO(NCCL_INIT, "Connected all trees");
|
||||
@ -773,12 +816,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int highestTransportType0, highestTransportType1;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
|
||||
|
||||
@ -816,6 +859,52 @@ collnet_cleanup:
|
||||
// Compute nChannels per peer for p2p
|
||||
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
||||
|
||||
do { // Setup p2p structures in comm->tasks
|
||||
struct ncclTasks* tasks = &comm->tasks;
|
||||
int nRanks = comm->nRanks;
|
||||
int node = comm->node;
|
||||
int nNodes = comm->nNodes;
|
||||
struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
|
||||
int localRank = comm->localRank;
|
||||
tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, nRanks);
|
||||
tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
|
||||
tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
|
||||
int s=0, r=0;
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d <= nNodes/4; d++) {
|
||||
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
int recvNode = (node+nNodes-delta)%nNodes;
|
||||
int sendNode = (node+delta)%nNodes;
|
||||
int steps = comm->maxLocalRanks;
|
||||
for (int step=0; step < steps; step++) {
|
||||
int recvIndex = (localRank-step+steps)%steps;
|
||||
if (recvIndex < nodeRanks[recvNode].localRanks) {
|
||||
tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex];
|
||||
r++;
|
||||
}
|
||||
int sendIndex = (localRank+step)%steps;
|
||||
if (sendIndex < nodeRanks[sendNode].localRanks) {
|
||||
tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex];
|
||||
s++;
|
||||
}
|
||||
}
|
||||
index++;
|
||||
if (index == 1 && deltas[1] == deltas[0]) index++;
|
||||
if (index == 2 && deltas[2] == deltas[0]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[2]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[1]) index++;
|
||||
if (index < 4) {
|
||||
delta = deltas[index];
|
||||
goto sched_delta;
|
||||
}
|
||||
}
|
||||
assert(s == nRanks && r == nRanks);
|
||||
} while (0);
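To make the delta schedule above concrete, here is a worked trace derived from the loop (illustrative): with nNodes=8 the outer loop visits d=0,1,2 and, after the duplicate checks, each node walks the node deltas in the order 0, 4, 1, 7, 3, 5, 2, 6 (the local node first, then the opposite node, then +1/-1, +3/-3 and finally +2/-2), while the inner step loop interleaves the individual local ranks of each remote node.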
|
||||
|
||||
if (ncclParamNvbPreconnect()) {
|
||||
// Connect p2p when using NVB path
|
||||
int nvbNpeers;
|
||||
@ -847,7 +936,7 @@ collnet_cleanup:
|
||||
NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
|
||||
|
||||
// Then to remote ones when using PXN
|
||||
if (ncclPxnDisable() == 0) {
|
||||
if (ncclPxnDisable(comm) == 0) {
|
||||
int nranks;
|
||||
int* pxnPeers;
|
||||
NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
|
||||
@ -868,6 +957,10 @@ collnet_cleanup:
|
||||
if (intraProcRanks == 0) intraProcRank0 = i;
|
||||
if (i == rank) intraProcRank = intraProcRanks;
|
||||
intraProcRanks++;
|
||||
if (intraProcRank0 == rank && rank != i) {
|
||||
comm->peerInfo[i].comm->intraNext = comm->intraNext;
|
||||
comm->intraNext = comm->peerInfo[i].comm;
|
||||
}
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
|
||||
@ -878,9 +971,33 @@ collnet_cleanup:
|
||||
intraProcRank, intraProcRanks, intraProcRank0);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
|
||||
struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
|
||||
assert(intraProcRank==0 ? comm==comm0 : true);
|
||||
comm->intraComm0 = comm0;
|
||||
comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0;
|
||||
comm->intraRank = intraProcRank;
|
||||
comm->intraRanks = intraProcRanks;
|
||||
comm->intraBarrierPhase = 0;
|
||||
comm->intraBarrierCounter = 0;
|
||||
comm->intraBarrierGate = 0;
|
||||
} while(0);
|
||||
|
||||
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
|
||||
char* str = getenv("NCCL_LAUNCH_MODE");
|
||||
enum ncclLaunchMode mode, modeOld;
|
||||
if (str && strcasecmp(str, "GROUP") == 0) {
|
||||
mode = ncclLaunchModeGroup;
|
||||
} else {
|
||||
mode = ncclLaunchModeParallel;
|
||||
}
|
||||
// In theory we could be racing with other communicators not associated with
|
||||
// this one if the user is connecting to multiple ncclUniqueId's concurrently.
|
||||
modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
|
||||
if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
|
||||
INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
|
||||
}
|
||||
}
|
||||
|
||||
/* Local intra-node barrier */
|
||||
NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
||||
|
||||
@ -899,8 +1016,22 @@ affinity_restore:
|
||||
|
||||
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
|
||||
|
||||
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
|
||||
ncclResult_t res;
|
||||
struct ncclCommInitRankAsyncJob {
|
||||
struct ncclAsyncJob base;
|
||||
ncclComm_t* newcomm;
|
||||
int nranks, myrank;
|
||||
ncclUniqueId commId;
|
||||
int cudaDev;
|
||||
};
|
||||
|
||||
static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
|
||||
ncclComm_t* newcomm = job->newcomm;
|
||||
int nranks = job->nranks;
|
||||
ncclUniqueId commId = job->commId; // C++ struct assignment
|
||||
int myrank = job->myrank;
|
||||
int cudaDev = job->cudaDev;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
CUDACHECK(cudaSetDevice(cudaDev));
|
||||
// Set the maximum kernel stack size of all kernels to avoid
|
||||
@ -915,7 +1046,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
||||
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
|
||||
|
||||
TRACE_CALL("ncclCommInitRank(%p,%d,0x%llx,%d,%d)", *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev);
|
||||
return ncclSuccess;
|
||||
cleanup:
|
||||
if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
|
||||
@ -923,6 +1054,12 @@ cleanup:
|
||||
return res;
|
||||
}
|
||||
|
||||
static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
|
||||
ncclCommDestroy(*job->newcomm);
|
||||
*job->newcomm = nullptr;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
|
||||
ncclResult_t res;
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
@ -944,20 +1081,26 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (ncclAsyncMode()) {
|
||||
NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end);
|
||||
}
|
||||
struct ncclCommInitRankAsyncJob *job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, end);
|
||||
job->newcomm = newcomm;
|
||||
job->nranks = nranks;
|
||||
job->commId = commId; // C++ struct assignment
|
||||
job->myrank = myrank;
|
||||
job->cudaDev = cudaDev;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, ncclCommInitRankUndo, free), res, end);
|
||||
|
||||
end:
|
||||
if (ncclAsyncMode()) return ncclAsyncErrCheck(res);
|
||||
else return res;
|
||||
return ncclGroupErrCheck(res);
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
|
||||
(void) cudaLibraryInit();
|
||||
|
||||
int cudaDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev));
|
||||
@ -967,6 +1110,10 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
|
||||
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
|
||||
(void) cudaLibraryInit();
|
||||
|
||||
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
|
||||
if (ndev < 0) {
|
||||
WARN("Invalid device count requested : %d", ndev);
|
||||
@ -984,22 +1131,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) {
|
||||
auto res = comm->graphHelperResources;
|
||||
if (comm->graphHelperThread && res) {
|
||||
pthread_mutex_lock(&res->threadLock);
|
||||
res->threadState = ThreadStop;
|
||||
pthread_cond_signal(&res->threadCond);
|
||||
pthread_mutex_unlock(&res->threadLock);
|
||||
pthread_join(comm->graphHelperThread, NULL);
|
||||
}
|
||||
if (res) {
|
||||
free(res);
|
||||
res = NULL;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
// Try and prevent a double free of the comm struct (user error)
|
||||
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||
@ -1017,13 +1148,9 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
|
||||
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError);
|
||||
|
||||
CUDACHECK(cudaStreamSynchronize(comm->groupStream));
|
||||
|
||||
ncclDestroyQueueInfo(comm->enqueueInfo);
|
||||
#if CUDART_VERSION >= 11030
|
||||
NCCLCHECK(ncclGraphHelperDestroy(comm));
|
||||
#endif
|
||||
INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&comm->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&comm->deviceStream));
|
||||
NCCLCHECK(ncclCommPollCallbacks(comm));
|
||||
|
||||
NCCLCHECK(commFree(comm));
|
||||
|
||||
@ -1075,10 +1202,19 @@ const char* ncclGetErrorString(ncclResult_t code) {
|
||||
case ncclInternalError : return "internal error";
|
||||
case ncclInvalidArgument : return "invalid argument";
|
||||
case ncclInvalidUsage : return "invalid usage";
|
||||
case ncclRemoteError : return "remote process exited or there was a network error";
|
||||
default : return "unknown result code";
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns a human-readable message of the last error that occurred.
|
||||
* comm is currently unused and can be set to NULL
|
||||
*/
|
||||
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
|
||||
const char* ncclGetLastError(ncclComm_t comm) {
|
||||
return ncclLastError;
|
||||
}
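A short, hypothetical usage sketch of the new API (buffers, count, comm and stream are assumed to already exist):

ncclResult_t res = ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
if (res != ncclSuccess) {
  // ncclGetErrorString() gives the generic code description; ncclGetLastError()
  // returns the detailed message recorded by the failing path.
  fprintf(stderr, "NCCL error: %s\n  detail: %s\n",
          ncclGetErrorString(res), ncclGetLastError(comm));
}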
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
|
||||
|
@ -44,12 +44,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
|
||||
info->nBytes = info->count * ncclTypeSize(info->datatype);
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
|
||||
info->count = info->nBytes;
|
||||
info->datatype = ncclInt8;
|
||||
}
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
|
||||
NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));
|
||||
|
||||
if (info->op < 0 || ncclMaxRedOp < info->op) {
|
||||
WARN("%s : invalid reduction operation %d", info->opName, info->op);
163  src/misc/cudawrap.cc (new file)
@ -0,0 +1,163 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "debug.h"
|
||||
#include "cudawrap.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName);
|
||||
/* enqueue.cc */
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange);
|
||||
/* proxy.cc */
|
||||
DECLARE_CUDA_PFN(cuCtxCreate_v3020);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent);
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* transport/collNet.cc/net.cc*/
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN(cuInit);
|
||||
DECLARE_CUDA_PFN(cuDriverGetVersion);
|
||||
DECLARE_CUDA_PFN(cuGetProcAddress);
|
||||
|
||||
static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized;
|
||||
|
||||
#define CUDA_DRIVER_MIN_VERSION 11030
|
||||
|
||||
static void *cudaLib;
|
||||
static int cudaDriverVersion;
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/*
|
||||
Load the CUDA symbols
|
||||
*/
|
||||
static int cudaPfnFuncLoader(void) {
|
||||
CUresult res;
|
||||
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \
|
||||
if (res != 0) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
|
||||
LOAD_SYM(cuGetErrorString, 0);
|
||||
LOAD_SYM(cuGetErrorName, 0);
|
||||
LOAD_SYM(cuDeviceGet, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 1);
|
||||
LOAD_SYM(cuCtxCreate_v3020, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 1);
|
||||
#if CUDA_VERSION >= 11070
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
ncclResult_t cudaLibraryInit(void) {
|
||||
CUresult res;
|
||||
|
||||
if (cudaState == cudaInitialized)
|
||||
return ncclSuccess;
|
||||
if (cudaState == cudaError)
|
||||
return ncclSystemError;
|
||||
|
||||
if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (cudaState == cudaInitializing) sched_yield();
|
||||
return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
|
||||
/*
|
||||
* Load CUDA driver library
|
||||
*/
|
||||
char path[1024];
|
||||
char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
|
||||
if (ncclCudaPath == NULL)
|
||||
snprintf(path, 1024, "%s", "libcuda.so");
|
||||
else
|
||||
snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
|
||||
|
||||
cudaLib = dlopen(path, RTLD_LAZY);
|
||||
if (cudaLib == NULL) {
|
||||
WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", ncclCudaPath, ncclCudaPath);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Load initial CUDA functions
|
||||
*/
|
||||
|
||||
pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit");
|
||||
if (pfn_cuInit == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuInit");
|
||||
goto error;
|
||||
}
|
||||
|
||||
pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion");
|
||||
if (pfn_cuDriverGetVersion == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
|
||||
goto error;
|
||||
}
|
||||
|
||||
res = pfn_cuDriverGetVersion(&cudaDriverVersion);
|
||||
if (res != 0) {
|
||||
WARN("cuDriverGetVersion failed with %d", res);
|
||||
goto error;
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion);
|
||||
|
||||
if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) {
|
||||
// WARN("CUDA Driver version found is %d. Minimum requirement is %d", cudaDriverVersion, CUDA_DRIVER_MIN_VERSION);
|
||||
// Silently ignore version check mismatch for backwards compatibility
|
||||
goto error;
|
||||
}
|
||||
|
||||
pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress");
|
||||
if (pfn_cuGetProcAddress == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuGetProcAddress");
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Required to initialize the CUDA Driver.
|
||||
* Multiple calls of cuInit() will return immediately
|
||||
* without making any relevant change
|
||||
*/
|
||||
pfn_cuInit(0);
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (cudaPfnFuncLoader()) {
|
||||
WARN("CUDA some PFN functions not found in the library");
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaState = cudaInitialized;
|
||||
return ncclSuccess;
|
||||
|
||||
error:
|
||||
cudaState = cudaError;
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
|
@ -57,7 +57,7 @@ ncclResult_t wrap_gdr_symbols(void) {
|
||||
|
||||
if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (gdrState == gdrInitializing) pthread_yield();
|
||||
while (gdrState == gdrInitializing) sched_yield();
|
||||
return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,8 @@ struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
|
||||
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
|
||||
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
|
||||
/* DMA-BUF support */
|
||||
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
|
||||
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
|
||||
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
|
||||
@ -49,7 +51,7 @@ ncclResult_t wrap_ibv_symbols(void) {
|
||||
|
||||
if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (ibvState == ibvInitializing) pthread_yield();
|
||||
while (ibvState == ibvInitializing) sched_yield();
|
||||
return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
|
||||
@ -98,6 +100,8 @@ ncclResult_t wrap_ibv_symbols(void) {
|
||||
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
|
||||
// Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
|
||||
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
|
||||
// Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
|
||||
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
|
||||
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
|
||||
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
|
||||
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
|
||||
@ -126,6 +130,7 @@ teardown:
|
||||
ibv_internal_dealloc_pd = NULL;
|
||||
ibv_internal_reg_mr = NULL;
|
||||
ibv_internal_reg_mr_iova2 = NULL;
|
||||
ibv_internal_reg_dmabuf_mr = NULL;
|
||||
ibv_internal_dereg_mr = NULL;
|
||||
ibv_internal_create_cq = NULL;
|
||||
ibv_internal_destroy_cq = NULL;
|
||||
@ -259,7 +264,7 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
|
||||
IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
|
||||
}
|
||||
|
||||
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
|
||||
@ -275,7 +280,19 @@ ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (ret == NULL) { return ncclSuccess; } // Assume dummy call
|
||||
IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
|
||||
}
|
||||
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
|
||||
}
|
||||
|
||||
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
||||
if (ibv_internal_reg_dmabuf_mr == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
||||
|
@ -332,9 +332,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/* make the new listen socket non-blocking only when asyncFlag is set */
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
if (sock->asyncFlag) {
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
|
||||
// addr port should be 0 (Any port)
|
||||
SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
|
||||
@ -373,7 +374,7 @@ static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
|
||||
SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
|
||||
}
|
||||
|
||||
if (ret == EINPROGRESS)
|
||||
if (ret == EINPROGRESS || ret == ECONNREFUSED)
|
||||
*state = ncclSocketConnecting;
|
||||
else if (ret == 0)
|
||||
*state = ncclSocketConnected;
|
||||
@ -409,10 +410,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
|
||||
|
||||
const int one = 1;
|
||||
SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
||||
|
||||
|
||||
/* support non-blocking sockets; the socket is made non-blocking only when asyncFlag is set */
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
if (sock->asyncFlag) {
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
|
||||
/* const int bufsize = 128*1024;
|
||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
|
||||
@ -424,31 +427,26 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
|
||||
int timedout_retries = 0;
|
||||
int refused_retries = 0;
|
||||
retry:
|
||||
/* async connect; abort when error happens and abortFlag is present. */
|
||||
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
||||
ret = connect(fd, &sock->addr.sa, salen);
|
||||
|
||||
if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
||||
if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) {
|
||||
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
} else if (errno == EINPROGRESS && !sock->asyncFlag) {
|
||||
enum ncclSocketState state;
|
||||
do {
|
||||
if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
|
||||
NCCLCHECK(getFdState(fd, &state));
|
||||
} while (state == ncclSocketConnecting);
|
||||
EQCHECK(state, ncclSocketError);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) {
|
||||
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
|
||||
* However, it can return EISCONN instead of success which indicates connection is built up in
|
||||
* background already. No need to call connect() again. */
|
||||
if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) {
|
||||
sock->fd = fd;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
|
||||
@ -501,7 +499,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void*
|
||||
if (bytes == -1) {
|
||||
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
|
||||
WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
} else {
|
||||
bytes = 0;
|
||||
}
|
||||
@ -521,7 +519,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
|
||||
if (closed) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
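
Note (illustrative, not part of the commit): with this change O_NONBLOCK is only applied when sock->asyncFlag is set, and an in-progress or refused asynchronous connect is returned to the caller instead of being retried inside ncclSocketConnect. A rough sketch, where peerAddr is a placeholder address and completion is assumed to be driven by the caller afterwards:

  // Sketch: request a non-blocking socket; field names follow this diff.
  ncclResult_t connectAsync(union ncclSocketAddress* peerAddr /* placeholder */) {
    struct ncclSocket sock;
    sock.abortFlag = nullptr;
    sock.asyncFlag = 1;   // 0 keeps the old blocking behavior with internal retries
    memcpy(&sock.addr, peerAddr, sizeof(union ncclSocketAddress));
    // With asyncFlag set, connect() may still be in progress on return; the caller
    // is expected to progress the socket (e.g. via ncclSocketProgress) until usable.
    NCCLCHECK(ncclSocketConnect(&sock));
    return ncclSuccess;
  }
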
272
src/misc/strongstream.cc
Normal file
@ -0,0 +1,272 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "strongstream.h"
|
||||
#include "checks.h"
|
||||
#include "param.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ncclResult_t ncclCudaGetCapturingGraph(
|
||||
struct ncclCudaGraph* graph, cudaStream_t stream
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
thread_local int driver = -1;
|
||||
if (driver == -1) {
|
||||
CUDACHECK(cudaDriverGetVersion(&driver));
|
||||
}
|
||||
if (driver < 11030) {
|
||||
cudaStreamCaptureStatus status;
|
||||
unsigned long long gid;
|
||||
graph->graph = nullptr;
|
||||
CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid));
|
||||
if (status != cudaStreamCaptureStatusNone) {
|
||||
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
} else {
|
||||
cudaStreamCaptureStatus status;
|
||||
unsigned long long gid;
|
||||
CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr));
|
||||
if (status != cudaStreamCaptureStatusActive) {
|
||||
graph->graph = nullptr;
|
||||
gid = ULLONG_MAX;
|
||||
}
|
||||
graph->graphId = gid;
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
cudaUserObject_t object;
|
||||
CUDACHECK(cudaUserObjectCreate(
|
||||
&object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync
|
||||
));
|
||||
// Hand over ownership to CUDA Graph
|
||||
CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove));
|
||||
return ncclSuccess;
|
||||
#else
|
||||
return ncclInvalidUsage;
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) {
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&ss->stream, cudaStreamNonBlocking));
|
||||
CUDACHECK(cudaEventCreateWithFlags(&ss->event, cudaEventDisableTiming));
|
||||
#if CUDART_VERSION >= 11030
|
||||
ss->node = nullptr;
|
||||
ss->graphId = (1ull<<(8*sizeof(long long)-1))-1;
|
||||
ss->eventIsLagging = 0;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUDACHECK(cudaEventDestroy(ss->event));
|
||||
#endif
|
||||
CUDACHECK(cudaStreamDestroy(ss->stream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1)
|
||||
|
||||
ncclResult_t ncclStrongStreamAcquire(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
bool mixing = ncclParamGraphMixingSupport();
|
||||
if (graph.graph == nullptr) {
|
||||
if (mixing && ncclStrongStreamEverCaptured(ss)) {
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
|
||||
ss->eventIsLagging = 0;
|
||||
}
|
||||
} else {
|
||||
if (ss->graphId != graph.graphId) {
|
||||
if (mixing && ss->eventIsLagging) {
|
||||
// Can only be here if previous release was for uncaptured work that
|
||||
// elided updating the event because no capture had yet occurred.
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
|
||||
CUDACHECK(cudaEventRecord(ss->event, ss->stream));
|
||||
}
|
||||
ss->graphId = graph.graphId;
|
||||
ss->eventIsLagging = 0;
|
||||
if (mixing) {
|
||||
CUDACHECK(cudaGraphAddEventWaitNode(&ss->node, graph.graph, nullptr, 0, ss->event));
|
||||
} else {
|
||||
CUDACHECK(cudaGraphAddEmptyNode(&ss->node, graph.graph, nullptr, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
bool mixing = ncclParamGraphMixingSupport();
|
||||
if (mixing && ncclStrongStreamEverCaptured(ss)) {
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
|
||||
}
|
||||
ss->eventIsLagging = 1; // Assume the caller is going to add work to stream.
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
bool mixing = ncclParamGraphMixingSupport();
|
||||
if (mixing && ss->eventIsLagging) {
|
||||
if (graph.graph == nullptr) {
|
||||
if (ncclStrongStreamEverCaptured(ss)) {
|
||||
CUDACHECK(cudaEventRecord(ss->event, ss->stream));
|
||||
ss->eventIsLagging = 0;
|
||||
}
|
||||
} else {
|
||||
CUDACHECK(cudaGraphAddEventRecordNode(&ss->node, graph.graph, &ss->node, 1, ss->event));
|
||||
ss->eventIsLagging = 0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamLaunchHost(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (graph.graph == nullptr) {
|
||||
CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg));
|
||||
} else {
|
||||
cudaHostNodeParams p;
|
||||
p.fn = fn;
|
||||
p.userData = arg;
|
||||
CUDACHECK(cudaGraphAddHostNode(&ss->node, graph.graph, &ss->node, 1, &p));
|
||||
}
|
||||
ss->eventIsLagging = 1;
|
||||
#else
|
||||
CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg));
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamLaunchKernel(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
|
||||
void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (graph.graph == nullptr) {
|
||||
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
|
||||
} else {
|
||||
cudaGraphNode_t tip = ss->node;
|
||||
cudaKernelNodeParams p;
|
||||
p.func = fn;
|
||||
p.gridDim = grid;
|
||||
p.blockDim = block;
|
||||
p.kernelParams = args;
|
||||
p.sharedMemBytes = sharedMemBytes;
|
||||
p.extra = nullptr;
|
||||
CUDACHECK(cudaGraphAddKernelNode(&ss->node, graph.graph, &tip, 1, &p));
|
||||
}
|
||||
ss->eventIsLagging = 1;
|
||||
#else
|
||||
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (graph.graph == nullptr) {
|
||||
if (b->eventIsLagging) {
|
||||
b->eventIsLagging = 0;
|
||||
CUDACHECK(cudaEventRecord(b->event, b->stream));
|
||||
}
|
||||
CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
|
||||
a->eventIsLagging = 1;
|
||||
} else {
|
||||
cudaGraphNode_t pair[2] = {a->node, b->node};
|
||||
CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
|
||||
}
|
||||
#else
|
||||
CUDACHECK(cudaEventRecord(b->event, b->stream));
|
||||
CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (graph.graph == nullptr) {
|
||||
CUDACHECK(cudaEventRecord(a->event, b));
|
||||
CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0));
|
||||
// We used a->event to record b so it no longer reflects anything about a.
|
||||
a->eventIsLagging = 1;
|
||||
} else {
|
||||
cudaStreamCaptureStatus status;
|
||||
unsigned long long gid1;
|
||||
cudaGraphNode_t const* deps;
|
||||
size_t depN = 0;
|
||||
CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &gid1, nullptr, &deps, &depN));
|
||||
if (status != cudaStreamCaptureStatusActive || graph.graphId != gid1) {
|
||||
WARN("Stream is not being captured by the expected graph.");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
if (depN > 0 && (depN > 1 || deps[0] != a->node)) {
|
||||
cudaGraphNode_t tie;
|
||||
if (depN == 1) {
|
||||
tie = deps[0];
|
||||
} else {
|
||||
CUDACHECK(cudaGraphAddEmptyNode(&tie, graph.graph, deps, depN));
|
||||
}
|
||||
cudaGraphNode_t pair[2] = {a->node, tie};
|
||||
CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
|
||||
}
|
||||
// a->eventIsLagging doesn't change since we are just updating the
|
||||
// dependencies of a->node.
|
||||
}
|
||||
#else
|
||||
CUDACHECK(cudaEventRecord(a->event, b));
|
||||
CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0));
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
|
||||
) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (graph.graph == nullptr) {
|
||||
if (b->eventIsLagging) {
|
||||
b->eventIsLagging = 0;
|
||||
CUDACHECK(cudaEventRecord(b->event, b->stream));
|
||||
}
|
||||
CUDACHECK(cudaStreamWaitEvent(a, b->event, 0));
|
||||
} else {
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies(a, &b->node, 1, cudaStreamAddCaptureDependencies));
|
||||
}
|
||||
#else
|
||||
CUDACHECK(cudaEventRecord(b->event, b->stream));
|
||||
CUDACHECK(cudaStreamWaitEvent(a, b->event, 0));
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
|
||||
#endif
|
||||
CUDACHECK(cudaStreamSynchronize(ss->stream));
|
||||
return ncclSuccess;
|
||||
}
|
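
Note (illustrative, not part of the commit): the intent of this new file is to let NCCL order its internal work against a user stream whether or not that stream is being captured into a CUDA graph. A minimal usage sketch built only from the functions defined above; enqueueInternalWork is a hypothetical caller:

  // Sketch: run one host callback on NCCL's internal strong stream, ordered
  // after the user's stream, for both captured and non-captured cases.
  ncclResult_t enqueueInternalWork(struct ncclStrongStream* ss, cudaStream_t userStream,
                                   cudaHostFn_t fn, void* arg) {
    struct ncclCudaGraph graph;
    NCCLCHECK(ncclCudaGetCapturingGraph(&graph, userStream)); // graph.graph == nullptr when not capturing
    NCCLCHECK(ncclStrongStreamAcquire(graph, ss));
    NCCLCHECK(ncclStrongStreamWaitStream(graph, ss, userStream)); // order after prior user work
    NCCLCHECK(ncclStrongStreamLaunchHost(graph, ss, fn, arg));
    NCCLCHECK(ncclStrongStreamWaitStream(graph, userStream, ss)); // hand ordering back to the user stream
    NCCLCHECK(ncclStrongStreamRelease(graph, ss));
    return ncclSuccess;
  }
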
@ -9,6 +9,8 @@
|
||||
|
||||
#include "nvmlwrap.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Get current Compute Capability
|
||||
int ncclCudaCompCap() {
|
||||
int cudaDev;
|
||||
@ -190,3 +192,102 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
|
||||
|
||||
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
|
||||
// `me->hunks` points to the top of the stack non-empty hunks. Hunks above
|
||||
// this (reachable via `->above`) are empty.
|
||||
struct Hunk* top = me->topFrame.hunk;
|
||||
size_t mallocSize = 0;
|
||||
|
||||
// If we have lots of space left in hunk but that wasn't enough then we'll
|
||||
// allocate the object unhunked.
|
||||
if (me->topFrame.end - me->topFrame.bumper >= 8<<10)
|
||||
goto unhunked;
|
||||
|
||||
// If we have another hunk (which must be empty) waiting above this one and
|
||||
// the object fits then use that.
|
||||
if (top && top->above) {
|
||||
struct Hunk* top1 = top->above;
|
||||
uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align);
|
||||
if (uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
|
||||
me->topFrame.hunk = top1;
|
||||
me->topFrame.bumper = uobj + size;
|
||||
me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
|
||||
return reinterpret_cast<void*>(uobj);
|
||||
}
|
||||
}
|
||||
|
||||
{ // If the next hunk we're going to allocate wouldn't be big enough but the
|
||||
// Unhunk proxy fits in the current hunk then go allocate as unhunked.
|
||||
size_t nextSize = (top ? top->size : 0) + (64<<10);
|
||||
constexpr size_t maxAlign = 64;
|
||||
if (nextSize < sizeof(struct Hunk) + maxAlign + size) {
|
||||
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
|
||||
if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
|
||||
goto unhunked;
|
||||
}
|
||||
|
||||
// At this point we must need another hunk, either to fit the object
|
||||
// itself or its Unhunk proxy.
|
||||
mallocSize = nextSize;
|
||||
INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
struct Hunk *top1 = (struct Hunk*)malloc(mallocSize);
|
||||
if (top1 == nullptr) goto malloc_exhausted;
|
||||
top1->size = nextSize;
|
||||
top1->above = nullptr;
|
||||
if (top) top->above = top1;
|
||||
top = top1;
|
||||
me->topFrame.hunk = top;
|
||||
me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
|
||||
me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
|
||||
}
|
||||
|
||||
{ // Try to fit object in the new top hunk.
|
||||
uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align);
|
||||
if (uobj + size <= me->topFrame.end) {
|
||||
me->topFrame.bumper = uobj + size;
|
||||
return reinterpret_cast<void*>(uobj);
|
||||
}
|
||||
}
|
||||
|
||||
unhunked:
|
||||
{ // We need to allocate the object out-of-band and put an Unhunk proxy in-band
|
||||
// to keep track of it.
|
||||
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
|
||||
Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
|
||||
me->topFrame.bumper = uproxy + sizeof(Unhunk);
|
||||
proxy->next = me->topFrame.unhunks;
|
||||
me->topFrame.unhunks = proxy;
|
||||
mallocSize = size;
|
||||
proxy->obj = malloc(mallocSize);
|
||||
INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
if (proxy->obj == nullptr) goto malloc_exhausted;
|
||||
return proxy->obj;
|
||||
}
|
||||
|
||||
malloc_exhausted:
|
||||
WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
abort();
|
||||
}
|
||||
|
||||
void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
|
||||
// Free unhunks first because both the frames and unhunk proxies lie within the hunks.
|
||||
struct ncclMemoryStack::Frame* f = &me->topFrame;
|
||||
while (f != nullptr) {
|
||||
struct ncclMemoryStack::Unhunk* u = f->unhunks;
|
||||
while (u != nullptr) {
|
||||
free(u->obj);
|
||||
u = u->next;
|
||||
}
|
||||
f = f->below;
|
||||
}
|
||||
// Free hunks
|
||||
struct ncclMemoryStack::Hunk* h = me->stub.above;
|
||||
while (h != nullptr) {
|
||||
struct ncclMemoryStack::Hunk *h1 = h->above;
|
||||
free(h);
|
||||
h = h1;
|
||||
}
|
||||
}
|
||||
|
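
Note (illustrative, not part of the commit): the spill path above repeatedly applies the same alignment bump, rounding the frame bumper up to a power-of-two alignment and checking the result against the end of the current hunk. A standalone sketch of that arithmetic; bumpAlloc is a name chosen here for illustration:

  #include <cstdint>
  #include <cstddef>

  // Sketch: bump-allocate `size` bytes aligned to `align` (a power of two) from
  // [*bumper, end). Returning nullptr corresponds to the case where
  // allocateSpilled() must move to another hunk or fall back to an Unhunk malloc.
  static void* bumpAlloc(uintptr_t* bumper, uintptr_t end, size_t size, size_t align) {
    uintptr_t uobj = (*bumper + align - 1) & -uintptr_t(align); // round up to alignment
    if (uobj + size > end) return nullptr;                      // does not fit in this hunk
    *bumper = uobj + size;
    return reinterpret_cast<void*>(uobj);
  }
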
@ -38,7 +38,8 @@ typedef enum { ncclSuccess = 0,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclNumResults = 6 } ncclResult_t;
ncclRemoteError = 6,
ncclNumResults = 7 } ncclResult_t;

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
@ -81,10 +82,16 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm);
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Returns a human-readable error message. */
/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Returns a human-readable message of the last error that occurred.
 * comm is currently unused and can be set to NULL
 */
const char* ncclGetLastError(ncclComm_t comm);
const char* pncclGetError(ncclComm_t comm);

/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
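
Note (illustrative, not part of the commit): a typical way an application could combine the new ncclRemoteError code, ncclGetLastError() and the existing async-error query; the error-handling policy shown is a placeholder:

  #include <stdio.h>
  #include "nccl.h"

  // Sketch: report asynchronous failures using the 2.13 error API.
  void checkComm(ncclComm_t comm) {
    ncclResult_t asyncErr;
    if (ncclCommGetAsyncError(comm, &asyncErr) == ncclSuccess && asyncErr != ncclSuccess) {
      // ncclRemoteError indicates a failure on a remote peer or in the network.
      fprintf(stderr, "NCCL async error: %s (last error: %s)\n",
              ncclGetErrorString(asyncErr), ncclGetLastError(comm));
      // typical policy: ncclCommAbort(comm) and tear down the job
    }
  }
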
289
src/net.cc
@ -9,15 +9,16 @@
|
||||
//#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
|
||||
ncclNet_t *ncclNet;
|
||||
ncclCollNet_t *ncclCollNet;
|
||||
|
||||
static ncclNet_v5_t ncclNet_v4_as_v5;
|
||||
static ncclNet_v6_t ncclNet_v4_as_v6;
|
||||
static ncclNet_v6_t ncclNet_v5_as_v6;
|
||||
static ncclNet_v4_t *ncclNet_v4;
|
||||
static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
|
||||
static ncclNet_v5_t *ncclNet_v5;
|
||||
static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
|
||||
static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
|
||||
static ncclCollNet_v4_t *ncclCollNet_v4;
|
||||
static ncclCollNet_v5_t *ncclCollNet_v5;
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
|
||||
ncclNetProperties_v4_t p4;
|
||||
ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@ -33,17 +34,17 @@ static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
||||
return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
||||
if (n == 0) return ncclSuccess;
|
||||
if (n != 1) return ncclInvalidArgument;
|
||||
return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
||||
if (n == 0) return ncclSuccess;
|
||||
if (n != 1) return ncclInvalidArgument;
|
||||
return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
|
||||
@ -51,27 +52,51 @@ static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data,
|
||||
|
||||
// We use a wrapper around the v4 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v4->init(logfn));
|
||||
ncclNet_v4_as_v5.name = ncclNet_v4->name;
|
||||
ncclNet_v4_as_v5.devices = ncclNet_v4->devices;
|
||||
ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties;
|
||||
ncclNet_v4_as_v5.listen = ncclNet_v4->listen;
|
||||
ncclNet_v4_as_v5.connect = ncclNet_v4->connect;
|
||||
ncclNet_v4_as_v5.accept = ncclNet_v4->accept;
|
||||
ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr;
|
||||
ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr;
|
||||
ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend;
|
||||
ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv;
|
||||
ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush;
|
||||
ncclNet_v4_as_v5.test = ncclNet_v4->test;
|
||||
ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend;
|
||||
ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv;
|
||||
ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen;
|
||||
ncclNet_v4_as_v6.name = ncclNet_v4->name;
|
||||
ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
|
||||
ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
|
||||
ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
|
||||
ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
|
||||
ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
|
||||
ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
|
||||
ncclNet_v4_as_v6.regMrDmaBuf = NULL;
|
||||
ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
|
||||
ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
|
||||
ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
|
||||
ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
|
||||
ncclNet_v4_as_v6.test = ncclNet_v4->test;
|
||||
ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
|
||||
ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
|
||||
ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v5->init(logfn));
|
||||
ncclNet_v5_as_v6.name = ncclNet_v5->name;
|
||||
ncclNet_v5_as_v6.devices = ncclNet_v5->devices;
|
||||
ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties;
|
||||
ncclNet_v5_as_v6.listen = ncclNet_v5->listen;
|
||||
ncclNet_v5_as_v6.connect = ncclNet_v5->connect;
|
||||
ncclNet_v5_as_v6.accept = ncclNet_v5->accept;
|
||||
ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr;
|
||||
ncclNet_v5_as_v6.regMrDmaBuf = NULL;
|
||||
ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr;
|
||||
ncclNet_v5_as_v6.isend = ncclNet_v5->isend;
|
||||
ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv;
|
||||
ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush;
|
||||
ncclNet_v5_as_v6.test = ncclNet_v5->test;
|
||||
ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend;
|
||||
ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv;
|
||||
ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
|
||||
ncclNetProperties_v4_t p4;
|
||||
ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@ -89,25 +114,58 @@ static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetPropertie
|
||||
|
||||
// We use a wrapper around the v4 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v4->init(logfn));
|
||||
ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name;
|
||||
ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices;
|
||||
ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties;
|
||||
ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen;
|
||||
ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect;
|
||||
ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport;
|
||||
ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr;
|
||||
ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr;
|
||||
ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce;
|
||||
ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush;
|
||||
ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test;
|
||||
ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl;
|
||||
ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen;
|
||||
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
|
||||
ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
|
||||
ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
|
||||
ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
|
||||
ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
|
||||
ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
|
||||
ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
|
||||
ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
|
||||
ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
|
||||
ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
|
||||
ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
|
||||
ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
|
||||
ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v5->init(logfn));
|
||||
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
|
||||
ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices;
|
||||
ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties;
|
||||
ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen;
|
||||
ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect;
|
||||
ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport;
|
||||
ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr;
|
||||
ncclCollNet_v5_as_v6.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr;
|
||||
ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce;
|
||||
ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush;
|
||||
ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test;
|
||||
ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl;
|
||||
ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
|
||||
ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr };
|
||||
enum ncclNetState {
|
||||
ncclNetStateInit = 0,
|
||||
ncclNetStateEnabled = 1,
|
||||
ncclNetStateDisabled = 2
|
||||
};
|
||||
enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
|
||||
ncclResult_t ncclNetPluginInit() {
|
||||
char ncclNetPluginName[128];
|
||||
const char* envPluginName = getenv("NCCL_NET_PLUGIN");
|
||||
if (envPluginName && strlen(envPluginName)) {
|
||||
@ -126,67 +184,104 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
|
||||
}
|
||||
return;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
*net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (*net == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
|
||||
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
|
||||
if (ncclNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return;
|
||||
ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
|
||||
if (ncclNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
|
||||
// Try v5 plugin
|
||||
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (ncclNet_v5 == nullptr) {
|
||||
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
|
||||
if (ncclNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclNets[0] = &ncclNet_v4_as_v6;
|
||||
ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v4_as_v6.name = ncclNet_v4->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v5_as_v6;
|
||||
ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v5_as_v6.name = ncclNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
|
||||
}
|
||||
*net = &ncclNet_v4_as_v5;
|
||||
ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
|
||||
}
|
||||
|
||||
// Check for CollNet
|
||||
*collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (*collnet == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
|
||||
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
|
||||
if (ncclCollNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
|
||||
ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
|
||||
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (ncclCollNet_v5 == nullptr) {
|
||||
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
|
||||
if (ncclCollNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v4_as_v6;
|
||||
ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
|
||||
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
*collnet = &ncclCollNet_v4_as_v5;
|
||||
ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
|
||||
ncclCollNets[0] = &ncclCollNet_v5_as_v6;
|
||||
ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
|
||||
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
|
||||
}
|
||||
}
|
||||
return;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetInit() {
|
||||
// Always initialize bootstrap network
|
||||
NCCLCHECK(bootstrapNetInit());
|
||||
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled;
|
||||
else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled;
|
||||
else ncclNetStates[i] = ncclNetStateEnabled;
|
||||
}
|
||||
*state = ncclNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
if (ncclCollNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
else ncclCollNetStates[i] = ncclNetStateEnabled;
|
||||
}
|
||||
*state = ncclCollNetStates[i];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm) {
|
||||
// Initialize main communication network
|
||||
ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
|
||||
ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
|
||||
initPlugin(&nets[0], &collNets[0]);
|
||||
char* netName = getenv("NCCL_NET");
|
||||
bool ok = false;
|
||||
|
||||
for (int i=0; i<3; i++) {
|
||||
if (nets[i] == nullptr) continue;
|
||||
if (netName && strcmp(netName, nets[i]->name) != 0) continue;
|
||||
if (ncclNets[i] == nullptr) continue;
|
||||
enum ncclNetState state;
|
||||
NCCLCHECK(netGetState(i, &state));
|
||||
if (state != ncclNetStateEnabled) continue;
|
||||
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
|
||||
|
||||
// net plugin is already initialized
|
||||
int ndev;
|
||||
if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
|
||||
if (nets[i]->devices(&ndev) != ncclSuccess) continue;
|
||||
if (ndev <= 0) continue;
|
||||
ncclNet = nets[i];
|
||||
comm->ncclNet = ncclNets[i];
|
||||
ok = true;
|
||||
|
||||
if (collNets[i]) {
|
||||
do {
|
||||
if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
|
||||
if (collNets[i]->devices(&ndev) != ncclSuccess) break;
|
||||
if (ndev <= 0) break;
|
||||
ncclCollNet = collNets[i];
|
||||
} while(0);
|
||||
if (ncclCollNets[i]) {
|
||||
NCCLCHECK(collNetGetState(i, &state));
|
||||
if (state == ncclNetStateEnabled) {
|
||||
comm->ncclCollNet = ncclCollNets[i];
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -198,7 +293,7 @@ ncclResult_t ncclNetInit() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
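
Note (illustrative, not part of the commit): ncclNetInit() now records the selected network on the communicator, so different communicators can honor different NCCL_NET settings. A condensed sketch of the selection loop above; pickNet is a name chosen here and mirrors rather than replaces the real code:

  // Sketch: pick the first enabled net whose name matches NCCL_NET (if set),
  // in the built-in priority order { external plugin, IB, socket }.
  ncclResult_t pickNet(struct ncclComm* comm, const char* netName /* getenv("NCCL_NET") */) {
    for (int i = 0; i < 3; i++) {
      if (ncclNets[i] == nullptr) continue;
      enum ncclNetState state;
      NCCLCHECK(netGetState(i, &state));                       // lazily init()s and counts devices
      if (state != ncclNetStateEnabled) continue;
      if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
      comm->ncclNet = ncclNets[i];                             // per-communicator choice
      return ncclSuccess;
    }
    return ncclInvalidUsage;                                   // no usable network (illustrative)
  }
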
|
||||
ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
|
||||
constexpr int GPU_BUF_SIZE = 2*1024*1024;
|
||||
#if CUDART_VERSION >= 11030
|
||||
// In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
|
||||
@ -213,12 +308,12 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
}
|
||||
#endif
|
||||
int netDevs;
|
||||
NCCLCHECK(ncclNetDevices(&netDevs));
|
||||
NCCLCHECK(ncclNetDevices(comm, &netDevs));
|
||||
*gdrSupport = 0;
|
||||
for (int dev=0; dev<netDevs; dev++) {
|
||||
// Find a net device which is GDR-capable
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(dev, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
|
||||
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
|
||||
|
||||
// Allocate memory on the GPU and try to register it on the NIC.
|
||||
@ -228,34 +323,34 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
void* mHandle = NULL;
|
||||
ncclResult_t ret;
|
||||
ncclDebugNoWarn = NCCL_NET;
|
||||
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
|
||||
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
|
||||
while (sComm == NULL) {
|
||||
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
|
||||
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
|
||||
}
|
||||
while (rComm == NULL) {
|
||||
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
|
||||
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
|
||||
}
|
||||
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
|
||||
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
|
||||
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
|
||||
NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
|
||||
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
|
||||
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
|
||||
NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
|
||||
*gdrSupport = 1;
|
||||
}
|
||||
ncclDebugNoWarn = 0;
|
||||
CUDACHECK(cudaFree(gpuPtr));
|
||||
cleanup4:
|
||||
NCCLCHECK(ncclNetCloseRecv(rComm));
|
||||
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
|
||||
cleanup3:
|
||||
NCCLCHECK(ncclNetCloseSend(sComm));
|
||||
NCCLCHECK(ncclNetCloseSend(comm, sComm));
|
||||
cleanup2:
|
||||
NCCLCHECK(ncclNetCloseListen(lComm));
|
||||
NCCLCHECK(ncclNetCloseListen(comm, lComm));
|
||||
cleanup1:
|
||||
break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int ncclNetVersion() {
|
||||
return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5;
|
||||
int ncclNetVersion(struct ncclComm* comm) {
|
||||
return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);
|
||||
}
|
||||
|
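
Note (illustrative, not part of the commit): since ncclNetVersion() now distinguishes v4/v5/v6 and the v4/v5 shims leave regMrDmaBuf NULL, a caller can gate DMA-BUF registration roughly as follows; netSupportsDmaBuf is a hypothetical helper:

  // Sketch: decide whether DMA-BUF registration can even be attempted for this
  // communicator's network. The surrounding policy is illustrative only.
  static bool netSupportsDmaBuf(struct ncclComm* comm) {
    if (ncclNetVersion(comm) < 6) return false;        // v4/v5 plugins have no DMA-BUF entry point
    return comm->ncclNet->regMrDmaBuf != NULL;         // v6 plugins may still leave it NULL
  }
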
163
src/proxy.cc
@ -13,6 +13,8 @@
|
||||
#define ENABLE_TIMER 0
|
||||
#include "timer.h"
|
||||
|
||||
#include <sys/syscall.h>
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
@ -349,10 +351,10 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
|
||||
static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
|
||||
if (peer < 0) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peerComm = channel->peers+peer;
|
||||
struct ncclChannelPeer* peerComm = channel->peers+peer;
|
||||
struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
|
||||
if (connector->transportComm == NULL) {
|
||||
WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
|
||||
@ -361,35 +363,62 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s
|
||||
}
|
||||
if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
|
||||
|
||||
NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
|
||||
if (justInquire) *justInquire = true;
|
||||
else {
|
||||
NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) {
|
||||
struct ncclChannel* channel = comm->channels+op->channelId;
|
||||
int pattern = op->pattern;
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0));
|
||||
if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0));
|
||||
}
|
||||
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree up
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0));
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
|
||||
}
|
||||
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree down
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0));
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
|
||||
}
|
||||
if (pattern == ncclPatternCollTreeUpDown) {
|
||||
// CollTree up
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push
|
||||
// CollTree down
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
|
||||
// justInquire != nullptr means don't actually do anything, just ascertain the need of
// ncclProxySaveOp for this op.
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) {
|
||||
struct ncclChannel* channel = &comm->channels[op->channelId];
|
||||
if (justInquire) *justInquire = false;
|
||||
switch (op->pattern) {
|
||||
case ncclPatternRing:
|
||||
case ncclPatternRingTwice:
|
||||
case ncclPatternPipelineFrom:
|
||||
case ncclPatternPipelineTo: {
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0, justInquire));
|
||||
}
|
||||
if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0, justInquire));
|
||||
}
|
||||
} break;
|
||||
case ncclPatternTreeUp:
|
||||
case ncclPatternTreeDown:
|
||||
case ncclPatternTreeUpDown: {
|
||||
if (op->pattern != ncclPatternTreeDown) { // Tree up
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) {
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0, justInquire));
|
||||
}
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire));
|
||||
}
|
||||
if (op->pattern != ncclPatternTreeUp) { // Tree down
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) {
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire));
|
||||
}
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire));
|
||||
}
|
||||
} break;
|
||||
case ncclPatternCollTreeUpDown: {
|
||||
// CollTree up
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push
|
||||
// CollTree down
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire));
|
||||
} break;
|
||||
case ncclPatternSend:
|
||||
case ncclPatternRecv: {
|
||||
if (op->root == comm->rank) return ncclSuccess;
|
||||
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, 1, justInquire));
|
||||
} break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
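
Note (illustrative, not part of the commit): the justInquire parameter makes ncclProxySaveOp usable as a dry-run query. A small sketch; maybeSaveProxyOp is a hypothetical caller:

  // Sketch: first ask whether the op would need any proxy involvement without
  // queueing anything, then append it for real only if needed.
  ncclResult_t maybeSaveProxyOp(struct ncclComm* comm, struct ncclProxyOp* op) {
    bool needsProxy = false;
    NCCLCHECK(ncclProxySaveOp(comm, op, &needsProxy));              // inquire only: nothing is appended
    if (needsProxy) NCCLCHECK(ncclProxySaveOp(comm, op, nullptr));  // nullptr means "actually append"
    return ncclSuccess;
  }
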
@ -406,22 +435,23 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
op->protocol = NCCL_PROTO_SIMPLE;
|
||||
op->dtype = info->datatype;
|
||||
|
||||
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
|
||||
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR;
|
||||
info->chunkSize = stepSize;
|
||||
op->root = info->root;
|
||||
op->nbytes = info->count;
|
||||
struct ncclPeer* peer = channel->peers + op->root;
|
||||
struct ncclChannelPeer* peer = channel->peers + op->root;
|
||||
|
||||
if (info->coll == ncclFuncSend) {
|
||||
op->pattern = ncclPatternSend;
|
||||
if (op->root != info->comm->rank && peer->send[1].transportComm && peer->send[1].transportComm->proxyProgress) {
|
||||
if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
}
|
||||
} else if (info->coll == ncclFuncRecv) {
|
||||
op->pattern = ncclPatternRecv;
|
||||
if (op->root != info->comm->rank && peer->recv[1].transportComm && peer->recv[1].transportComm->proxyProgress) {
|
||||
if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
@ -437,22 +467,6 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) {
|
||||
struct ncclChannel* channel = comm->channels+op->channelId;
|
||||
op->opCount = channel->workFifoTail-1;
|
||||
if (op->root == comm->rank) return ncclSuccess;
|
||||
if (op->pattern == ncclPatternRecv) {
|
||||
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, 1));
|
||||
} else if (op->pattern == ncclPatternSend) {
|
||||
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, 1));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
|
||||
struct ncclProxyArgs* freeOp = *opPtr;
|
||||
struct ncclProxyArgs* next = freeOp->next;
|
||||
@ -594,8 +608,48 @@ void ncclDumpProxyState(int signal) {
|
||||
dumpProxyState(ncclLastProxyState);
|
||||
}
|
||||
|
||||
NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);
|
||||
ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
static int createThreadContext = -1;
|
||||
|
||||
if (createThreadContext == -1) {
|
||||
createThreadContext = ncclParamCreateThreadContext();
|
||||
if (createThreadContext) {
|
||||
if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
|
||||
WARN("Unable to create thread context due to old driver, disabling.");
|
||||
createThreadContext = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (createThreadContext) {
|
||||
if (comm->proxyState.cudaCtx == NULL) {
|
||||
if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx,
|
||||
CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to create CUDA context on device %d", comm->cudaDev);
|
||||
createThreadContext = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
} else {
|
||||
if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to set CUDA context on device %d", comm->cudaDev);
|
||||
return ncclUnhandledCudaError;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* ncclProxyProgress(void *comm_) {
|
||||
struct ncclComm* comm = (struct ncclComm*)comm_;
|
||||
if (ncclSetThreadContext(comm) != ncclSuccess) {
|
||||
WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev);
|
||||
} else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
|
||||
WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev);
|
||||
}
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
|
||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||
state->nextOps = -1;
|
||||
signal(SIGUSR1, ncclDumpProxyState);
|
||||
@ -728,9 +782,9 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool,
|
||||
|
||||
static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
if (connection->send) {
|
||||
NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm));
|
||||
NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
|
||||
} else {
|
||||
NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm));
|
||||
NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -774,7 +828,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
|
||||
struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
|
||||
struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv;
|
||||
// If we need proxy progress, map progress ops
|
||||
if (tcomm->proxyProgress) {
|
||||
char poolPath[] = "/dev/shm/nccl-XXXXXX";
|
||||
@ -881,7 +935,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
|
||||
connection->localRank = peer->localRank;
|
||||
NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
|
||||
connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
|
||||
connection->tcomm = connection->send ? &ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv;
|
||||
// If we need proxy progress, let's allocate ops and start the thread
|
||||
if (connection->tcomm->proxyProgress) {
|
||||
NCCLCHECK(proxyProgressInit(comm));
|
||||
@ -947,7 +1001,10 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
|
||||
|
||||
void* ncclProxyService(void* _args) {
|
||||
struct ncclComm* comm = (struct ncclComm *) _args;
|
||||
if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
if (ncclSetThreadContext(comm) != ncclSuccess) {
|
||||
WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev);
|
||||
} else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
|
||||
WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
|
||||
}
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
|
@ -10,16 +10,11 @@
|
||||
#define ENABLE_TIMER 0
|
||||
#include "timer.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
extern struct ncclTransport netTransport;
|
||||
extern struct ncclTransport collNetTransport;
|
||||
|
||||
struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
p2pTransport,
|
||||
shmTransport,
|
||||
netTransport,
|
||||
collNetTransport
|
||||
struct ncclTransport* ncclTransports[NTRANSPORTS] = {
|
||||
&p2pTransport,
|
||||
&shmTransport,
|
||||
&netTransport,
|
||||
&collNetTransport
|
||||
};
|
||||
|
||||
template <int type>
|
||||
@ -29,7 +24,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
|
||||
comm->channels[channelId].peers[peer].recv + connIndex;
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransport *transport = ncclTransports[t];
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
||||
@ -44,9 +39,10 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
||||
uint32_t mask = 1 << channel->id;
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
uint32_t mask = 1 << channelId;
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
|
||||
@ -71,9 +67,10 @@ void dumpData(struct ncclConnect* data, int ndata) {
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
||||
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
cudaStream_t transportSetupStream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&transportSetupStream, cudaStreamNonBlocking));
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
struct ncclConnect data[2*MAXCHANNELS];
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
@ -126,7 +123,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
|
||||
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
TIME_STOP(3);
|
||||
@ -136,7 +133,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
|
||||
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
TIME_STOP(4);
|
||||
@ -168,10 +165,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
// check if we can connect to collnet, whose root is the nranks-th rank
|
||||
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
|
||||
peerInfo->rank = nranks;
|
||||
int support = 1;
|
||||
if (isMaster) {
|
||||
NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
|
||||
}
|
||||
|
||||
// send master receives connect info from peer recv master
|
||||
if (isMaster && type == collNetSend) {
|
||||
@ -181,14 +174,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
}
|
||||
|
||||
// select
|
||||
struct ncclPeer* root = channel->peers+nranks;
|
||||
struct ncclChannelPeer* root = channel->peers+nranks;
|
||||
// connector index: 0 for recv, 1 for send
|
||||
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
|
||||
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
|
||||
conn->transportComm = transportComm;
|
||||
// setup
|
||||
struct ncclConnect myConnect;
|
||||
if (isMaster && support) {
|
||||
if (isMaster) {
|
||||
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
|
||||
}
|
||||
// prepare connect handles
|
||||
@ -218,11 +211,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
||||
}
|
||||
// connect
|
||||
if (isMaster && support) {
|
||||
if (isMaster) {
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
struct ncclPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
||||
CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup);
|
||||
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
||||
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
|
||||
}
|
||||
// recv side sends connect info to send side
|
||||
if (isMaster && type == collNetRecv) {
|
||||
@ -231,7 +224,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
|
||||
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
||||
}
|
||||
if (support) fail = 0;
|
||||
fail = 0;
|
||||
cleanup:
|
||||
if (allConnects != NULL) free(allConnects);
|
||||
if (masterConnects != NULL) free(masterConnects);
|
||||
@ -260,7 +253,7 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
|
||||
// Free collNet resources
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
struct ncclPeer* peer = channel->peers+comm->nRanks;
|
||||
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* send = peer->send + b;
|
||||
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
|
||||
|
@ -128,9 +128,9 @@ struct recvResources {
|
||||
int collNetRank;
|
||||
};
|
||||
|
||||
/* Determine if we can communicate with the peer */
|
||||
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 1;
|
||||
// This transport cannot be used for p2p
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -154,7 +154,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -172,7 +172,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -297,7 +297,7 @@ ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle
|
||||
comm->proxyState.progressState.collNet.resources = resources;
|
||||
}
|
||||
if (resources->collNetComms[netDev] == NULL)
|
||||
NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
|
||||
NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -311,13 +311,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
|
||||
handlePtrs[i] = &(info->collNetHandle);
|
||||
}
|
||||
ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank,
|
||||
ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
|
||||
resources->collNetListenComms[netDev],
|
||||
resources->collNetComms+netDev);
|
||||
free(handlePtrs);
|
||||
if (ret == ncclSuccess) {
|
||||
// Close listen comm
|
||||
NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
|
||||
NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
|
||||
} else {
|
||||
resources->collNetListenComms[netDev] = NULL;
|
||||
}
|
||||
@ -331,7 +331,7 @@ static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
|
||||
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
|
||||
resources->commRefCount[netDev]--;
|
||||
if (resources->commRefCount[netDev] == 0) {
|
||||
NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
|
||||
NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
|
||||
}
|
||||
for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
|
||||
comm->proxyState.progressState.collNet.resources = NULL;
|
||||
@ -447,9 +447,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
|
||||
|
||||
NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
if (resources->useGdr && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
|
||||
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
|
||||
*((struct connectMap**)respBuff) = &resources->map;
|
||||
return ncclSuccess;
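
The registration above now prefers the DMA-BUF path when the buffer is GPU memory and the communicator detected kernel support, and falls back to the classic nv_peermem registration otherwise. A minimal sketch of that pattern, assuming hypothetical registerGpuBuffer/regDmaBufFn/regClassicFn names standing in for the collNetRegMrDmaBuf/collNetRegMr calls:

#include <cuda.h>
#include <unistd.h>

typedef int (*regDmaBufFn)(void* buf, size_t size, int fd, void** mhandle);
typedef int (*regClassicFn)(void* buf, size_t size, void** mhandle);

/* Prefer DMA-BUF registration; fall back to the nv_peermem (classic GDR) path. */
static int registerGpuBuffer(void* devPtr, size_t size, int dmaBufSupported,
                             regDmaBufFn regDmaBuf, regClassicFn regClassic,
                             void** mhandle) {
  if (dmaBufSupported) {
    int fd = -1;
    /* Export the VA range backing the allocation as a dmabuf file descriptor (CUDA 11.7+). */
    if (cuMemGetHandleForAddressRange((void*)&fd, (CUdeviceptr)devPtr, size,
                                      CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0) == CUDA_SUCCESS) {
      int ret = regDmaBuf(devPtr, size, fd, mhandle);
      close(fd);  /* the registered memory region keeps its own reference to the pages */
      return ret;
    }
  }
  return regClassic(devPtr, size, mhandle);  /* nv_peermem GDR (or host) path */
}

Closing the fd immediately after registration mirrors the code above: once the NIC has imported the dmabuf, the descriptor is no longer needed.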
|
||||
@ -503,9 +516,22 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
|
||||
|
||||
NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
if (resources->useGdr && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
|
||||
&resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
|
||||
// Pass info to send side
|
||||
info->reqFifo = resources->reqFifo;
|
||||
@ -521,7 +547,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->sendMhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@ -538,7 +564,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->mhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@ -625,10 +651,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
|
||||
if (reqFifo[group][buffSlot].recvBuff != NULL) {
|
||||
int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
|
||||
int count = totalSize / ncclTypeSize(args->dtype);
|
||||
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
|
||||
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] == NULL) continue;
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
|
||||
@ -644,7 +670,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int done, size;
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size));
|
||||
NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
@ -735,7 +761,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
|
||||
NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
|
||||
}
|
||||
} else {
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
@ -749,7 +775,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
|
||||
int done = 1;
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL));
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] flushed", sub->flushed, group, buffSlot);
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
|
@ -181,10 +181,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
if (proxyRank == myInfo->rank) {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
|
||||
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
}
|
||||
*((int*)connectInfo) = proxyRank;
|
||||
@ -217,7 +217,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
req.remoteRank = peerInfo->rank;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -447,7 +447,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->channelId = req->channelId;
|
||||
resources->connIndex = req->connIndex;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
|
||||
resources->maxRecvs = props.maxRecvs;
|
||||
|
||||
// We don't return any data
|
||||
@ -473,11 +473,11 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->channelId = req->channelId;
|
||||
resources->connIndex = req->connIndex;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
|
||||
resources->maxRecvs = props.maxRecvs;
|
||||
|
||||
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
|
||||
NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm));
|
||||
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
|
||||
*done = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -504,15 +504,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
|
||||
}
|
||||
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
|
||||
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId));
|
||||
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId));
|
||||
resources->netSendComm = comms->sendComm[resources->channelId];
|
||||
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
|
||||
} else {
|
||||
NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
|
||||
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
|
||||
}
|
||||
} else {
|
||||
// Connect to remote peer
|
||||
NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
|
||||
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
}
|
||||
if (resources->netSendComm == NULL) {
|
||||
@ -586,7 +586,19 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -620,15 +632,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
|
||||
}
|
||||
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
|
||||
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId));
|
||||
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId));
|
||||
resources->netRecvComm = comms->recvComm[resources->channelId];
|
||||
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
|
||||
} else {
|
||||
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
|
||||
}
|
||||
} else {
|
||||
// Connect to remote peer
|
||||
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
}
|
||||
if (resources->netRecvComm == NULL) {
|
||||
@ -636,7 +648,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
return ncclSuccess;
|
||||
}
|
||||
*done = 1;
|
||||
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
|
||||
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
|
||||
|
||||
// Create structures
|
||||
struct connectMap* map = &resources->map;
|
||||
@ -691,7 +703,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -709,7 +733,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
}
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
|
||||
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@ -725,12 +749,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
|
||||
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
|
||||
comms->sendRefCount[resources->channelId]--;
|
||||
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId]));
|
||||
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@ -744,7 +768,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
}
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
|
||||
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@ -756,12 +780,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
|
||||
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
|
||||
comms->recvRefCount[resources->channelId]--;
|
||||
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId]));
|
||||
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@ -849,7 +873,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
if (ready) {
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||
sizesFifo[buffSlot] = -1;
|
||||
@ -867,7 +891,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
if (sub->done < sub->transmitted) {
|
||||
int done;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
|
||||
NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
@ -971,7 +995,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
uint64_t step = subGroup->posted;
|
||||
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
|
||||
void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
|
||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
|
||||
NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
|
||||
if (*requestPtr) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup+i;
|
||||
@ -993,7 +1017,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int sizes[NCCL_PROXY_MAX_SUBS];
|
||||
void* mhandles[NCCL_PROXY_MAX_SUBS];
|
||||
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
|
||||
NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
|
||||
NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
|
||||
if (done) {
|
||||
int useGdr = 0;
|
||||
int totalSize = 0;
|
||||
@ -1034,7 +1058,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
}
|
||||
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
|
||||
NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
|
||||
NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
|
||||
}
|
||||
}
|
||||
args->idle = 0;
|
||||
@ -1049,7 +1073,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
uint64_t step = subGroup->transmitted;
|
||||
int done = 1;
|
||||
void* request = subGroup->requests[step%NCCL_STEPS];
|
||||
if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
|
||||
if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
|
||||
if (done) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
@ -274,6 +274,31 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Detect whether DMA-BUF support is present in the kernel
// Returns :
// ncclSuccess : DMA-BUF support is available
// ncclSystemError : DMA-BUF is not supported by the kernel
ncclResult_t ncclIbDmaBufSupport(int dev) {
|
||||
static int dmaBufSupported = -1;
|
||||
if (dmaBufSupported == -1) {
|
||||
ncclResult_t res;
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_context* ctx;
|
||||
ctx = ncclIbDevs[dev].context;
|
||||
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
|
||||
// Test kernel DMA-BUF support with a dummy call (fd=-1)
|
||||
(void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise)
|
||||
dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0;
|
||||
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
|
||||
}
|
||||
if (dmaBufSupported == 0) return ncclSystemError;
|
||||
return ncclSuccess;
|
||||
failure:
|
||||
dmaBufSupported = 0;
|
||||
return ncclSystemError;
|
||||
}
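
The probe above relies on the kernel rejecting an obviously invalid registration in a way that distinguishes "not supported" (EOPNOTSUPP) from "bad file descriptor". A standalone sketch of the same check against plain libibverbs, assuming an rdma-core recent enough to expose ibv_reg_dmabuf_mr (NCCL itself goes through its wrap_* dynamic-loading layer); ibDevSupportsDmaBuf is an illustrative name:

#include <errno.h>
#include <infiniband/verbs.h>

static int ibDevSupportsDmaBuf(struct ibv_context* ctx) {
  struct ibv_pd* pd = ibv_alloc_pd(ctx);
  if (pd == NULL) return 0;
  errno = 0;
  /* fd = -1 can never succeed; the interesting part is which errno comes back. */
  (void) ibv_reg_dmabuf_mr(pd, 0 /*offset*/, 0 /*len*/, 0 /*iova*/, -1 /*fd*/, 0 /*access*/);
  int supported = (errno != EOPNOTSUPP);
  ibv_dealloc_pd(pd);
  return supported;
}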
|
||||
|
||||
static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
|
||||
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
|
||||
return ncclSuccess;
|
||||
@ -286,10 +311,11 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
props->pciPath = ncclIbDevs[dev].pciPath;
|
||||
props->guid = ncclIbDevs[dev].guid;
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
if (ncclIbGdrSupport(dev) != ncclSuccess) {
|
||||
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
|
||||
} else {
|
||||
props->ptrSupport |= NCCL_PTR_CUDA;
|
||||
if (ncclIbGdrSupport(dev) == ncclSuccess) {
|
||||
props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem
|
||||
}
|
||||
if (ncclIbDmaBufSupport(dev) == ncclSuccess) {
|
||||
props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
|
||||
}
|
||||
props->speed = ncclIbDevs[dev].speed;
|
||||
props->latency = 0; // Not set
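
With the change above, GPU Direct support is advertised as two independent bits: NCCL_PTR_CUDA when nv_peermem is usable and NCCL_PTR_DMABUF when the kernel probe succeeds. A sketch of how a caller of the v6 net API might pick a registration path from that bitmask; pickRegPath and its decision logic are illustrative, not NCCL's:

#include "nccl_net.h"

static int pickRegPath(ncclNetProperties_t* props, int bufferIsCuda, int commHasDmaBuf) {
  if (bufferIsCuda && commHasDmaBuf && (props->ptrSupport & NCCL_PTR_DMABUF))
    return NCCL_PTR_DMABUF;   /* register through the regMrDmaBuf() entry point */
  if (bufferIsCuda && (props->ptrSupport & NCCL_PTR_CUDA))
    return NCCL_PTR_CUDA;     /* classic GPUDirect RDMA via nv_peermem */
  return NCCL_PTR_HOST;       /* stage through host memory */
}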
|
||||
@ -546,6 +572,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
|
||||
memset(handle, 0, sizeof(struct ncclIbHandle));
|
||||
comm->dev = dev;
|
||||
comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
|
||||
NCCLCHECK(GetSocketAddr(&comm->sock.addr));
|
||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
|
||||
@ -580,7 +607,7 @@ ib_connect_check:
|
||||
/* expect user to call again */
|
||||
return ncclSuccess;
|
||||
} else if (conState == ncclSocketError) {
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
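
Connection-stage socket failures now surface as the new ncclRemoteError code instead of ncclSystemError, so callers can tell a remote or network failure from a local one. A usage sketch, assuming an application that polls the communicator and prints the string from the new ncclGetLastError(); checkComm is an illustrative helper:

#include <nccl.h>
#include <stdio.h>

static int checkComm(ncclComm_t comm) {
  ncclResult_t asyncErr;
  if (ncclCommGetAsyncError(comm, &asyncErr) != ncclSuccess) return -1;
  if (asyncErr == ncclRemoteError) {
    /* A remote peer or the network failed; local GPU/driver state may still be healthy. */
    fprintf(stderr, "NCCL remote failure: %s\n", ncclGetLastError(comm));
    return -1;
  }
  if (asyncErr != ncclSuccess) {
    fprintf(stderr, "NCCL failure: %s\n", ncclGetLastError(comm));
    return -1;
  }
  return 0;
}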
|
||||
|
||||
// IB Setup
|
||||
@ -658,7 +685,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
||||
stage->comm = rComm;
|
||||
stage->state = ncclIbCommStateAccept;
|
||||
lComm->sock.asyncFlag = 1;
|
||||
rComm->sock.asyncFlag = 1;
|
||||
|
||||
ib_accept:
|
||||
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
|
||||
@ -812,7 +838,8 @@ ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
|
||||
|
||||
ncclResult_t ncclIbTest(void* request, int* done, int* size);
|
||||
|
||||
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
|
||||
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
|
||||
assert(size > 0);
|
||||
|
||||
@ -822,7 +849,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
|
||||
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
|
||||
struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
|
||||
uintptr_t addr = (uintptr_t)data & -pageSize;
|
||||
int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
ncclResult_t res;
|
||||
pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
|
||||
for (int slot=0; /*true*/; slot++) {
|
||||
@ -834,14 +861,20 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
|
||||
// Deregister / register
|
||||
struct ibv_mr* mr;
|
||||
unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
|
||||
if (ncclIbRelaxedOrderingEnabled) {
|
||||
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
|
||||
if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
|
||||
if (fd != -1) {
|
||||
/* DMA-BUF support */
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
|
||||
} else {
|
||||
if (ncclIbRelaxedOrderingEnabled) {
|
||||
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning);
|
||||
}
|
||||
else {
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
|
||||
}
|
||||
}
|
||||
else {
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
|
||||
}
|
||||
TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
|
||||
TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd);
|
||||
cache->population += 1;
|
||||
cache->slots[slot].addr = addr;
|
||||
cache->slots[slot].pages = pages;
|
||||
@ -863,6 +896,10 @@ returning:
|
||||
return res;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||
return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle);
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
|
||||
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
|
||||
struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
|
||||
@ -916,13 +953,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
|
||||
// Write size as immediate data. In the case of multi-send, only write
|
||||
// 0 or 1 as size to indicate whether there was data sent or received.
|
||||
uint64_t immData = 0;
|
||||
uint32_t immData = 0;
|
||||
if (nreqs == 1) {
|
||||
immData = reqs[0]->send.size;
|
||||
} else {
|
||||
uint8_t* multiImmData = (uint8_t*)&immData;
|
||||
if (nreqs > 32) {
|
||||
WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs);
|
||||
return ncclInternalError;
|
||||
}
|
||||
for (int r=0; r<nreqs; r++) {
|
||||
multiImmData[r] = reqs[r]->send.size ? 1 : 0;
|
||||
immData |= (reqs[r]->send.size ? 1 : 0) << r;
|
||||
}
|
||||
}
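
Because the RDMA immediate is only 32 bits, a multi-send can no longer carry byte counts; each aggregated request contributes a single "data was sent" bit, and the receive completion path further down decodes it with (wc->imm_data >> i) & 0x1. A standalone sketch of the scheme, with hypothetical packSizes/unpackSizes helpers:

#include <stdint.h>

static uint32_t packSizes(const int* sizes, int nreqs) {  /* nreqs must be <= 32 */
  uint32_t imm = 0;
  for (int r = 0; r < nreqs; r++) imm |= (sizes[r] ? 1u : 0u) << r;
  return imm;
}

static void unpackSizes(uint32_t imm, int* sizes, int nreqs) {
  for (int r = 0; r < nreqs; r++) sizes[r] = (imm >> r) & 0x1;  /* 0 or 1, not a byte count */
}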
|
||||
|
||||
@ -1197,7 +1237,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
|
||||
ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
|
||||
struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
|
||||
@ -1212,9 +1252,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
|
||||
if (req->nreqs > 1) {
|
||||
// In the case of a multi recv, we only set sizes to 0 or 1.
|
||||
uint8_t* sizes = (uint8_t*)&wc->imm_data;
|
||||
for (int i=0; i<req->nreqs; i++) {
|
||||
req->recv.sizes[i] |= sizes[i];
|
||||
req->recv.sizes[i] = (wc->imm_data >> i) & 0x1;
|
||||
}
|
||||
} else {
|
||||
req->recv.sizes[0] += wc->imm_data;
|
||||
@ -1275,6 +1314,7 @@ ncclNet_t ncclNetIb = {
|
||||
ncclIbConnect,
|
||||
ncclIbAccept,
|
||||
ncclIbRegMr,
|
||||
ncclIbRegMrDmaBuf,
|
||||
ncclIbDeregMr,
|
||||
ncclIbIsend,
|
||||
ncclIbIrecv,
|
||||
|
@ -311,6 +311,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
struct ncclSocketListenComm* comm;
|
||||
NCCLCHECK(ncclSocketNewListenComm(&comm));
|
||||
NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
|
||||
comm->sock.asyncFlag = 1;
|
||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
|
||||
@ -359,7 +360,7 @@ socket_connect_check:
|
||||
/* expect user to call again */
|
||||
return ncclSuccess;
|
||||
} else if (conState == ncclSocketError) {
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
stage->state = ncclSocketCommStateSend;
|
||||
|
||||
@ -616,6 +617,7 @@ ncclNet_t ncclNetSocket = {
|
||||
ncclSocketConnect,
|
||||
ncclSocketAccept,
|
||||
ncclSocketRegMr,
|
||||
NULL, // No DMA-BUF support
|
||||
ncclSocketDeregMr,
|
||||
ncclSocketIsend,
|
||||
ncclSocketIrecv,
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "comm.h"
|
||||
#include "graph.h"
|
||||
#include "utils.h"
|
||||
#include "shm.h"
|
||||
|
||||
struct ncclP2pBuff {
|
||||
void* directPtr;
|
||||
@ -17,6 +18,34 @@ struct p2pConnectInfo {
|
||||
int rank;
|
||||
int read;
|
||||
struct ncclP2pBuff p2pBuff;
|
||||
// Used by CE memcpy
char shmName[7];
|
||||
int shmSize;
|
||||
};
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
|
||||
|
||||
struct p2pShm {
|
||||
struct ncclSendMem sendMem;
|
||||
struct ncclRecvMem recvMem;
|
||||
};
|
||||
struct p2pProxyInfo {
|
||||
// Shared memory between proxy and receiving GPU
|
||||
struct p2pShm* shm;
|
||||
struct p2pShm* devShm;
|
||||
char shmName[7];
|
||||
int shmSize;
|
||||
|
||||
// Intermediate step for sender
|
||||
struct ncclRecvMem* ceRecvMem;
|
||||
char* ceDevBuff;
|
||||
|
||||
// Receiver buffer
|
||||
char* recvFifo;
|
||||
|
||||
// Used by progress only
|
||||
uint64_t step;
|
||||
cudaStream_t stream;
|
||||
cudaEvent_t events[NCCL_STEPS];
|
||||
};
|
||||
static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
|
||||
|
||||
@ -24,12 +53,16 @@ struct p2pSendResources {
|
||||
struct ncclSendMem* devMem;
|
||||
void* sendMemIpc;
|
||||
void* recvMemIpc;
|
||||
struct p2pProxyInfo proxyInfo;
|
||||
};
|
||||
|
||||
struct p2pRecvResources {
|
||||
struct ncclRecvMem* devMem;
|
||||
void* sendMemIpc;
|
||||
void* recvMemIpc;
|
||||
struct p2pShm* shm;
|
||||
struct p2pShm* devShm;
|
||||
int shmSize;
|
||||
};
|
||||
|
||||
#include <sys/types.h>
|
||||
@ -51,8 +84,14 @@ static int busIdToCudaDev(int64_t busId) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
|
||||
static int useMemcpy = 0;
|
||||
static void initCeOperation();
|
||||
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
initCeOperation();
|
||||
|
||||
// Rule out different nodes / isolated containers
|
||||
if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
|
||||
*ret = 0;
|
||||
@ -63,7 +102,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
int intermediateRank;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
if (intermediateRank != -1) return ncclSuccess;
|
||||
if (intermediateRank != -1) {
|
||||
if (useMemcpy) *ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
@ -170,6 +212,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
send->transportResources = resources;
|
||||
int useRead, intermediateRank;
|
||||
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
|
||||
if (useMemcpy) useRead = 0;
|
||||
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
@ -185,14 +228,14 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
|
||||
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
} else {
|
||||
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "");
|
||||
}
|
||||
} else {
|
||||
info->rank = intermediateRank;
|
||||
@ -202,9 +245,15 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
|
||||
info->shmSize = resources->proxyInfo.shmSize;
|
||||
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
|
||||
} else {
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
|
||||
}
|
||||
|
||||
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -230,7 +279,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
|
||||
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
} else {
|
||||
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
@ -258,30 +307,61 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
|
||||
if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
|
||||
send->conn.buffs[p] = (char*)(resources->devMem+1);
|
||||
} else {
|
||||
send->conn.buffs[p] = buff;
|
||||
buff += send->comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
send->conn.tail = &remDevMem->tail;
|
||||
send->conn.head = &resources->devMem->head;
|
||||
send->conn.ptrExchange = &resources->devMem->ptrExchange;
|
||||
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
|
||||
|
||||
if (useMemcpy) {
|
||||
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
|
||||
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
|
||||
// Send SIMPLE buff to proxy, and replace it by local buffer
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
|
||||
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
|
||||
} else {
|
||||
send->conn.tail = &remDevMem->tail;
|
||||
send->conn.head = &resources->devMem->head;
|
||||
send->conn.ptrExchange = &resources->devMem->ptrExchange;
|
||||
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Connect/Recv from this peer */
|
||||
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
|
||||
struct ncclSendMem* remDevMem;
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
|
||||
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
|
||||
struct ncclSendMem* remDevMem = NULL;
|
||||
|
||||
if (useMemcpy) {
|
||||
char shmPath[PATH_MAX];
|
||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
resources->shmSize = info->shmSize;
|
||||
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
|
||||
// Remove the file to ensure proper clean-up
|
||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
||||
|
||||
recv->conn.tail = &resources->devShm->recvMem.tail;
|
||||
recv->conn.head = &resources->devShm->sendMem.head;
|
||||
} else {
|
||||
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
|
||||
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
|
||||
}
|
||||
|
||||
char* buff = (char*)(resources->devMem+1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
|
||||
/* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
|
||||
recv->conn.buffs[p] = (char*)(remDevMem+1);
|
||||
} else {
|
||||
@ -289,10 +369,6 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
buff += recv->comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -308,11 +384,52 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
|
||||
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
|
||||
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
if (useMemcpy) {
|
||||
struct p2pProxyInfo* proxyInfo;
|
||||
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
|
||||
connection->transportResources = proxyInfo;
|
||||
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE]));
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
|
||||
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
|
||||
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
|
||||
|
||||
if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;
|
||||
memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
|
||||
} else {
|
||||
if (reqSize != sizeof(int)) return ncclInternalError;
|
||||
int size = *((int*)reqBuff);
|
||||
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
|
||||
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size));
|
||||
connection->transportResources = p2pBuff->directPtr;
|
||||
cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
|
||||
if (res != cudaSuccess) {
|
||||
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
|
||||
cudaFree(p2pBuff->directPtr);
|
||||
free(p2pBuff);
|
||||
CUDACHECK(res);
|
||||
}
|
||||
}
|
||||
*done = 1;
|
||||
return ncclSuccess;
|
||||
}
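
In the CE (copy-engine) path above, the proxy allocates a staging device buffer plus a small host shared-memory segment holding the send/recv head and tail counters, and returns the segment name so the peer rank can map the same memory. A rough sketch of that create/attach handshake using plain POSIX shm, with error handling mostly omitted and an illustrative shmCreateOrAttach name (NCCL's ncclShmOpen wraps roughly this sequence):

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cuda_runtime.h>

/* The creator passes create=1 and sends `name` and `size` to the peer,
 * which re-opens the same segment with create=0. */
static void* shmCreateOrAttach(const char* name, size_t size, int create, void** devPtr) {
  int fd = shm_open(name, create ? (O_CREAT | O_RDWR) : O_RDWR, 0600);
  if (fd < 0) return NULL;
  if (create && ftruncate(fd, size) != 0) { close(fd); return NULL; }
  void* hptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);
  if (hptr == MAP_FAILED) return NULL;
  /* Pin the mapping and get a device pointer so the GPU can poll/update the counters. */
  cudaHostRegister(hptr, size, cudaHostRegisterMapped);
  cudaHostGetDevicePointer(devPtr, hptr, 0);
  return hptr;
}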
|
||||
|
||||
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
if (reqSize != sizeof(int)) return ncclInternalError;
|
||||
int size = *((int*)reqBuff);
|
||||
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
|
||||
@ -330,15 +447,116 @@ static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
|
||||
|
||||
if (reqSize != sizeof(void*)) return ncclInternalError;
|
||||
proxyInfo->recvFifo = *((char**)reqBuff);
|
||||
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
|
||||
}
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
if (useMemcpy) {
|
||||
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
|
||||
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
|
||||
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
|
||||
CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
|
||||
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventDestroy(proxyInfo->events[i]));
|
||||
}
|
||||
free(proxyInfo);
|
||||
} else {
|
||||
// Do not check return code as CUDA may have already shut down
|
||||
cudaFree(connection->transportResources);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
// Do not check return code as CUDA may have already shut down
|
||||
cudaFree(connection->transportResources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
        resources->step = sub->base + sub->nsteps;
        args->done++;
        continue;
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
        // Check GPU has sent everything
        if ((*recvTail > sub->base+sub->transmitted)) {
          int size = sizesFifo[buffSlot];
          CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          sub->transmitted += args->sliceSteps;
        }
      }
      if (sub->done < sub->transmitted) {
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        cudaError_t res = cudaEventQuery(resources->events[buffSlot]);
        if (res != cudaErrorNotReady) CUDACHECK(res);
        if (res == cudaSuccess) {
          sub->done += args->sliceSteps;
          // Notify SHM
          resources->shm->recvMem.tail = sub->base + sub->done;
        }
        if (sub->done == sub->nsteps) {
          resources->step = sub->base + sub->nsteps;
          args->done++;
        }
      }
    }
    if (args->done == args->nsubs) {
      args->state = ncclProxyOpNone;
    }
  }
  return ncclSuccess;
}

struct ncclTransport p2pTransport = {
  "P2P",
  p2pCanConnect,
  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },
  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }
  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
};

static void initCeOperation() {
  static int init = 0;
  if (!init) {
    useMemcpy = ncclParamP2pUseCudaMemcpy();
    if (useMemcpy) {
      p2pTransport.send.proxyConnect = p2pSendProxyConnect;
      p2pTransport.send.proxyProgress = p2pSendProxyProgress;
    }
    init = 1;
  }
}

@ -31,11 +31,21 @@ struct shmRecvResources {
  struct ncclRecvMem* devHostMem;
};

#define SHM_SEND_SIDE 1
#define SHM_RECV_SIDE 2
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0);
NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both
static int useMemcpySend = 0;
static int useMemcpyRecv = 0;
NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-side, 2 is receiver-side
static int shmLocality = 0;
static void initCeOperation();

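As a usage aside: NCCL_PARAM() normally maps each of these knobs to an NCCL_-prefixed environment variable, so an application would opt into the copy-engine (CE) path before creating its first communicator. The following is a minimal sketch only; the NCCL_SHM_USE_CUDA_MEMCPY, NCCL_SHM_MEMCPY_MODE and NCCL_SHM_LOCALITY names are assumed from that naming convention, and the mode value is treated as a bit mask (bit 0 = sender side, bit 1 = receiver side).

// Usage sketch only; the environment variable names are assumed, not defined here.
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  // Must be set before the first communicator is created, since
  // initCeOperation() reads the parameters only once.
  setenv("NCCL_SHM_USE_CUDA_MEMCPY", "1", 1);
  setenv("NCCL_SHM_MEMCPY_MODE", "3", 1);   // 1: send side, 2: recv side, 3: both
  setenv("NCCL_SHM_LOCALITY", "2", 1);      // keep buffers on the receiver side

  int mode = atoi(getenv("NCCL_SHM_MEMCPY_MODE"));
  printf("send-side CE: %d, recv-side CE: %d\n", mode & 1, (mode & 2) ? 1 : 0);
  return 0;
}
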
/* Determine two peers can communicate with SHM */
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  *ret = 0;
  initCeOperation();

  if (ncclParamShmDisable() == 1) return ncclSuccess;

@ -55,7 +65,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#define MAX_SHM_NAME_LEN 1024

/* Create and return connect structures for this peer to connect to me */
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
  struct shmSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;
@ -65,16 +75,20 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st

  char shmPath[PATH_MAX];
  shmPath[0] = '\0';
  info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
  int shmSize = sizeof(struct ncclSendMem);
  if (shmLocality == SHM_SEND_SIDE) {
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
  }
  info->shmSize = resources->shmSize = shmSize;
  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));

  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct");
  return ncclSuccess;
}

ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
  struct shmRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
@ -85,7 +99,9 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  char shmPath[PATH_MAX];
  shmPath[0] = '\0';
  int shmSize = sizeof(struct ncclRecvMem);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
  if (shmLocality == SHM_RECV_SIDE) {
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
  }
  info->shmSize = resources->shmSize = shmSize;
  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
@ -94,8 +110,21 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  return ncclSuccess;
}

struct shmProxyInfo {
  struct ncclRecvMem* ceRecvMem;
  char* devFifo;
  char* shmFifo;
  struct ncclSendMem* sendMem;
  struct ncclRecvMem* recvMem;

  // used by progress only
  uint64_t step;
  cudaStream_t stream;
  cudaEvent_t events[NCCL_STEPS];
};

/* Connect to this peer */
ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@ -108,19 +137,29 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
  // Remove the file to ensure proper clean-up
  NCCLCHECK(ncclShmUnlink(shmPath));

  send->transportResources = resources;
  int offset = 0;
  char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
    offset += send->comm->buffSizes[p];
    send->conn.buffs[p] = buff;
    buff += send->comm->buffSizes[p];
  }
  send->conn.tail = &resources->devRemHostMem->tail;

  send->conn.head = &resources->devHostMem->head;

  if (useMemcpyRecv) {
    send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
  }
  if (useMemcpySend) {
    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
    struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
    send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
    send->conn.tail = &proxyInfo.ceRecvMem->tail;
    send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
  }
  return ncclSuccess;
}

ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
@ -131,18 +170,26 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
  NCCLCHECK(ncclShmUnlink(shmPath));
  recv->conn.head = &resources->devRemHostMem->head;

  int offset = 0;
  char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
    offset += recv->comm->buffSizes[p];
    recv->conn.buffs[p] = buff;
    buff += recv->comm->buffSizes[p];
  }
  recv->conn.head = &resources->devRemHostMem->head;
  recv->conn.tail = &resources->devHostMem->tail;

  if (useMemcpyRecv) {
    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
    struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
    NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
    recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
    recv->conn.tail = &proxyInfo.ceRecvMem->tail;
  }
  return ncclSuccess;
}

ncclResult_t shmSendFree(struct ncclConnector* send) {
static ncclResult_t shmSendFree(struct ncclConnector* send) {
  struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
  NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
  NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
@ -150,7 +197,7 @@ ncclResult_t shmSendFree(struct ncclConnector* send) {
  return ncclSuccess;
}

ncclResult_t shmRecvFree(struct ncclConnector* recv) {
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
  NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
@ -158,9 +205,209 @@ ncclResult_t shmRecvFree(struct ncclConnector* recv) {
  return ncclSuccess;
}

static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(proxyInfo, reqBuff, reqSize);
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}

static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(proxyInfo, reqBuff, reqSize);
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}

static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  CUDACHECK(cudaStreamDestroy(resources->stream));
  CUDACHECK(cudaFree(resources->devFifo));
  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventDestroy(resources->events[i]));
  }
  free(connection->transportResources);
  return ncclSuccess;
}

static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  CUDACHECK(cudaStreamDestroy(resources->stream));
  CUDACHECK(cudaFree(resources->devFifo));
  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventDestroy(resources->events[i]));
  }
  free(connection->transportResources);
  return ncclSuccess;
}

static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
        resources->step = sub->base + sub->nsteps;
        args->done++;
        continue;
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
        // Check GPU has sent everything
        if ((*recvTail > sub->base+sub->transmitted)) {
          int size = sizesFifo[buffSlot];
          CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          resources->recvMem->sizesFifo[buffSlot] = size;
          __sync_synchronize(); // make sure sizesFifo is visible
          sub->transmitted += args->sliceSteps;
        }
      }
      if (sub->done < sub->transmitted) {
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        cudaError_t res = cudaEventQuery(resources->events[buffSlot]);
        if (res != cudaErrorNotReady) CUDACHECK(res);
        if (res == cudaSuccess) {
          sub->done += args->sliceSteps;
          // Notify SHM
          resources->recvMem->tail = sub->base + sub->done;
        }
        if (sub->done == sub->nsteps) {
          resources->step = sub->base + sub->nsteps;
          args->done++;
        }
      }
    }
    if (args->done == args->nsubs) {
      args->state = ncclProxyOpNone;
    }
  }
  return ncclSuccess;
}

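The progress function above implements a small staged-copy pipeline: whenever the GPU has published a new step (tail and sizesFifo), the proxy issues an asynchronous device-to-host copy into the shared-memory FIFO, records a per-slot event, and only advances the peer-visible tail once that event has completed. Below is a simplified, standalone sketch of the same slot pipeline; the names are hypothetical and no NCCL internals are used.

/* Standalone sketch of the slot pipeline (assumed names, not NCCL code). */
#include <cuda_runtime.h>
#include <stdio.h>

#define SLOTS 8
#define SLOT_BYTES (1 << 20)

int main(void) {
  char *devFifo, *hostFifo;
  cudaStream_t stream;
  cudaEvent_t events[SLOTS];
  cudaMalloc((void**)&devFifo, SLOTS * SLOT_BYTES);
  cudaMallocHost((void**)&hostFifo, SLOTS * SLOT_BYTES);  // stands in for the SHM FIFO
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
  for (int i = 0; i < SLOTS; i++) cudaEventCreate(&events[i]);

  int transmitted = 0, done = 0, nsteps = 32;
  while (done < nsteps) {
    // Issue a copy for the next slot if a slot is free (here the producer
    // always has data; the real proxy checks the GPU-published tail).
    if (transmitted < nsteps && transmitted < done + SLOTS) {
      int slot = transmitted % SLOTS;
      cudaMemcpyAsync(hostFifo + slot * SLOT_BYTES, devFifo + slot * SLOT_BYTES,
                      SLOT_BYTES, cudaMemcpyDeviceToHost, stream);
      cudaEventRecord(events[slot], stream);
      transmitted++;
    }
    // Retire the oldest slot once its copy finished; this is where the
    // real proxy bumps the peer-visible tail.
    if (done < transmitted && cudaEventQuery(events[done % SLOTS]) == cudaSuccess) {
      done++;
    }
  }
  printf("pipelined %d steps through %d slots\n", done, SLOTS);
  for (int i = 0; i < SLOTS; i++) cudaEventDestroy(events[i]);
  cudaStreamDestroy(stream);
  cudaFreeHost(hostFifo);
  cudaFree(devFifo);
  return 0;
}
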
static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
        resources->step = sub->base + sub->nsteps;
        args->done++;
        continue;
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->recvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
        // Check data is ready in SHM
        if ((*recvTail > sub->base+sub->transmitted)) {
          int size = sizesFifo[buffSlot];
          CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          sub->transmitted += args->sliceSteps;
        }
      }
      if (sub->done < sub->transmitted) {
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        cudaError_t res = cudaEventQuery(resources->events[buffSlot]);
        if (res != cudaErrorNotReady) CUDACHECK(res);
        if (res == cudaSuccess) {
          sub->done += args->sliceSteps;
          // Notify GPU
          resources->ceRecvMem->tail = sub->base + sub->done;
        }
        if (sub->done == sub->nsteps) {
          resources->step = sub->base + sub->nsteps;
          args->done++;
        }
      }
    }
    if (args->done == args->nsubs) {
      args->state = ncclProxyOpNone;
    }
  }
  return ncclSuccess;
}

struct ncclTransport shmTransport = {
  "SHM",
  shmCanConnect,
  { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
};

static void initCeOperation() {
  static int init = 0;
  if (!init) {
    useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1);
    useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
    if (useMemcpySend) {
      shmTransport.send.proxyConnect = shmSendProxyConnect;
      shmTransport.send.proxyFree = shmSendProxyFree;
      shmTransport.send.proxyProgress = shmSendProxyProgress;
    }
    if (useMemcpyRecv) {
      shmTransport.recv.proxyConnect = shmRecvProxyConnect;
      shmTransport.recv.proxyFree = shmRecvProxyFree;
      shmTransport.recv.proxyProgress = shmRecvProxyProgress;
    }
    shmLocality = ncclParamShmLocality();
    if (shmLocality != SHM_SEND_SIDE && shmLocality != SHM_RECV_SIDE) {
      WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)");
      shmLocality = SHM_RECV_SIDE;
    }
    init = 1;
  }
}

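initCeOperation() installs the CE-based proxy callbacks only when the corresponding mode bit is set, so the default direct path is left untouched otherwise. A minimal sketch of that late-binding pattern (hypothetical names, not NCCL code):

#include <stdio.h>

typedef int (*progressFn)(void);
static int directProgress(void) { printf("direct path\n"); return 0; }
static int ceProgress(void)     { printf("copy-engine path\n"); return 0; }

struct transport { progressFn progress; };
static struct transport myTransport = { directProgress };

static void initOnce(int useCe) {
  static int init = 0;
  if (!init) {
    if (useCe) myTransport.progress = ceProgress;  // swap in the CE callback once
    init = 1;
  }
}

int main(void) {
  initOnce(1);
  return myTransport.progress();
}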