Add new NVLS algorithm for allreduce using NVLink SHARP (intra-node only).
Add new config options: cgaClusterSize, minCTAs, maxCTAs, netName (usage sketch below).
Enable LL128 when we use PXN to close rings.
Update NVTX3 includes.
Fix crash when one CollNet (SHARP) rail fails to initialize.
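The new communicator options listed above are plain fields on ncclConfig_t, set before communicator creation. A minimal, hedged sketch (the chosen values and the helper name are illustrative only; the usual rank/ncclUniqueId bootstrap is assumed to have happened already):

#include <nccl.h>

// Sketch: create a communicator with the options named in the changelog entry.
ncclResult_t createTunedComm(ncclComm_t* comm, int nRanks, int myRank, ncclUniqueId id) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.cgaClusterSize = 2;   // CGA cluster size hint for kernels (sm90+)
  config.minCTAs = 4;          // lower bound on CTAs/channels NCCL may pick
  config.maxCTAs = 16;         // upper bound on CTAs/channels
  config.netName = "IB";       // request a specific network plugin by name
  return ncclCommInitRankConfig(comm, nRanks, id, myRank, &config);
}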
Sylvain Jeaugey 2023-02-27 02:48:21 -08:00
parent f3d5166783
commit 5d3ab08b69
72 changed files with 4541 additions and 2391 deletions

View File

@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
- NCCL_MINOR := 16
+ NCCL_MINOR := 17
- NCCL_PATCH := 5
+ NCCL_PATCH := 1
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@@ -12,7 +12,8 @@ INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
  misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
  misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
+ misc/ipcsocket.cc \
- transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
  collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
  graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -62,7 +63,7 @@ ALWAYS_REBUILD:
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
- $(INCDIR)/nccl.h : nccl.h.in
+ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
mkdir -p $(INCDIR)

View File

@@ -386,6 +386,24 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
  return ncclSuccess;
}
// IntraNode in-place Broadcast
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
if (nranks == 1) return ncclSuccess;
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
if (rank == root) {
for (int i=0; i<nranks; i++) {
if (i != root) NCCLCHECK(bootstrapSend(commState, ranks[i], /*tag=*/ranks[i], bcastData, size));
}
}
else {
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
return ncclSuccess;
}
ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
  // New unex
  struct unexConn* unex;
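For reference, a hedged host-side sketch of how the new in-place broadcast above is meant to be used: every rank in the intra-node group passes the same ranks[] array and buffer size, and only the root's buffer contents matter on entry. The wrapper name and payload are hypothetical.

// Illustration only. commState comes from the bootstrap network; localRanks lists
// the intra-node peers. Tag matching follows the code above.
struct blobExample { char data[128]; };   // hypothetical payload

static ncclResult_t shareBlob(void* commState, int* localRanks, int myRank,
                              int nLocalRanks, int root, struct blobExample* blob) {
  // Root fills *blob before the call; afterwards every rank holds the root's copy.
  NCCLCHECK(bootstrapIntraNodeBroadcast(commState, localRanks, myRank, nLocalRanks,
                                        root, blob, sizeof(*blob)));
  return ncclSuccess;
}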

View File

@@ -13,14 +13,15 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  if (channel->id != -1) return ncclSuccess;
  int nRanks = comm->nRanks;
+ int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
  channel->id = channelId;
  channel->workFifoSent = 0;
  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
  // The extra on nRanks+1 is for collnet root (i.e. network)
- channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
- NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.cudaStream));
+ channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
+ NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
  ncclCommPushCudaFree(comm, channel->devPeers);
  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
@@ -29,7 +30,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
- for (int r=0; r < nRanks+1; ++r) {
+ for (int r=0; r < nPeers; ++r) {
    for (int b=0; b < NCCL_MAX_CONNS; b++) {
      channel->peers[r].send[b].comm = comm;
      channel->peers[r].recv[b].comm = comm;

View File

@@ -97,3 +97,45 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
    runRing<T, RedOp, ProtoLL128>(args);
  }
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsGather = 128;
const int nThreadsBcast = 384 + WARP_SIZE;
const int tidEndGather = nThreadsGather;
const int tidEndBcast = tidEndGather + nThreadsBcast;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndGather) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Bcast through MC
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
}
}
};

View File

@@ -306,9 +306,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
      ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
      int nelem = min(direct->nHeads*chunkSize, size-offset);
      if (args->regUsed) {
-       prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
+       prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
      } else {
-       prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
+       prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
      }
    }
  } else if (tid >= tidStartReduce && direct->out != -1) {
@@ -344,7 +344,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
      int nelem = min(direct->nHeads*chunkSize, size-offset);
-     prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift);
+     prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
    }
  } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
    int group = (1*Proto::MaxGroupWidth) | (0<<16);
@@ -371,6 +371,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
  }
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const int reduceWarps = nranks <= 6 ? 6 : 4;
const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
const int nThreadsScatter = copyWarps*WARP_SIZE;
const int nThreadsGather = (copyWarps-1)*WARP_SIZE;
const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
int group = (2*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce, broadcast through NVLS
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
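To make the thread partitioning above concrete, here is a small host-side check that only mirrors the kernel's arithmetic, assuming NCCL_MAX_NTHREADS == 640 and WARP_SIZE == 32 (the values NCCL normally uses; treat them as assumptions here):

#include <cstdio>

int main() {
  const int NCCL_MAX_NTHREADS = 640, WARP_SIZE = 32;     // assumed constants
  int cases[2] = {4, 8};
  for (int c = 0; c < 2; c++) {
    int nranks = cases[c];
    int reduceWarps = nranks <= 6 ? 6 : 4;
    int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
    int nThreadsScatter = copyWarps*WARP_SIZE;            // feeds nvls->up via scatter
    int nThreadsGather  = (copyWarps-1)*WARP_SIZE;        // drains nvls->up via gather
    int nThreadsReduce  = (reduceWarps+1)*WARP_SIZE;      // recvSend loop on nvls->down
    // nranks=4: scatter=224 gather=192 reduce=224 -> 640
    // nranks=8: scatter=256 gather=224 reduce=160 -> 640
    printf("nranks=%d: scatter=%d gather=%d reduce=%d total=%d\n", nranks,
           nThreadsScatter, nThreadsGather, nThreadsReduce,
           nThreadsScatter + nThreadsGather + nThreadsReduce);
  }
  return 0;
}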
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {

View File

@@ -11,31 +11,23 @@
#include "devcomm.h"
#include "op128.h"
- #if __CUDA_ARCH__ >= 800
- #define COLL_UNROLL 8
- #else
- #define COLL_UNROLL 4
- #endif
+ #define COLL_UNROLL (ncclCollUnroll())
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
typedef void(*ncclKern_t)();
extern __device__ ncclKern_t ncclFuncs[];
struct ncclShmemGroup {
-   ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
-   ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY];
-   void* srcs[NCCL_MAX_DIRECT_ARITY+1];
-   void* dsts[NCCL_MAX_DIRECT_ARITY+1];
-   int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK];
+   ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
+   ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
+   void* srcs[NCCL_MAX_NVLS_ARITY+1];
+   void* dsts[NCCL_MAX_NVLS_ARITY+1];
+   int nvlsRecv;
};
struct ncclShmemData {
-   union {
-     uint64_t ll128warp[NCCL_LL128_MAX_NTHREADS/WARP_SIZE][NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE];
-     struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
-   };
-   uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
+   struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
+   uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
  int channelId;
  int aborted;
  alignas(16) struct ncclDevComm comm;
@@ -45,6 +37,15 @@ struct ncclShmemData {
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
extern __shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ >= 700
extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/];
#else
extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
__device__ inline void* ncclScratchForWarp(int warp) {
return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
}
__device__ inline bool barrierReduceAny(int bit) {
  uint32_t popc;
@@ -235,7 +236,8 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
  IMPL_COLL4(func, TREE, devredop, type, ncclType) \
  IMPL_COLL4(func, RING, devredop, type, ncclType) \
  IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
- IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType)
+ IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
+ IMPL_COLL4(func, NVLS, devredop, type, ncclType)
#if NCCL_TYPE == 0
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8)
@@ -291,4 +293,6 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
#define IMPL_COLL_P(func)
#endif
+ #define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
#endif

File diff suppressed because it is too large.

View File

@@ -9,6 +9,9 @@
#include "common.h"
__shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ < 700
__shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
@@ -19,7 +22,8 @@ __shared__ ncclShmemData ncclShmem;
  NCCL_FUNC5(func, TREE, devredop, type, nullify), \
  NCCL_FUNC5(func, RING, devredop, type, nullify), \
  NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
- NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
+ NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
+ NCCL_FUNC5(func, NVLS, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t

View File

@@ -6,7 +6,7 @@
#include "devcomm.h"
#include "collectives.h"
- #include "reduce_kernel.h"
+ #include "common_kernel.h"
#include "common.h"
namespace {
@@ -35,8 +35,10 @@ namespace {
    i1 = i1 < eltN ? i1 : eltN;
    src += i0;
    dst += i0;
-   ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
-     (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
+   void *vsrc = (void*)src;
+   void *vdst = (void*)dst;
+   ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
+     (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
  }
}
}

View File

@@ -65,4 +65,290 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
  v1 = tmp8[1];
}
template<typename T>
__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
return (uint32_t)__cvta_generic_to_shared(ptr);
}
template<typename T>
__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
return (uintptr_t)__cvta_generic_to_global(ptr);
}
template<typename T>
__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
T* ans;
asm("cvta.shared.u64 %0, %1;" : "=l"(ans) : "l"(uint64_t(shptr)));
return ans;
}
template<typename T>
__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
T* ans;
asm("cvta.global.u64 %0, %1;" : "=l"(ans) : "l"(gptr));
return ans;
}
////////////////////////////////////////////////////////////////////////////////
// BytePack<Size>: struct of bytes.
template<int Size>
union BytePack;
template<>
union BytePack<1> {
uint8_t u8, native;
};
template<>
union BytePack<2> {
BytePack<1> half[2];
uint8_t u8[2];
uint16_t u16, native;
};
template<>
union BytePack<4> {
BytePack<2> half[2];
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32, native;
};
template<>
union BytePack<8> {
BytePack<4> half[2];
uint8_t u8[8];
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64, native;
};
template<>
union alignas(16) BytePack<16> {
BytePack<8> half[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
};
template<typename T>
__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value) {
union { BytePack<sizeof(T)> p; T v; };
v = value;
return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack) {
union { BytePack<sizeof(T)> p; T v; };
p = pack;
return v;
}
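A tiny device-side sketch of how toPack/fromPack are meant to be used together with BytePack; the function name is made up and only illustrates the round trip:

// Demonstration only: reinterpret a value as raw bytes and back; the bit pattern
// is preserved, and the bytes can be handled as integers in between.
__device__ void bytePackRoundTrip(float* dst, const float* src) {
  BytePack<sizeof(float)> p = toPack<float>(*src);  // value -> raw bytes
  p.u32 ^= 0;                                       // bytes viewed as a 32-bit integer
  *dst = fromPack<float>(p);                        // raw bytes -> value
}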
////////////////////////////////////////////////////////////////////////////////
// Load/store of BytePack<?> using integral addresses.
template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
data_cxx_ty tmp = value.native; \
asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
}
// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l)
DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
#undef DEFINE_ld_st
#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
}
DEFINE_ld_st_16(global, uintptr_t, l)
DEFINE_ld_st_16(shared, uint32_t, r)
#undef DEFINE_ld_st_16
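A hedged sketch of the integral-address load/store helpers in action, combining cvta_to_global with the BytePack accessors defined above (the copy16 helper is illustrative, not part of the patch):

// Copy one 16-byte element global-to-global through the helpers above. The
// addresses are converted once so the ld/st PTX can use the .global state space.
__device__ void copy16(void* dst, const void* src) {
  uintptr_t s = cvta_to_global(src);
  uintptr_t d = cvta_to_global(dst);
  BytePack<16> v = ld_volatile_global<16>(s);  // volatile load: never serve stale L1 data
  st_global<16>(d, v);
}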
////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
uint64_t ans;
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#else
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#endif
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#else
asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#endif
return ans;
}
__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
}
__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
#if __CUDA_ARCH__ >= 700
asm volatile("st.relaxed.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#else
asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#endif
}
__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
#if __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#else
asm volatile("membar.sys; st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#endif
}
__device__ __forceinline__ void fence_acq_rel_sys() {
#if __CUDA_ARCH__ >= 700
asm volatile("fence.acq_rel.sys;" ::: "memory");
#else
asm volatile("membar.sys;" ::: "memory");
#endif
}
__device__ __forceinline__ void fence_acq_rel_gpu() {
#if __CUDA_ARCH__ >= 700
asm volatile("fence.acq_rel.gpu;" ::: "memory");
#else
asm volatile("membar.gl;" ::: "memory");
#endif
}
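These acquire/release helpers are what the primitives further down build their step-counter handshake on (loadStepValue / postPeer). A minimal illustrative producer/consumer pair, with made-up names:

// Producer: make prior data stores visible, then publish the new step.
__device__ void producerPost(uint64_t* stepPtr, uint64_t step) {
  st_release_sys_global(stepPtr, step);
}
// Consumer: spin until the producer has published at least wantStep.
__device__ void consumerWait(uint64_t* stepPtr, uint64_t wantStep) {
  while (ld_acquire_sys_global(stepPtr) < wantStep) { /* spin */ }
}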
////////////////////////////////////////////////////////////////////////////////
// Multimem stores of BytePack<?>.
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
:: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
: "memory");
}
#else
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
// nop
}
#endif
// Warp-uniform memory copy from shared address (not generic) to global memory.
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value
// is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes.
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
int nMiddleBytes = (nBytes-nFrontBytes) & -16;
int nBackBytes = (nBytes-nFrontBytes) % 16;
{ int backLane = WARP_SIZE-1 - lane;
bool hasFront = lane*EltSize < nFrontBytes;
bool hasBack = backLane*EltSize < nBackBytes;
int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
if (hasFront | hasBack) {
BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
// Can't use multimem_st since it doesn't support EltSize==2
st_global<EltSize>(dstAddr+offset, tmp);
}
}
srcAddr += nFrontBytes;
int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
srcAddr += -srcMisalign + lane*16;
dstAddr += nFrontBytes + lane*16;
nMiddleBytes -= lane*16;
#pragma unroll
for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
if (nMiddleBytes <= 0) break;
union {
BytePack<4> b4[4];
BytePack<16> b16;
};
b4[0] = ld_shared<4>(srcAddr + 0*4);
b4[1] = ld_shared<4>(srcAddr + 1*4);
b4[2] = ld_shared<4>(srcAddr + 2*4);
b4[3] = ld_shared<4>(srcAddr + 3*4);
if (srcMisalign != 0) {
BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
}
if (Multimem) multimem_st_global<16>(dstAddr, b16);
else st_global<16>(dstAddr, b16);
srcAddr += WARP_SIZE*16;
dstAddr += WARP_SIZE*16;
nMiddleBytes -= WARP_SIZE*16;
}
}
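To clarify the front/middle/back split in copyGlobalShared_WarpUnrolled, a small host-side computation that mirrors the same arithmetic on one example (illustrative only, not NCCL code):

#include <stdio.h>

int main(void) {
  long nBytes = 100;             // example: 100 bytes ahead, MaxBytes large enough
  long dstAddr = 0x1008;         // dst is 8 bytes past a 16-byte boundary
  long frontCap = (16 - dstAddr%16)%16;
  int nFrontBytes  = (int)(nBytes < frontCap ? nBytes : frontCap);  // misaligned head
  int nMiddleBytes = (int)((nBytes - nFrontBytes) & -16);           // 16B-aligned body
  int nBackBytes   = (int)((nBytes - nFrontBytes) % 16);            // tail remainder
  // Prints: front=8 middle=80 back=12, and 8+80+12 == 100
  printf("front=%d middle=%d back=%d\n", nFrontBytes, nMiddleBytes, nBackBytes);
  return 0;
}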
#endif

View File

@@ -9,6 +9,7 @@
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
+ #include "common_kernel.h"
#include "common.h"
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
@@ -20,12 +21,13 @@
 * to how that protocol operates with a consistent interface so that our
 * algorithm code can operate protocol parametrically.
 */
- template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL>
+ template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
struct ProtoSimple {
  static constexpr int Id = NCCL_PROTO_SIMPLE;
  static constexpr int SlicePerChunk = SlicePerChunk_1;
  static constexpr int StepPerSlice = StepPerSlice_1;
  static constexpr int Unroll = Unroll_1;
+ static constexpr bool NVLS = NVLS_1;
  // Data bytes (no flags etc) in one step of the fifo queue.
  __device__ static int calcBytePerStep() {

View File

@@ -255,18 +255,18 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
    }
    if (SRC) {
      data = dl.loadFinish();
-     if (SrcBuf == Input) data = MULTI<RedOp, T>().preOp(redOp, data);
+     if (SrcBuf == Input) data = applyPreOp(redOp, data);
    }
    if (RECV) {
-     data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
+     data = !SRC ? peerData : applyReduce(redOp, peerData, data);
      #pragma unroll MaxRecv
      for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
        peerData = readLLFinish(offset, line, i);
-       data = MULTI<RedOp,T>()(redOp, peerData, data);
+       data = applyReduce(redOp, peerData, data);
      }
    }
-   if (postOp) data = MULTI<RedOp, T>().postOp(redOp, data);
+   if (postOp) data = applyPostOp(redOp, data);
    // Send : inter-node, then intra-node, then local
    if (SEND) {

View File

@@ -82,7 +82,14 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
  }
  inline __device__ void postSend() {
-   if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; }
+   if (sendConnTailPtr) {
+     #if __CUDA_ARCH__ >= 900
+     __threadfence_system();
+     #else
+     __threadfence();
+     #endif
+     *sendConnTailPtr = sendConnTail += 1;
+   }
  }
template<int WordPerThread> template<int WordPerThread>
@@ -109,7 +116,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    // buffer into shmem.
    int misalignment = reinterpret_cast<uintptr_t>(src) % 16;
    uint64_t *src8 = reinterpret_cast<uint64_t*>(reinterpret_cast<uintptr_t>(src) & -uintptr_t(16));
-   uint64_t *shm8 = shmemCvtPtr(ncclShmem.ll128warp[warpInBlock]);
+   uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
    #pragma unroll
    for(int g=0; g < WordPerThread/2; g++)
      if((g*WARP_SIZE + wid)*16 < misalignment + eltN*sizeof(T))
@@ -153,7 +160,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    }
    // Write to dst if 16-byte aligned, shmem otherwise.
    int misalignment = reinterpret_cast<uintptr_t>(dst)%16;
-   uint64_t *shm8 = shmemCvtPtr(ncclShmem.ll128warp[warpInBlock]);
+   uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
    #pragma unroll
    for(int g=0; g < WordPerThread/2; g++) {
      int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8);
@@ -167,7 +174,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    __syncwarp();
    // Write rest from shmem to dst. No need to coalesce stores to 16-bytes,
    // the hardware keeps up fine.
-   T *shm = (T*)ncclShmem.ll128warp[warpInBlock];
+   T *shm = (T*)ncclScratchForWarp(warpInBlock);
    int skip = misalignment == 0 ? eltN & -EltPer16B : 0;
    for(int i=skip+wid; i < eltN; i += WARP_SIZE)
      dst[i] = shm[i];
@@ -196,6 +203,10 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
      }
      needReload &= (0 == checkAbort(spins, 0, 0));
    } while (__any_sync(WARP_MASK, needReload));
+   #pragma unroll
+   for (int u=0; u<ELEMS_PER_THREAD; u+=2)
+     load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
  }
/************* Finish register load **************/ /************* Finish register load **************/
@@ -206,9 +217,9 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    if (SrcBuf == Input) {
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-       v[u] = MULTI<RedOp, T>().preOp(redOp, v[u]);
+       v[u] = applyPreOp(redOp, v[u]);
        if (!flagThread)
-         v[u+1] = MULTI<RedOp, T>().preOp(redOp, v[u+1]);
+         v[u+1] = applyPreOp(redOp, v[u+1]);
      }
    }
  }
@@ -218,8 +229,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    { // Consume data from first recv
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-       v[u] = SRC ? MULTI<RedOp, T>()(redOp, vr[u], v[u]) : vr[u];
-       v[u+1] = SRC ? MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]) : vr[u+1];
+       v[u] = SRC ? applyReduce(redOp, vr[u], v[u]) : vr[u];
+       v[u+1] = SRC ? applyReduce(redOp, vr[u+1], v[u+1]) : vr[u+1];
      }
    }
@@ -238,20 +249,24 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
        needReload &= (0 == checkAbort(spins, i, 0));
      } while (__any_sync(WARP_MASK, needReload));
+     #pragma unroll
+     for (int u=0; u<ELEMS_PER_THREAD; u+=2)
+       load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-       v[u] = MULTI<RedOp, T>()(redOp, vr[u], v[u]);
-       v[u+1] = MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]);
+       v[u] = applyReduce(redOp, vr[u], v[u]);
+       v[u+1] = applyReduce(redOp, vr[u+1], v[u+1]);
      }
    }
  }
  /********************** End Recv ************************/
- if (postOp && !FuncTraits<RedOp>::IsPostOpIdentity) {
+ if (postOp) {
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-     v[u] = MULTI<RedOp, T>().postOp(redOp, v[u]);
-     v[u+1] = MULTI<RedOp, T>().postOp(redOp, v[u+1]);
+     v[u] = applyPostOp(redOp, v[u]);
+     v[u+1] = applyPostOp(redOp, v[u+1]);
    }
  }
@@ -282,14 +297,6 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
  __device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
    constexpr int SRC = SrcBuf != -1 ? 1 : 0;
    constexpr int DST = DstBuf != -1 ? 1 : 0;
-   static_assert(-1<=SrcBuf && SrcBuf < 2, "Uhoh");
-   static_assert(-1<=DstBuf && DstBuf < 2, "Uhoh");
-   static_assert(DstBuf!=Input, "Mistake?");
-   #if 0
-   assert((SrcBuf==-1) == (srcIx==-1));
-   assert((DstBuf==-1) == (dstIx==-1));
-   #endif
    T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
    T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
    int wireOffset = WireWordPerSlice*warp + 2*wid;

View File

@@ -5,9 +5,9 @@
 ************************************************************************/
template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
class Primitives<
-   T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
+   T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
  > {
  static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
  static constexpr int Input=0, Output=1;
@@ -22,8 +22,10 @@ class Primitives<
    SizesFifoEnabled = 0x100,
    DirectWrite = 0x200,
    DirectRead = 0x400,
-   ThreadsSynced = 0x800;
+   ThreadsSynced = 0x800,
+   NvlsMinPolling = 0x1000,
+   NvlsRecv = 0x2000;
- const int tid;
+ const int tid, tidInBlock;
  int nthreads;
  int nworkers;
  const int stepSize;
@@ -41,22 +43,54 @@ class Primitives<
  int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
  T *directBuff; // !(flags & SizesFifoEnabled)
  };
- uint64_t volatile *connStepPtr;
+ uint64_t *connStepPtr;
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
  // Don't use barrier 0 as it's used by the final sync
- inline __device__ void barrier() {
-   if (nthreads == WARP_SIZE)
-     __syncwarp();
-   else
-     asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
-   flags |= ThreadsSynced;
- }
- inline __device__ void subBarrier() {
-   if (nworkers == nthreads)
-     barrier();
-   else
-     asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
- }
+ __device__ void barrier() {
+   flags |= ThreadsSynced;
+   if (nthreads == WARP_SIZE) __syncwarp();
+   else {
+     int bar = 15-group;
+     asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory");
+   }
+ }
+ __device__ void subBarrier() {
+   if (nworkers == WARP_SIZE) __syncwarp();
+   else {
+     int bar = (nworkers==nthreads ? 15 : 8) - group;
+     asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory");
+   }
+ }
__device__ bool barrierAny(int vote) {
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE) {
return __any_sync(~0u, vote);
} else {
int ans, bar = 15-group;
asm volatile(
"{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" bar.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory");
return ans != 0;
}
}
__device__ bool subBarrierAny(int vote) {
if (nworkers == WARP_SIZE) {
return __any_sync(~0u, vote);
} else {
int ans, bar = (nworkers==nthreads ? 15 : 8) - group;
asm volatile(
"{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" bar.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory");
return ans != 0;
}
}
  inline __device__ bool checkAbort(int &spins) {
@@ -71,6 +105,19 @@ class Primitives<
    return flags & Aborted;
  }
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (NVLS && (flags & NvlsMinPolling)) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
// loads data using volatile so it doesn't see stale data in L1.
return ld_volatile_global(ptr);
}
  template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
  __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
    const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
@@ -80,7 +127,7 @@ class Primitives<
        ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
      int spins = 0;
      while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
-       connStepCache = *connStepPtr;
+       connStepCache = loadStepValue(connStepPtr);
        if (checkAbort(spins)) break;
        //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
      }
@@ -119,10 +166,11 @@ class Primitives<
  }
  template<int Recv, int Send>
- inline __device__ void postPeer() {
+ inline __device__ void postPeer(bool dataStored) {
    if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
      step += StepPerSlice;
-     *connStepPtr = step;
+     if (Send && (flags & RolePostSend) && dataStored) fence_acq_rel_sys();
+     st_relaxed_sys_global(connStepPtr, step);
    }
  }
@@ -166,7 +214,7 @@ class Primitives<
    // post();
    // } // Since we no longer unroll, new branch added here
    #if __CUDA_ARCH__ < 700
-   // Yeah, so all that above don't matter a lick on older hardware.
+   // Above doesn't matter on older hardware.
    #pragma unroll SlicePerChunk
    #else
    #pragma unroll 1
@@ -181,37 +229,39 @@ class Primitives<
      subBarrier();
      /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
       * to 0 to avoid unnecessary workload. */
-     size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
-     if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
+     int workSize = ncclShmem.aborted ? 0 : sliceSize;
+     if (NVLS && ncclShmem.groups[group].nvlsRecv) {
+       void* src = ncclShmem.groups[group].srcs[0];
+       void* dst = ncclShmem.groups[group].dsts[0];
+       copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
+           cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
+     } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
        // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
        if (Send) {
-         // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
-         ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
-           (tid, nworkers, nullptr, false,
-            1, (T const**)ncclShmem.groups[group].srcs,
-            fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
+         ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
+           (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
+            1, ncclShmem.groups[group].srcs,
+            fan.nsend(), ncclShmem.groups[group].dsts+1,
            workSize);
        }
      } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
        // For broadcast in CollNet to do empty send
-       ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
-         (tid, nworkers, ncclShmem.redOpArgs, postOp,
-          Recv, (T const**)ncclShmem.groups[group].srcs,
-          Dst, (T**)ncclShmem.groups[group].dsts,
+       ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
+         (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
+          Recv, ncclShmem.groups[group].srcs,
+          Dst, ncclShmem.groups[group].dsts,
          workSize);
      } else {
-       constexpr int PreOpN = SrcBuf != Input ? 0 :
+       constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
          DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
-       ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
-         (tid, nworkers, ncclShmem.redOpArgs, postOp,
-          Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
-          Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
+       ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
+         (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+          Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
+          Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
          workSize);
      }
      barrier(); // This barrier has a counterpart in following loop
-     if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
-     __syncwarp();
-     postPeer<Recv, Send>();
+     postPeer<Recv, Send>(0 < sliceSize);
      offset += sliceSize;
      slice += 1;
    } while (slice < SlicePerChunk && offset < nelem);
@@ -229,9 +279,7 @@ class Primitives<
        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
      }
      barrier(); // Has couterpart in preceding worker-only loop.
-     if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
-     __syncwarp();
-     postPeer<Recv, Send>();
+     postPeer<Recv, Send>(0 < sliceSize);
      offset += sliceSize;
      slice += 1;
    }
@@ -242,7 +290,7 @@ class Primitives<
  // shift: peer offset to avoid all ranks sending to or receiving from same peer
  template <int DirectRecv1, int DirectSend1, int Recv, int Send>
  __device__ __forceinline__ void
- ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
+ ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
    constexpr int DirectRecv = 1 && Direct && DirectRecv1;
    constexpr int DirectSend = 1 && Direct && DirectSend1;
    int offset = 0; // slice offset
@@ -252,12 +300,12 @@ class Primitives<
    #pragma unroll
    for (int slice=0; slice<SlicePerChunk; ++slice) {
      int realSize = max(0, min(dataSize, peerElem-offset));
+     bool fenceNeeded = false;
      if (tid < nworkers) {
        if (Send) {
          // Scatter pre-scales data of input buffer only in non-Direct case
-         constexpr int PreOpN = DirectSend ? 0 : 1;
+         constexpr int PreOpSrcs = DirectSend ? 0 : 1;
          if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
-         if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
          // realSize is not accurate here; but intra-node does not rely on sizes FIFO
          waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
          subBarrier();
@@ -265,23 +313,23 @@ class Primitives<
          // Loop over peers
          for (int j=0; j<fan.nsend(); j++) {
            int i = (j+shift)%fan.nsend();
-           int peerOffset = i*peerElem;
+           int pOffset = i*peerOffset;
            // Skip the data I am responsible of reducing myself
-           if (skip >= 0 && i >= skip) peerOffset += peerElem;
-           const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
-           int realPeerSize = min(realSize, totalElem-peerOffset);
+           if (skip >= 0 && i >= skip) pOffset += peerElem;
+           void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
+           int realPeerSize = min(realSize, totalElem-pOffset);
            if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
-             ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
+             ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
              // Mark for threadfence at the end
-             if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
+             fenceNeeded |= true;
            }
          }
        } else if (Recv) {
          if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
-         int peerOffset = index*peerElem;
-         if (skip >= 0 && index >= skip) peerOffset += peerElem;
+         int pOffset = index*peerOffset;
+         if (skip >= 0 && index >= skip) pOffset += peerElem;
          // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
-         waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
+         waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
          subBarrier();
          if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
            // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
@@ -290,21 +338,17 @@ class Primitives<
          #pragma unroll
          for (int j=0; j<fan.nrecv(); j++) {
            int i = (j+shift)%fan.nrecv();
-           peerOffset = i*peerElem;
-           if (skip >= 0 && i >= skip) peerOffset += peerElem;
-           T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
-           int realPeerSize = min(realSize, totalElem-peerOffset);
-           if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
+           pOffset = i*peerOffset;
+           if (skip >= 0 && i >= skip) pOffset += peerElem;
+           void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
+           int realPeerSize = min(realSize, totalElem-pOffset);
+           if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
          }
        }
      }
    }
-   barrier();
-   // If we indeed send something, threadfence
-   if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
-     __threadfence_system();
-   __syncwarp();
-   postPeer<Recv, Send>();
+   fenceNeeded = barrierAny(fenceNeeded);
+   postPeer<Recv, Send>(fenceNeeded);
    offset += realSize;
  }
}
@@ -320,25 +364,33 @@ class Primitives<
    }
    if (flags & RoleWaitRecv) {
      ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
if ((index == 0) && (flags & RoleWaitRecv)) {
if (conn->flags & NCCL_NVLS_MIN_POLL) {
flags |= NvlsMinPolling;
ncclShmem.groups[group].nvlsRecv = 1;
} else {
ncclShmem.groups[group].nvlsRecv = 0;
}
}
      connStepPtr = conn->tail;
-     connStepCache = *connStepPtr;
+     connStepCache = loadStepValue(connStepPtr);
      flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
      if (Direct) {
        // User buffers have been registered
-       if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
+       if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                     (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
          }
-       } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
+       } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            // direct read not allowed in non-register case
            // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-           flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
+           flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
          }
        }
      }
@@ -359,8 +411,9 @@ class Primitives<
    }
    if (flags & RoleWaitSend) {
      ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
+     flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
      connStepPtr = conn->head;
-     connStepCache = *connStepPtr;
+     connStepCache = loadStepValue(connStepPtr);
      flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
      if (flags & OffsFifoEnabled)
        connOffsFifoPtr = conn->offsFifo;
@@ -371,20 +424,20 @@ class Primitives<
        connSizesFifoPtr = conn->sizesFifo;
      } else if (Direct) {
        // User buffers have been registered
-       if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
+       if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                     (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
          }
-       } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
+       } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            // direct read not allowed in non-register case
            // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-           flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
+           flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
          }
        }
      }
@ -397,7 +450,7 @@ class Primitives<
int tid, int nthreads, int const *recvPeers, int const *sendPeers, int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
): ):
tid(tid), tid(tid), tidInBlock(threadIdx.x),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy // For send operations, we need an extra warp to overlap the threadfence and the copy
@ -412,7 +465,7 @@ class Primitives<
this->fan = Fan(nrecv, nsend); this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync = 8; constexpr int ThreadPerSync = 8;
static_assert(MaxSend < ThreadPerSync && MaxRecv < ThreadPerSync, "Not enough threads to cover all peers"); static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
int g = tid / ThreadPerSync; int g = tid / ThreadPerSync;
int ng = nthreads / ThreadPerSync; int ng = nthreads / ThreadPerSync;
@ -566,6 +619,9 @@ class Primitives<
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp); genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
} }
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp); genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
} }
@ -596,20 +652,20 @@ class Primitives<
} }
__device__ __forceinline__ void __device__ __forceinline__ void
scatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
} }
__device__ __forceinline__ void __device__ __forceinline__ void
directScatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
} }
__device__ __forceinline__ void __device__ __forceinline__ void
gather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp=false) { gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp); ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
} }
__device__ __forceinline__ void __device__ __forceinline__ void
directGather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift) { directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, /*postOp=*/false); ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
} }
}; };

File diff suppressed because it is too large

View File

@ -87,3 +87,45 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
runRing<T, RedOp, ProtoLL128>(args); runRing<T, RedOp, ProtoLL128>(args);
} }
}; };
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsScatter = 128 + WARP_SIZE;
const int nThreadsReduce = 384;
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce through MC
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem);
}
}
}
};
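The NVLS reduce-scatter path above splits the thread block into a scatter group (128 + WARP_SIZE threads pushing each head's slice of the send buffer to the per-peer unicast connections in nvls->up) and a reduce group (384 threads pulling its own chunk back through the multicast connection nvls->down, where the switch performs the reduction). The standalone host-side sketch below, with made-up sizes, only illustrates which input region each head is handed per channel iteration given that the scatter call passes peerOffset == size; it is not NCCL code.

#include <algorithm>
#include <cstdio>

int main() {
  const long long size = 1 << 20;   // elements per rank (hypothetical)
  const int nHeads = 4;             // NVLS heads (hypothetical)
  const int nChannels = 2, bid = 0; // look at channel 0 only
  const long long chunkSize = 131072;
  const long long loopSize = (long long)nChannels * chunkSize;
  for (long long gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    long long offset = gridOffset + bid * chunkSize;
    long long nelem = std::min(chunkSize, size - offset);
    for (int head = 0; head < nHeads; head++) {
      // peerOffset == size, so head j is handed rank j's slice of the send buffer
      printf("head %d <- input [%lld, %lld)\n", head, head * size + offset, head * size + offset + nelem);
    }
  }
  return 0;
}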

View File

@ -13,12 +13,13 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto> template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32); void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32); ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
if (args->peer == ncclShmem.comm.rank) { if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1; struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) { if (buff != recvBuff) {
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count); ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
} }
} else { } else {
int chunkSize = args->chunkSize/sizeof(T); int chunkSize = args->chunkSize/sizeof(T);

View File

@ -74,6 +74,8 @@ void ncclDebugInit() {
mask = NCCL_ALLOC; mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) { } else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL; mask = NCCL_CALL;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) { } else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL; mask = NCCL_ALL;
} }

View File

@ -32,7 +32,8 @@ struct ncclKernelMatch {
NCCL_FUNC5(func, TREE, devredop, type, specialized), \ NCCL_FUNC5(func, TREE, devredop, type, specialized), \
NCCL_FUNC5(func, RING, devredop, type, specialized), \ NCCL_FUNC5(func, RING, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \ NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized) NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \
NCCL_FUNC5(func, NVLS, devredop, type, specialized)
#ifdef __CUDA_BF16_TYPES_EXIST__ #ifdef __CUDA_BF16_TYPES_EXIST__
#define HAVE_BFLOAT16 1 #define HAVE_BFLOAT16 1
@ -90,34 +91,48 @@ static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNum
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
// Determine the maximum kernel stack size of all CUDA kernels NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
size_t ncclKernMaxLocalSize() {
ncclResult_t res = ncclSuccess; // Returns maximum kernel stack size of all CUDA kernels
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]); ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
cudaFuncAttributes attr = {0}; constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
size_t max = 0; ncclResult_t result = ncclSuccess;
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i].kernelFn), res, error); if (maxStackSize) *maxStackSize = 0;
if (attr.localSizeBytes > max) max = attr.localSizeBytes; int carveout = ncclParamL1SharedMemoryCarveout();
// Keep track if we already visited a function pointer.
void* lru[2] = {nullptr, nullptr};
for (int i=0; i < KernelCount; i++) {
void* fn = ncclKerns[i].kernelFn;
if (fn == lru[0] || fn == lru[1]) goto next_kernel;
lru[1] = lru[0];
lru[0] = fn;
if (maxStackSize) {
cudaFuncAttributes attr = {0};
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
ignore0:;
}
if (carveout) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
result, ignore1);
ignore1:;
}
if (ncclShmemDynamicSize(cudaArch) != 0) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
result, next_kernel);
}
next_kernel:;
} }
return result;
error:
return (res != ncclSuccess) ? 0 : max;
} }
// Set shared memory carveout for the nccl kernels
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i].kernelFn, cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error);
}
error:
return res;
}
/*****************************************************************************/ /*****************************************************************************/
/* Launch system : synchronization and CUDA kernel launch */ /* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/ /*****************************************************************************/
@ -248,10 +263,9 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
static ncclResult_t addCollToPlan( static ncclResult_t addCollToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex, struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp, struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[] int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
) { ) {
struct ncclKernelPlan::Channel *chans = plan->channels; struct ncclKernelPlan::Channel *chans = plan->channels;
int nCollChannels = comm->nChannels;
// Choose the `nBid` least loaded channels to do the work. This ensures // Choose the `nBid` least loaded channels to do the work. This ensures
// all bids go to different channels in case they need to synchronize. // all bids go to different channels in case they need to synchronize.
@ -268,9 +282,7 @@ static ncclResult_t addCollToPlan(
} }
} }
// Sort in the rest of the channels. If a channel has less work than the max // Sort in the rest of the channels. If a channel has less work than the max
// member of least[], replace that member and compute the new max. The optimal // member of least[], replace that member and compute the new max.
// algorithm uses a max-heap, but for our small sizes I suspect the better
// asymptotic complexity would be swamped by the increased instruction complexity.
for (int c=nBid; c < nCollChannels; c++) { for (int c=nBid; c < nCollChannels; c++) {
if (chans[c].collBytes < maxBytesInLeast) { if (chans[c].collBytes < maxBytesInLeast) {
least[maxIndexInLeast] = c; least[maxIndexInLeast] = c;
@ -541,8 +553,9 @@ static ncclResult_t scheduleCollTasksToPlan(
info.sliceSteps = head->sliceSteps; info.sliceSteps = head->sliceSteps;
NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks)); NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
if (nAggOps > 1) { if (nAggOps > 1) {
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]); info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
info.nChannels = std::max(1, std::min(info.nChannels, comm->nChannels)); info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
info.algorithm = aggInfo.algorithm; info.algorithm = aggInfo.algorithm;
info.protocol = aggInfo.protocol; info.protocol = aggInfo.protocol;
info.nThreads = aggInfo.nThreads; info.nThreads = aggInfo.nThreads;
@ -565,8 +578,9 @@ static ncclResult_t scheduleCollTasksToPlan(
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv)); NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
} }
int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
tasks->nTasksColl -= 1; tasks->nTasksColl -= 1;
tasks->collBytesTotal -= info.nBytes; tasks->collBytesTotal -= info.nBytes;
ncclIntruQueueDequeue(&tasks->collQueue); ncclIntruQueueDequeue(&tasks->collQueue);
@ -856,7 +870,7 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) {
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_;
ncclResult_t result = hostStreamPlanTask(plan->comm, plan); ncclResult_t result = hostStreamPlanTask(plan->comm, plan);
if (result != ncclSuccess) { if (result != ncclSuccess) {
WARN("hostStreamPlanCallback() failed : %s\n", ncclGetErrorString(result)); WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
} }
} }
@ -964,7 +978,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
} }
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure); NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);
if (persistent || comm->persistentRefs != 0) { if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
// We have to launch host tasks to push proxy args. We are careful to only // We have to launch host tasks to push proxy args. We are careful to only
// do this if necessary since host tasks impose a high performance cost in CUDA. // do this if necessary since host tasks impose a high performance cost in CUDA.
bool acquired = false; bool acquired = false;
@ -1005,12 +1019,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
return ncclSuccess; return ncclSuccess;
} }
#if CUDART_VERSION >= 11080
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
#define NCCL_CGA_CLUSTER_SIZE_SM90 4
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2);
#endif
#if CUDART_VERSION >= 12000 #if CUDART_VERSION >= 12000
// NCCL uses the "Remote" Mem Sync domain by default // NCCL uses the "Remote" Mem Sync domain by default
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
@ -1022,6 +1030,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
cudaStream_t launchStream = tasks->streams->stream; cudaStream_t launchStream = tasks->streams->stream;
dim3 grid = {(unsigned)plan->channelCount, 1, 1}; dim3 grid = {(unsigned)plan->channelCount, 1, 1};
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
size_t smem = ncclShmemDynamicSize(comm->cudaArch);
void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};
#if CUDART_VERSION >= 11080 #if CUDART_VERSION >= 11080
@ -1029,19 +1038,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) { if (driverVersion >= 11080) {
int compCap = comm->compCap; int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0; unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;
if (ncclParamCGAClusterSize() != -2) {
clusterSize = ncclParamCGAClusterSize();
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
static bool warned = false;
if (warned == false) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
warned = true;
}
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
}
cudaLaunchConfig_t launchConfig = {0}; cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[3]; cudaLaunchAttribute launchAttrs[3];
@ -1073,6 +1070,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
#endif #endif
launchConfig.gridDim = grid; launchConfig.gridDim = grid;
launchConfig.blockDim = block; launchConfig.blockDim = block;
launchConfig.dynamicSmemBytes = smem;
launchConfig.attrs = launchAttrs; launchConfig.attrs = launchAttrs;
launchConfig.numAttrs = attrs; launchConfig.numAttrs = attrs;
launchConfig.stream = launchStream; launchConfig.stream = launchStream;
@ -1082,12 +1080,12 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
} }
#endif #endif
// Standard kernel launch // Standard kernel launch
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream)); CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream));
return ncclSuccess; return ncclSuccess;
} }
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
if (comm->persistentRefs == 0) { // implies !plan->persistent if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) {
// If this isn't being captured and there aren't any CUDA graphs alive // If this isn't being captured and there aren't any CUDA graphs alive
// then we don't need to do our proxyOp pushing on the host stream. // then we don't need to do our proxyOp pushing on the host stream.
NCCLCHECK(hostStreamPlanTask(comm, plan)); NCCLCHECK(hostStreamPlanTask(comm, plan));
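For illustration, a standalone sketch (not NCCL code) of the CUDA 11.8+ extended launch path that the code above configures: a thread-block cluster (CGA) dimension attribute plus dynamic shared memory passed through cudaLaunchConfig_t. The kernel, grid sizes and cluster value are made up; cluster launch only succeeds on sm_90 hardware, and error handling is minimal.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummyKernel(int* out) { if (threadIdx.x == 0) out[blockIdx.x] = blockIdx.x; }

int main() {
  int* out = nullptr;
  cudaMalloc(&out, 8 * sizeof(int));
  void* args[] = { &out };

  cudaLaunchConfig_t config = {};
  config.gridDim = dim3(8, 1, 1);
  config.blockDim = dim3(128, 1, 1);
  config.dynamicSmemBytes = 0;     // NCCL passes ncclShmemDynamicSize(cudaArch) here
  config.stream = 0;

  cudaLaunchAttribute attrs[1] = {};
  attrs[0].id = cudaLaunchAttributeClusterDimension;  // requires sm_90
  attrs[0].val.clusterDim.x = 4;                      // e.g. a CGA cluster of 4 blocks
  attrs[0].val.clusterDim.y = 1;
  attrs[0].val.clusterDim.z = 1;
  config.attrs = attrs;
  config.numAttrs = 1;

  cudaError_t err = cudaLaunchKernelExC(&config, (const void*)dummyKernel, args);
  printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}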
@ -1161,6 +1159,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
int nAlgos = NCCL_NUM_ALGORITHMS; int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) { for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue; if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time; float time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time)); NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
@ -1193,6 +1193,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
} }
ncSwitch /= 2; ncSwitch /= 2;
} }
} else if (info->algorithm == NCCL_ALGO_NVLS) {
// NVLS should not need more than 16 channels to get peak BW.
nc = comm->nvlsChannels;
} else { } else {
// Ring/Tree channel tuning // Ring/Tree channel tuning
while (info->nBytes < nc*nt*threadThreshold) { while (info->nBytes < nc*nt*threadThreshold) {
@ -1207,6 +1210,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
} }
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
info->nChannels = nc; info->nChannels = nc;
@ -1225,6 +1229,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
info->pattern = ncclPatternRing; break; info->pattern = ncclPatternRing; break;
case ncclFuncAllReduce: case ncclFuncAllReduce:
info->pattern = info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@ -1244,6 +1249,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineFrom: case ncclPatternPipelineFrom:
case ncclPatternPipelineTo: case ncclPatternPipelineTo:
case ncclPatternCollnetChain: case ncclPatternCollnetChain:
case ncclPatternNvls:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternCollnetDirect: case ncclPatternCollnetDirect:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break; info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
@ -1319,6 +1325,14 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
if (chunkSize > 131072) chunkSize = 131072;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) { } else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
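Condensed into a standalone helper (a sketch with hypothetical channel and head counts, not NCCL code), the NVLS branch above shrinks the chunk size when the message is small relative to the number of concurrent per-head operations:

#include <cstdint>
#include <cstdio>

static int64_t nvlsChunkSize(int64_t nBytes, int nChannels, int nHeads) {
  int64_t chunkSize = 131072;                              // start at 128K, as above
  uint64_t concurrentOps = (uint64_t)nChannels * nHeads;   // nChannels * nvls.nHeads
  if ((uint64_t)nBytes < 32 * (concurrentOps * chunkSize) && chunkSize > 65536) chunkSize = 65536;
  if ((uint64_t)nBytes <  8 * (concurrentOps * chunkSize) && chunkSize > 32768) chunkSize = 32768;
  if ((uint64_t)nBytes <  2 * (concurrentOps * chunkSize) && chunkSize > 16384) chunkSize = 16384;
  return chunkSize;
}

int main() {
  for (long long bytes : {1LL<<20, 16LL<<20, 256LL<<20})
    printf("%lld bytes -> chunk %lld bytes\n", bytes, (long long)nvlsChunkSize(bytes, 16, 4));
  return 0;
}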
@ -1618,6 +1632,11 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
WARN("ncclRedOpDestroy : operator is garbage."); WARN("ncclRedOpDestroy : operator is garbage.");
return ncclInvalidArgument; return ncclInvalidArgument;
} }
if (comm == NULL) {
WARN("ncclRedOpDestroy : invalid communicator passed.");
return ncclInvalidArgument;
}
int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) { if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) {
WARN("ncclRedOpDestroy : operator unknown to this communicator."); WARN("ncclRedOpDestroy : operator unknown to this communicator.");

View File

@ -313,8 +313,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them. // We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, ncclMinNchannels(), ringPrev, ringNext); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->minCTAs), ringPrev, ringNext);
// Create rings array and check all is fine // Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));

View File

@ -461,7 +461,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
type = node->type; type = node->type;
} }
if (type != GPU) { if (type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev); WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return ncclInternalError; return ncclInternalError;
} }
*intermediateRank = node->gpu.rank; *intermediateRank = node->gpu.rank;
@ -707,6 +707,7 @@ static int nextPow2(int v) {
} }
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
int minChannels = comm->p2pnChannels; int minChannels = comm->p2pnChannels;
@ -734,7 +735,6 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror; comm->p2pChannels[c] = mirror;
} }
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
return ncclSuccess; return ncclSuccess;
} }

View File

@ -765,7 +765,6 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
// SPLIT_TREE works better on older archs.
int ccMin; int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));

View File

@ -815,6 +815,6 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("Could not find local GPU with rank %d\n", rank); WARN("Could not find local GPU with rank %d", rank);
return ncclInternalError; return ncclInternalError;
} }

View File

@ -53,7 +53,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s // Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }}; static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }, { 0, 0, 40.0 }};
// NVLink, PCI, Network // NVLink, PCI, Network
#define NCCL_HW_NVLINK 0 #define NCCL_HW_NVLINK 0
@ -63,13 +63,16 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */ { /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
/* NVLS */ { 0, 0, 0 } },
/* PCI */ /* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
/* NVLS */ { 0, 0, 0 } },
/* NET */ /* NET */
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 },
/* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 } } /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 },
/* NVLS */ { 0, 0, 0 } }
}; };
/* Array indexes used below */ /* Array indexes used below */
@ -78,7 +81,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
#define HOPPER_COMPCAP_IDX 2 #define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel // LL128 max BW per channel
static const double ll128MaxBwPerCh = 20.0; static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 };
static const double llMaxBws[3][3] = { static const double llMaxBws[3][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
@ -88,7 +91,7 @@ static const double llMaxBws[3][3] = {
static const double perChMaxTreeBws[3][3] = { static const double perChMaxTreeBws[3][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
}; };
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
@ -98,7 +101,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@ -108,7 +112,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nRanks = comm->nRanks; int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess; if (nRanks <= 1) return ncclSuccess;
int compCapIndex = (minCompCap == 80 && maxCompCap == 80) ? AMPERE_COMPCAP_IDX : ((minCompCap == 90 && maxCompCap == 90) ? HOPPER_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
int cpuArch, cpuVendor, cpuModel; int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes-1 : 2; int index2 = nNodes <= 2 ? nNodes-1 : 2;
@ -120,7 +124,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph }; struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET; for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@ -134,20 +138,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
nNodes; nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue; if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = graphs[a]->nChannels * bw; float busBw = graphs[a]->nChannels * bw;
// Various model refinements // Various model refinements
if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f); if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels); if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@ -159,7 +168,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75; if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
// Convert bus BW to algorithm BW // Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps; float ratio;
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS) ratio = .75;
else ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio; comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p]; comm->latencies[coll][a][p] = baseLat[a][p];
@ -195,7 +207,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides. // Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases. // All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 }; int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO"); const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) { if (protoStr) {
@ -207,6 +219,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr); INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
} }
// Disable NVLink SHARP if not supported
if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
// Disable CollNet if it is not supported // Disable CollNet if it is not supported
if (comm->collNetSupport == 0) { if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
@ -228,7 +244,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (pEnable == 2 && p == NCCL_PROTO_LL128) { if (pEnable == 2 && p == NCCL_PROTO_LL128) {
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption. // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
pEnable = 1; pEnable = 1;
pEnable &= (graphs[a]->typeInter <= PATH_PXB); pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
pEnable &= (graphs[a]->typeIntra <= PATH_NVL); pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
pEnable &= (minCompCap == maxCompCap); pEnable &= (minCompCap == maxCompCap);
switch (minCompCap) { switch (minCompCap) {
@ -239,8 +255,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
} }
} }
if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
// Only disable algo for Allreduce since others only have one // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
} }
if (comm->rank == 0) { if (comm->rank == 0) {
@ -284,9 +301,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
char* str = getenv("NCCL_THREAD_THRESHOLDS"); char* str = getenv("NCCL_THREAD_THRESHOLDS");
if (str) { if (str) {
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }}; ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { for (int a=0; a<2; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p]; if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
} }
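For reference, the environment variable parsed here carries at most six numbers: the LL, LL128 and Simple thresholds for Tree followed by the same three for Ring, e.g. NCCL_THREAD_THRESHOLDS="64 64 512 64 64 512" (illustrative values). Negative entries leave the built-in defaults untouched, which is why the array is now sized for the two parsed algorithms instead of all of NCCL_NUM_ALGORITHMS.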
@ -323,7 +340,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) {
lat *= info->comm->minCompCap < 90 ? 1.9 : 1.5; // Plateau effect of ring
}
// Tree pipelining saves latency in aggregation cases // Tree pipelining saves latency in aggregation cases
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
*time = lat * latCount + (info->nBytes) / (1000 * bw); *time = lat * latCount + (info->nBytes) / (1000 * bw);
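As a worked example of the time model on the last line (latencies are in microseconds and bandwidths in GB/s, so nBytes/(1000*bw) is also microseconds): with numPipeOps = 1, a hypothetical lat of 10 us and bw of 100 GB/s, a 67,108,864-byte operation is estimated at 10 + 67108864/(1000*100) ≈ 681 us. The change above only softens the multi-node ring/Simple plateau penalty from 1.9x to 1.5x when every GPU is Hopper (minCompCap >= 90).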

View File

@ -315,7 +315,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
ret = ncclSystemError; ret = ncclSystemError;
} }
job->state = ncclGroupJobJoined; job->state = ncclGroupJobJoined;
if (job->result != ncclSuccess) { if (job->result != ncclSuccess && ret == ncclSuccess) {
ret = job->result; ret = job->result;
errorJobAbortFlag = true; errorJobAbortFlag = true;
} }
@ -326,7 +326,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
if (*groupAbortFlag == true || errorJobAbortFlag == true) { if (*groupAbortFlag == true || errorJobAbortFlag == true) {
*job->abortFlag = 1; *job->abortFlag = 1;
ret = ncclInternalError;
} }
job = job->next; job = job->next;

View File

@ -25,6 +25,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState); ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState); ncclResult_t bootstrapAbort(void* commState);
#endif #endif

View File

@ -53,7 +53,8 @@ struct ncclDevRedOpFull {
DECL4(func, RING, devredop, type, undef) \ DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \ DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef)
#if defined(__CUDA_BF16_TYPES_EXIST__) #if defined(__CUDA_BF16_TYPES_EXIST__)
#define DECL2(func, devredop, undefForFloat) \ #define DECL2(func, devredop, undefForFloat) \
@ -121,4 +122,13 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
#define REDUCE_CHUNKSTEPS 1 #define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
// macro will be used in preprocessor conditionals where enums have no meaning.
#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
(((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
(type==7 && red==0) || \
(type==8 && red==0))
#endif #endif
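Decoded (an illustration, not NCCL code), and assuming the usual nccl.h numbering — datatypes 2/3 = int32/uint32, 4/5 = int64/uint64, 6 = half, 7 = float, 8 = double, 9 = bfloat16; device reduction ops 0 = sum, 2 = max, 3 = min — the macro says NVLS offload covers sum/max/min for integer, half and bfloat16 data, but only sum for float and double. A standalone restatement with a few compile-time checks:

// Hypothetical restatement of NCCL_NVLS_SUPPORTS, for illustration only.
#define NVLS_SUPPORTS_DEMO(type, red) \
  ((((type)==2 || (type)==3 || (type)==4 || (type)==5 || (type)==6 || (type)==9) && \
    ((red)==0 || (red)==2 || (red)==3)) || \
   (((type)==7 || (type)==8) && (red)==0))

static_assert(NVLS_SUPPORTS_DEMO(7, 0), "float + sum is offloadable");
static_assert(!NVLS_SUPPORTS_DEMO(7, 2), "float + max is not");
static_assert(!NVLS_SUPPORTS_DEMO(1, 0), "uint8 is never offloadable");
int main() { return 0; }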

View File

@ -104,6 +104,7 @@ struct ncclChannel {
struct ncclTree tree; struct ncclTree tree;
struct ncclTree collnetChain; struct ncclTree collnetChain;
struct ncclDirect collnetDirect; struct ncclDirect collnetDirect;
struct ncclNvls nvls;
int id; // index of this channel int id; // index of this channel
uint32_t workFifoSent; // last used work index+1 uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount; uint64_t p2pOpCount;
@ -177,8 +178,10 @@ struct ncclComm {
int nRanks; // number of GPUs in communicator int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index int cudaDev; // my cuda device index
int compCap; // compute capability of the GPU int compCap; // compute capability of the GPU
int minCompCap; // min compute capability in the communicator
int64_t busId; // my PCI bus ID in int format int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU cpu_set_t cpuAffinity; // CPU affinity of the GPU
int cudaArch; // matches __CUDA_ARCH__ of device
int node; int node;
int nNodes; int nNodes;
@ -201,6 +204,7 @@ struct ncclComm {
// Channels for collectives // Channels for collectives
int nChannels; int nChannels;
int nvlsChannels;
// Channels (per peer) for p2p // Channels (per peer) for p2p
int p2pnChannels; int p2pnChannels;
int p2pnChannelsPerPeer; int p2pnChannelsPerPeer;
@ -257,6 +261,10 @@ struct ncclComm {
int collNetSupport; int collNetSupport;
int intraHighestTransportType; int intraHighestTransportType;
// NVLink SHARP (NVLS) support
int nvlsSupport;
void* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions size_t channelSize; // User requested work size (bytes) for channel partitions
// Internal streams // Internal streams
@ -288,6 +296,11 @@ struct ncclComm {
// communicator mode // communicator mode
int blocking; int blocking;
// CGA cluster size
int cgaClusterSize;
int minCTAs, maxCTAs;
// network interface name
char *netName;
// initState is to more conveniently reclaim resources when errors happen. // initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState; ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called // flag to indicate if ncclCommFinalize() is called
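The new comm fields above (cgaClusterSize, minCTAs, maxCTAs, netName) back the per-communicator configuration options. A hedged usage sketch, assuming the public ncclConfig_t carries fields of the same names in the 2.17 nccl.h (verify against the installed header), with made-up values:

#include <nccl.h>

ncclResult_t initWithConfig(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.blocking = 1;
  config.cgaClusterSize = 4;   // CGA cluster size used at kernel launch (sm_90)
  config.minCTAs = 4;          // lower bound on channels, see comm->minCTAs above
  config.maxCTAs = 16;         // upper bound on channels, see comm->maxCTAs above
  config.netName = "IB";       // select a network by name (hypothetical value)
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}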

View File

@ -73,10 +73,32 @@ DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070 #if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif #endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
#endif
#endif #endif
/* CUDA Driver functions loaded with dlsym() */ /* CUDA Driver functions loaded with dlsym() */
@ -88,6 +110,7 @@ DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
ncclResult_t ncclCudaLibraryInit(void); ncclResult_t ncclCudaLibraryInit(void);
extern int ncclCudaDriverVersionCache; extern int ncclCudaDriverVersionCache;
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
inline ncclResult_t ncclCudaDriverVersion(int* driver) { inline ncclResult_t ncclCudaDriverVersion(int* driver) {
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
@ -98,5 +121,4 @@ inline ncclResult_t ncclCudaDriverVersion(int* driver) {
*driver = version; *driver = version;
return ncclSuccess; return ncclSuccess;
} }
#endif #endif
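A heavily hedged sketch (not NCCL's transport/nvls.cc) of the single-node flow the cuMulticast* entry points above enable, following the CUDA 12.1 driver API as documented: create a multicast object, have each GPU join and bind its local backing memory, then map the multicast handle so that loads/stores through the mapped pointer are fanned out and reduced by the NVSwitch. Initialization, inter-rank handle exchange (cuMemExportToShareableHandle / cuMemImportFromShareableHandle), error paths and cleanup are omitted; verify every signature against cuda.h before relying on this.

#include <cuda.h>
#include <cstddef>

CUresult nvlsSketch(CUdevice dev, int nDevices, size_t size,
                    CUmemGenericAllocationHandle localMem, CUdeviceptr* mcPtr) {
  CUmulticastObjectProp prop = {};
  prop.numDevices = nDevices;
  prop.size = size;
  prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  size_t gran = 0;
  CUresult res = cuMulticastGetGranularity(&gran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED);
  if (res != CUDA_SUCCESS) return res;

  // One rank creates the multicast object; the others would import the shared handle.
  CUmemGenericAllocationHandle mcHandle;
  res = cuMulticastCreate(&mcHandle, &prop);
  if (res != CUDA_SUCCESS) return res;

  // Every participating GPU joins the group and binds its local physical memory.
  res = cuMulticastAddDevice(mcHandle, dev);
  if (res != CUDA_SUCCESS) return res;
  res = cuMulticastBindMem(mcHandle, /*mcOffset=*/0, localMem, /*memOffset=*/0, size, /*flags=*/0);
  if (res != CUDA_SUCCESS) return res;

  // Map the multicast handle into the VA space and grant access.
  res = cuMemAddressReserve(mcPtr, size, gran, 0, 0);
  if (res != CUDA_SUCCESS) return res;
  res = cuMemMap(*mcPtr, size, 0, mcHandle, 0);
  if (res != CUDA_SUCCESS) return res;

  CUmemAccessDesc access = {};
  access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  access.location.id = dev;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  return cuMemSetAccess(*mcPtr, size, &access, 1);
}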

View File

@ -15,11 +15,12 @@
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet* #define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0 #define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1 #define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@ -78,6 +79,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_DIRECT_NIC 0x04 #define NCCL_DIRECT_NIC 0x04
#define NCCL_IPC_WRITE 0x08 #define NCCL_IPC_WRITE 0x08
#define NCCL_IPC_READ 0x10 #define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
struct ncclConnInfo { struct ncclConnInfo {
// Regular comm mechanism // Regular comm mechanism
@ -85,7 +87,7 @@ struct ncclConnInfo {
uint64_t *tail; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv uint64_t *head; // Local for send, remote for recv
int direct; // Direct communication int flags; // Direct communication / other flags
int shared; // Buffers are shared int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication void **ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
@ -138,13 +140,22 @@ struct ncclTree {
struct ncclDirect { struct ncclDirect {
int depth; int depth;
int out; int out;
int nHeads; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int shift; int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[NCCL_MAX_DIRECT_ARITY]; int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY]; int down[NCCL_MAX_DIRECT_ARITY];
}; };
#define NCCL_MAX_NVLS_ARITY 8
struct ncclNvls {
int out;
int nHeads; // Number of N<->1<->net operations done in parallel; size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank; -1 if I'm not a head rank (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
};
#define NCCL_MAX_CONNS 2 #define NCCL_MAX_CONNS 2
struct ncclChannelPeer { struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector send[NCCL_MAX_CONNS];
@ -264,6 +275,7 @@ struct alignas(16) ncclDevChannel {
struct ncclTree tree; struct ncclTree tree;
struct ncclTree collnetChain; struct ncclTree collnetChain;
struct ncclDirect collnetDirect; struct ncclDirect collnetDirect;
struct ncclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
}; };
@ -288,4 +300,65 @@ struct alignas(16) ncclDevCommAndChannels {
struct ncclDevChannel channels[MAXCHANNELS]; struct ncclDevChannel channels[MAXCHANNELS];
}; };
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template<typename T>
__host__ __device__ constexpr T max_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
}
// Note that all unroll value logic should depend on the cudaArch argument rather
// than __CUDA_ARCH__, since this logic must also run on the host, where the arch
// value is only known at runtime. By defaulting to NCCL_CUDA_ARCH, device-side
// code can omit the arch argument for brevity.
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
}
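To make the arithmetic concrete, the following host-side sketch re-derives the NVLS unroll values from the constants above (64 in-flight bytes, at most 16 instructions). calcUnroll is an illustrative stand-in for ncclCalcUnroll, not part of the diff.

#include <algorithm>
#include <cstdio>

// Stand-in for ncclCalcUnroll(): unroll = min(insns, ceil(bytes / bytePerPack)).
constexpr int calcUnroll(int bytePerPack, int insns, int bytes) {
  return std::min(insns, (bytes + bytePerPack - 1) / bytePerPack);
}

static_assert(calcUnroll(16, 16, 64) == 4,  "16B packs: 4 loads keep 64B in flight");
static_assert(calcUnroll( 8, 16, 64) == 8,  "8B packs: 8 loads");
static_assert(calcUnroll( 4, 16, 64) == 16, "4B packs hit the 16-instruction cap");

int main() {
  std::printf("NVLS unroll for 16B packs: %d\n", calcUnroll(16, 16, 64));
  return 0;
}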
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
) + 15) & -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}
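As a rough worked example with assumed constants that are not shown in this diff (WARP_SIZE = 32, NCCL_LL128_SHMEM_ELEMS_PER_THREAD = 8, NCCL_MAX_NTHREADS = 640), the per-warp scratch on sm_90 is dominated by the SIMPLE term:

#include <algorithm>
#include <cstdio>

constexpr int kWarp    = 32;                    // assumed WARP_SIZE
constexpr int kLL128   = 8 * kWarp * 8;         // LL128 term: 2048 B
constexpr int kSimple  = (8 * kWarp + 1) * 16;  // SIMPLE term: 4112 B (unroll 8 on sm_80+)
constexpr int kNvls    = kWarp * 64 + 16;       // NVLS term: 2064 B (+16 B for unaligned reads)
constexpr int kPerWarp  = (std::max({0, kLL128, kSimple, kNvls}) + 15) & -16;  // 4112 B
constexpr int kPerBlock = kPerWarp * (640 / kWarp);  // 20 warps -> 82240 B (~80 KiB)
static_assert(kPerWarp == 4112 && kPerBlock == 82240, "SIMPLE term dominates on sm_90");

int main() { std::printf("per-warp %d B, per-block %d B\n", kPerWarp, kPerBlock); }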
#endif #endif

View File

@ -15,8 +15,7 @@
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
size_t ncclKernMaxLocalSize(); ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);

View File

@ -24,6 +24,7 @@ typedef enum : uint8_t {
ncclPatternTreeUpDown, ncclPatternTreeUpDown,
ncclPatternCollnetChain, ncclPatternCollnetChain,
ncclPatternCollnetDirect, ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternSend, ncclPatternSend,
ncclPatternRecv ncclPatternRecv
} ncclPattern_t; } ncclPattern_t;

37
src/include/ipcsocket.h Normal file
View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#ifndef NCCL_IPCSOCKET_H
#define NCCL_IPCSOCKET_H
#include "nccl.h"
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
#define NCCL_IPC_SOCKNAME_LEN 64
struct ncclIpcSocket {
int fd;
char socketName[NCCL_IPC_SOCKNAME_LEN];
volatile uint32_t* abortFlag;
};
ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
#endif /* NCCL_IPCSOCKET_H */

View File

@ -20,7 +20,7 @@
#define NCCL_NET_MAX_REQUESTS 8 #define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

View File

@ -7,12 +7,12 @@
#ifndef NCCL_NVTX_H_ #ifndef NCCL_NVTX_H_
#define NCCL_NVTX_H_ #define NCCL_NVTX_H_
#include "nvtx3.hpp" #include "nvtx3/nvtx3.hpp"
#if __cpp_constexpr >= 201304L && !defined(NVTX3_RELAXED_CONSTEXPR) #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14)
#define NVTX3_RELAXED_CONSTEXPR constexpr #define NVTX3_CONSTEXPR_IF_CPP14 constexpr
#else #else
#define NVTX3_RELAXED_CONSTEXPR #define NVTX3_CONSTEXPR_IF_CPP14
#endif #endif
// Define all NCCL-provided static schema IDs here (avoid duplicates). // Define all NCCL-provided static schema IDs here (avoid duplicates).
@ -37,7 +37,7 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};};
class payload_schema { class payload_schema {
public: public:
NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{ {
schema_attr.name = schemaName; schema_attr.name = schemaName;
schema_attr.entries = entries; schema_attr.entries = entries;
@ -74,11 +74,11 @@ class payload_schema {
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
static const payload_schema schema{S, std::extent<decltype(S)>::value, \ static const payload_schema schema{S, std::extent<decltype(S)>::value, \
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
nvtxPayloadData_t nvtx3_bpl__[] = { \ nvtxPayloadData_t nvtx3_bpl__[] = { \
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__}; ::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
extern void initNvtxRegisteredEnums(); extern void initNvtxRegisteredEnums();

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,12 +1,12 @@
/* /*
* Copyright 2021 NVIDIA Corporation. All rights reserved. * Copyright 2021-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/ */
#include "nvtx3/nvToolsExt.h" #include "nvToolsExt.h"
#ifndef NVTOOLSEXT_PAYLOAD_H #ifndef NVTOOLSEXT_PAYLOAD_H
#define NVTOOLSEXT_PAYLOAD_H #define NVTOOLSEXT_PAYLOAD_H

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

File diff suppressed because it is too large

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -35,10 +35,11 @@ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
{ {
intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
nvtxExtModuleSegment_t segment = { nvtxExtModuleSegment_t segment = {
0, // unused (only one segment) 0, // unused (only one segment)
NVTX3EXT_CBID_PAYLOAD_FN_NUM, NVTX3EXT_CBID_PAYLOAD_FN_NUM,
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1 fnSlots
}; };
nvtxExtModuleInfo_t module = { nvtxExtModuleInfo_t module = {

View File

@ -10,6 +10,7 @@
#include "devcomm.h" #include "devcomm.h"
#include "info.h" #include "info.h"
#include "socket.h" #include "socket.h"
#include "ipcsocket.h"
#include <pthread.h> #include <pthread.h>
#include "shm.h" #include "shm.h"
@ -161,6 +162,31 @@ struct ncclProxyProgressState {
int nextOps; int nextOps;
}; };
// Expected proxy response fifo
struct ncclExpectedProxyResponse {
void* opId;
int respSize;
bool done;
void* respBuff;
struct ncclExpectedProxyResponse* next;
};
struct ncclProxyAsyncOp {
int type;
struct ncclProxyConnection* connection;
int reqSize, respSize;
char *reqBuff, *respBuff;
void* opId;
ncclProxyAsyncOp* next;
};
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
ncclProxyAsyncOp* asyncOps;
int asyncOpCounter;
};
struct ncclProxyState { struct ncclProxyState {
// Service thread // Service thread
pthread_t thread; pthread_t thread;
@ -176,6 +202,9 @@ struct ncclProxyState {
// Progress thread // Progress thread
struct ncclProxyProgressState progressState; struct ncclProxyProgressState progressState;
// Queue of expected responses from the proxy
struct ncclExpectedProxyResponse* expectedResponses;
}; };
enum proxyConnectState { enum proxyConnectState {
@ -220,10 +249,19 @@ enum ncclProxyMsgType {
ncclProxyMsgStart = 5, ncclProxyMsgStart = 5,
ncclProxyMsgClose = 6, ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7, ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8 ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9 // cuMem API support
}; };
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm); ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
#endif #endif
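A minimal sketch of the intended client-side pattern for these new entry points, mirroring the existing SharedInit usage (int request, no response payload) and assuming ncclPollProxyResponse() returns ncclInProgress while the response is still pending; exampleSharedInitAsync is illustrative, not an NCCL function.

ncclResult_t exampleSharedInitAsync(struct ncclProxyConnector* proxyConn, int p2pnChannels) {
  void* opId = &p2pnChannels;  // any locally unique identifier works
  NCCLCHECK(ncclProxyCallAsync(proxyConn, ncclProxyMsgSharedInit,
                               &p2pnChannels, sizeof(int), /*respSize=*/0, opId));
  ncclResult_t ret;
  do {  // spin (or interleave other work) until the service thread responds
    ret = ncclPollProxyResponse(proxyConn, /*respBuff=*/NULL, opId);
  } while (ret == ncclInProgress);
  return ret;  // ncclProxyCallBlocking() wraps essentially this loop
}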

View File

@ -92,6 +92,6 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
ncclResult_t ncclSocketClose(struct ncclSocket* sock); ncclResult_t ncclSocketClose(struct ncclSocket* sock);
#endif #endif

View File

@ -62,7 +62,7 @@ struct ncclTransportComm {
}; };
struct ncclTransport { struct ncclTransport {
const char name[4]; const char name[8];
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send; struct ncclTransportComm send;
struct ncclTransportComm recv; struct ncclTransportComm recv;
@ -71,6 +71,9 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 }; enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);

View File

@ -35,13 +35,13 @@
#endif #endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
static uint64_t hashUniqueId(ncclUniqueId const &id) { static uint64_t hashUniqueId(ncclUniqueId const &id) {
char const *bytes = (char const*)&id; char const *bytes = (char const*)&id;
@ -67,12 +67,8 @@ ncclResult_t initGdrCopy() {
return ncclSuccess; return ncclSuccess;
} }
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false; static bool initialized = false;
static size_t maxLocalSizeBytes = 0;
static ncclResult_t ncclInit() { static ncclResult_t ncclInit() {
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
@ -80,9 +76,6 @@ static ncclResult_t ncclInit() {
if (!initialized) { if (!initialized) {
initEnv(); initEnv();
initGdrCopy(); initGdrCopy();
maxLocalSizeBytes = ncclKernMaxLocalSize();
int carveout = ncclParamL1SharedMemoryCarveout();
if (carveout) ncclKernSetSharedMemoryCarveout(carveout);
// Always initialize bootstrap network // Always initialize bootstrap network
NCCLCHECK(bootstrapNetInit()); NCCLCHECK(bootstrapNetInit());
NCCLCHECK(ncclNetPluginInit()); NCCLCHECK(ncclNetPluginInit());
@ -210,6 +203,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
} }
if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm));
struct ncclDestructor* dtor = comm->destructorHead; struct ncclDestructor* dtor = comm->destructorHead;
while (dtor != nullptr) { while (dtor != nullptr) {
NCCLCHECK(dtor->fn(dtor)); NCCLCHECK(dtor->fn(dtor));
@ -220,6 +215,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
ncclMemoryStackDestruct(&comm->memPermanent); ncclMemoryStackDestruct(&comm->memPermanent);
ncclCudaHostFree((void *)comm->abortFlag); ncclCudaHostFree((void *)comm->abortFlag);
free(comm->netName);
commPoison(comm); // poison comm before free to avoid comm reuse. commPoison(comm); // poison comm before free to avoid comm reuse.
free(comm); free(comm);
@ -243,8 +239,8 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
int flag = 0; int flag = 0;
CUdevice dev; CUdevice dev;
int cudaDriverVersion; int cudaDriverVersion;
CUCHECK(cuDriverGetVersion(&cudaDriverVersion)); CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion));
if (cudaDriverVersion < 11070) return ncclInternalError; if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
// Query device to see if DMA-BUF support is available // Query device to see if DMA-BUF support is available
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
@ -265,7 +261,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
if (ret != ncclSuccess) { if (ret != ncclSuccess) {
/* if ret is not ncclInProgress, we just keep it. */ /* if ret is not ncclInProgress, we just keep it. */
WARN("Attempt to use communicator before the previous operation returned ncclSuccess\n"); WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
if (ret == ncclInProgress) ret = ncclInvalidArgument; if (ret == ncclInProgress) ret = ncclInvalidArgument;
goto exit; goto exit;
} }
@ -395,6 +391,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
tmpCommAndChans.channels[c].tree = comm->channels[c].tree; tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain; tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect; tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
if (comm->channels[c].ring.userRanks != nullptr) { if (comm->channels[c].ring.userRanks != nullptr) {
@ -521,8 +518,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
struct ncclChannel* channel = comm->channels + c; struct ncclChannel* channel = comm->channels + c;
for (int h = 0; h < nHeads; h++) { for (int h = 0; h < nHeads; h++) {
const int head = heads[h]; const int head = heads[h];
collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv); collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend); if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
} }
// Verify CollNet setup across ranks after trying the first channel // Verify CollNet setup across ranks after trying the first channel
if (c == 0) { if (c == 0) {
@ -922,6 +919,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Check if we can setup CollNet // Check if we can setup CollNet
if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph); if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
NCCLCHECKGOTO(ncclNvlsSetup(comm), ret, fail);
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations // Compute time models for algorithm and protocol combinations
@ -929,7 +928,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int myCompCap = comm->peerInfo[rank].cudaCompCap; int myCompCap = comm->peerInfo[rank].cudaCompCap;
int minCompCap = myCompCap, maxCompCap = myCompCap; int minCompCap = myCompCap, maxCompCap = myCompCap;
for (int i = 0; i < nranks; i++) { for (int i = 0; i < nranks; i++) {
minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); comm->minCompCap = minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
} }
NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail); NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
@ -938,6 +937,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Compute nChannels per peer for p2p // Compute nChannels per peer for p2p
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
do { // Setup p2p structures in comm->tasks do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks; struct ncclTasks* tasks = &comm->tasks;
int nRanks = comm->nRanks; int nRanks = comm->nRanks;
@ -1004,12 +1005,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
} }
} }
} }
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail); NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
} }
// Connect to local net proxy // Connect to local net proxy
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN // Then to remote ones when using PXN
if (ncclPxnDisable(comm) == 0) { if (ncclPxnDisable(comm) == 0) {
@ -1017,7 +1019,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) { for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
} }
} }
@ -1065,6 +1067,11 @@ fail:
} }
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT);
// Match config max/minCTAs
NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
struct ncclCommInitRankAsyncJob { struct ncclCommInitRankAsyncJob {
struct ncclAsyncJob base; struct ncclAsyncJob base;
@ -1087,9 +1094,16 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
ncclUniqueId commId = job->commId; // C++ struct assignment ncclUniqueId commId = job->commId; // C++ struct assignment
int myrank = job->myrank; int myrank = job->myrank;
int cudaDev = job->cudaDev; int cudaDev = job->cudaDev;
int archMajor, archMinor;
size_t maxLocalSizeBytes = 0;
ncclResult_t res = ncclSuccess; ncclResult_t res = ncclSuccess;
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev));
comm->cudaArch = 100*archMajor + 10*archMinor;
NCCLCHECK(ncclInitKernelsForDevice(comm->cudaArch, &maxLocalSizeBytes));
// Set the maximum kernel stack size of all kernels to avoid // Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue) // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
@ -1114,18 +1128,143 @@ fail:
goto exit; goto exit;
} }
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { #define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \
ncclResult_t ret = ncclSuccess; if (config->field == undef) { \
config->field = defvalue; \
/* first set configuration */ } else { \
if (config) { INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \
comm->blocking = config->blocking;
} else {
/* default setting of communicator */
comm->blocking = 1;
} }
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
ncclResult_t ret = ncclSuccess;
/* config must not be NULL in this function */
int blockingEnv;
int cgaClusterSizeEnv;
int minCTAsEnv;
int maxCTAsEnv;
const char *envNetName, *tmpNetName;
ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr;
size_t realSize;
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto fail;
}
/* check version. */
if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) {
internalConfigPtr->blocking = defaultConfig.blocking;
}
if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) {
internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize;
internalConfigPtr->minCTAs = defaultConfig.minCTAs;
internalConfigPtr->maxCTAs = defaultConfig.maxCTAs;
internalConfigPtr->netName = defaultConfig.netName;
}
}
/* check input config attributes; NCCL_CONFIG_UNDEF_INT means user-undefined, in which case we use NCCL's default value. */
if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) {
WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize);
ret = ncclInvalidArgument;
goto fail;
}
if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->minCTAs <= 0) ||
(internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->maxCTAs <= 0) ||
(internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) {
WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
/* default config values can be tuned per platform. */
NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
tmpNetName = internalConfigPtr->netName;
/* assign config to communicator */
comm->blocking = internalConfigPtr->blocking;
comm->cgaClusterSize = internalConfigPtr->cgaClusterSize;
comm->minCTAs = internalConfigPtr->minCTAs;
comm->maxCTAs = internalConfigPtr->maxCTAs;
/* override configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv == 0 || blockingEnv == 1)
comm->blocking = blockingEnv;
cgaClusterSizeEnv = ncclParamCGAClusterSize();
if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
comm->cgaClusterSize = cgaClusterSizeEnv;
} else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
comm->cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
minCTAsEnv = ncclParamMinCTAs();
if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->minCTAs = minCTAsEnv;
}
maxCTAsEnv = ncclParamMaxCTAs();
if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->maxCTAs = maxCTAsEnv;
}
/* cap channels if needed */
if (comm->minCTAs > MAXCHANNELS) {
WARN("minCTAs %d is larger than #channels upper limit %d", comm->minCTAs, MAXCHANNELS);
comm->minCTAs = MAXCHANNELS;
}
if (comm->maxCTAs > MAXCHANNELS) {
WARN("maxCTAs %d is larger than #channels upper limit %d", comm->maxCTAs, MAXCHANNELS);
comm->maxCTAs = MAXCHANNELS;
}
if (comm->minCTAs > comm->maxCTAs) {
WARN("minCTAs %d is larger than maxCTAs %d", comm->minCTAs, comm->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
envNetName = getenv("NCCL_NET");
if (envNetName)
tmpNetName = envNetName;
if (tmpNetName != NULL) {
int netNameLen = strlen(tmpNetName) + 1;
comm->netName = (char*)malloc(netNameLen);
memcpy(comm->netName, tmpNetName, netNameLen);
} else {
comm->netName = NULL;
}
exit:
return ret; return ret;
fail:
goto exit;
} }
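The net effect of the block above is a per-attribute precedence rule: a defined environment variable (NCCL_CGA_CLUSTER_SIZE, NCCL_MIN_CTAS, NCCL_MAX_CTAS, NCCL_NET) overrides the ncclConfig_t field, which in turn overrides the built-in default, with range clamping applied on top. A standalone sketch of that rule; resolveInt is illustrative, not an NCCL function.

#include <climits>
#define EXAMPLE_UNDEF INT_MIN  // mirrors NCCL_CONFIG_UNDEF_INT

// env value (if set) > config field (if set) > built-in default.
static int resolveInt(int envVal, int cfgVal, int defVal) {
  if (envVal != EXAMPLE_UNDEF) return envVal;
  if (cfgVal != EXAMPLE_UNDEF) return cfgVal;
  return defVal;
}
// e.g. resolveInt(/*NCCL_MIN_CTAS unset*/EXAMPLE_UNDEF, /*config.minCTAs*/4, /*default*/1) == 4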
static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
@ -1151,6 +1290,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
CUDACHECKGOTO(cudaFree(NULL), res, fail); CUDACHECKGOTO(cudaFree(NULL), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail);
NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail);
if (nranks < 1 || myrank < 0 || myrank >= nranks) { if (nranks < 1 || myrank < 0 || myrank >= nranks) {
WARN("Invalid rank requested : %d/%d", myrank, nranks); WARN("Invalid rank requested : %d/%d", myrank, nranks);
res = ncclInvalidArgument; res = ncclInvalidArgument;
@ -1201,12 +1341,13 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
(void)ncclCudaLibraryInit(); (void)ncclCudaLibraryInit();
int cudaDev; int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
CUDACHECK(cudaGetDevice(&cudaDev)); CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL)); NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
return ncclSuccess; return ncclSuccess;
} }
@ -1215,6 +1356,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
int totalnDev; int totalnDev;
int *gpuFlags = NULL; int *gpuFlags = NULL;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@ -1258,7 +1400,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECKGOTO(ncclGroupStart(), ret, fail); NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
for (int i=0; i<ndev; i++) { for (int i=0; i<ndev; i++) {
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, NULL); ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
} }
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail); NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
@ -1283,39 +1425,16 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
int cudaDev; int cudaDev;
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr; ncclConfig_t *internalConfigPtr = NULL;
size_t realSize;
int blockingEnv;
NCCLCHECK(ncclGroupStartInternal()); NCCLCHECK(ncclGroupStartInternal());
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto exit;
}
}
/* check input config attributes */
if (internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto exit;
}
/* overwrite configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv != 0 && blockingEnv != 1) {
WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv);
}
if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv;
(void)ncclCudaLibraryInit(); (void)ncclCudaLibraryInit();
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit); CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail); NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
exit: exit:

View File

@ -23,11 +23,33 @@ DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
/* proxy.cc */ /* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate, 3020); DECLARE_CUDA_PFN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
/* cuMem API support */
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN(cuMemCreate, 10020);
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemMap, 10020);
DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070 #if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/ /* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif #endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
#endif
#endif #endif
/* CUDA Driver functions loaded with dlsym() */ /* CUDA Driver functions loaded with dlsym() */
@ -39,6 +61,7 @@ DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
static void *cudaLib; static void *cudaLib;
int ncclCudaDriverVersionCache = -1; int ncclCudaDriverVersionCache = -1;
bool ncclCudaLaunchBlocking = false;
#if CUDART_VERSION >= 11030 #if CUDART_VERSION >= 11030
/* /*
@ -62,9 +85,33 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemGetAddressRange, 3020, 1); LOAD_SYM(cuMemGetAddressRange, 3020, 1);
LOAD_SYM(cuCtxCreate, 3020, 1); LOAD_SYM(cuCtxCreate, 3020, 1);
LOAD_SYM(cuCtxDestroy, 4000, 1); LOAD_SYM(cuCtxDestroy, 4000, 1);
LOAD_SYM(cuCtxGetCurrent, 4000, 1);
LOAD_SYM(cuCtxSetCurrent, 4000, 1); LOAD_SYM(cuCtxSetCurrent, 4000, 1);
LOAD_SYM(cuCtxGetDevice, 2000, 1);
/* cuMem API support */
#if CUDA_VERSION >= 11030
LOAD_SYM(cuMemAddressReserve, 10020, 1);
LOAD_SYM(cuMemAddressFree, 10020, 1);
LOAD_SYM(cuMemCreate, 10020, 1);
LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
LOAD_SYM(cuMemMap, 10020, 1);
LOAD_SYM(cuMemRelease, 10020, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
#endif
#if CUDA_VERSION >= 11070 #if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
LOAD_SYM(cuMulticastAddDevice, 12010, 1);
LOAD_SYM(cuMulticastBindMem, 12010, 1);
LOAD_SYM(cuMulticastBindAddr, 12010, 1);
LOAD_SYM(cuMulticastCreate, 12010, 1);
LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
LOAD_SYM(cuMulticastUnbind, 12010, 1);
#endif #endif
return ncclSuccess; return ncclSuccess;
} }
@ -74,6 +121,11 @@ static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static ncclResult_t initResult; static ncclResult_t initResult;
static void initOnceFunc() { static void initOnceFunc() {
do {
char* val = getenv("CUDA_LAUNCH_BLOCKING");
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);
CUresult res; CUresult res;
/* /*
* Load CUDA driver library * Load CUDA driver library
@ -85,9 +137,10 @@ static void initOnceFunc() {
else else
snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so"); snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
(void) dlerror(); // Clear any previous errors
cudaLib = dlopen(path, RTLD_LAZY); cudaLib = dlopen(path, RTLD_LAZY);
if (cudaLib == NULL) { if (cudaLib == NULL) {
WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror()); WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
goto error; goto error;
} }

200
src/misc/ipcsocket.cc Normal file
View File

@ -0,0 +1,200 @@
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#include "ipcsocket.h"
#include "utils.h"
#include <stdlib.h>
#include <string.h>
#include <errno.h>
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx"
/*
* Create a Unix Domain Socket
*/
ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
int fd = -1;
struct sockaddr_un cliaddr;
char temp[NCCL_IPC_SOCKNAME_LEN] = "";
if (handle == NULL) {
return ncclInternalError;
}
handle->fd = -1;
handle->socketName[0] = '\0';
if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %d", errno);
return ncclSystemError;
}
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
// Create unique name for the socket.
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
return ncclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
unlink(temp);
#endif
TRACE(NCCL_INIT, "UDS: Creating socket %s", temp);
strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %d", temp, errno);
close(fd);
return ncclSystemError;
}
handle->fd = fd;
strcpy(handle->socketName, temp);
handle->abortFlag = abortFlag;
// Mark socket as non-blocking
if (handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
return ncclSuccess;
}
ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
if (handle == NULL) {
return ncclInternalError;
}
if (handle->fd <= 0) {
return ncclSuccess;
}
#ifndef USE_ABSTRACT_SOCKET
if (handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
#endif
close(handle->fd);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
// Union to guarantee alignment requirements for control array
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
char dummy_buffer[1];
int ret;
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Receiving data over socket failed : %d", errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return ncclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return ncclSystemError;
}
TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg;
struct iovec iov[1];
char temp[NCCL_IPC_SOCKNAME_LEN];
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot connect to provided name for socket. Name too large");
return ncclInternalError;
}
(void) strncpy(cliaddr.sun_path, temp, len);
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
iov[0].iov_base = (void *)"";
iov[0].iov_len = 1;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
return ncclSuccess;
}
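A minimal sketch of how the two ends of this fd-passing scheme fit together, assuming both processes agree on the (rank, hash) pair that names the receiver's socket; the wrapper function names are illustrative and error handling is elided.

// Receiver side (e.g. the process that will import the fd).
ncclResult_t recvFdExample(int myRank, uint64_t commHash, volatile uint32_t* abortFlag, int* fd) {
  struct ncclIpcSocket sock;
  NCCLCHECK(ncclIpcSocketInit(&sock, myRank, commHash, abortFlag));
  NCCLCHECK(ncclIpcSocketRecvFd(&sock, fd));  // fd arrives duplicated into this process
  NCCLCHECK(ncclIpcSocketClose(&sock));
  return ncclSuccess;
}

// Sender side: open its own endpoint, then address the receiver by (peerRank, hash).
ncclResult_t sendFdExample(int myRank, int peerRank, uint64_t commHash, volatile uint32_t* abortFlag, int fd) {
  struct ncclIpcSocket sock;
  NCCLCHECK(ncclIpcSocketInit(&sock, myRank, commHash, abortFlag));
  NCCLCHECK(ncclIpcSocketSendFd(&sock, fd, peerRank, commHash));
  NCCLCHECK(ncclIpcSocketClose(&sock));
  return ncclSuccess;
}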

View File

@ -43,7 +43,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
int closed; int closed;
NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
if (closed) { if (closed) {
char line[SOCKET_NAME_MAXLEN+1]; char line[SOCKET_NAME_MAXLEN+1];
WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
@ -785,16 +785,33 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
} }
// Receive or detect connection closed // Receive or detect connection closed
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) { ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
int offset = 0; int offset = 0;
if (sock == NULL) { if (sock == NULL) {
WARN("ncclSocketTryRecv: pass NULL socket"); WARN("ncclSocketTryRecv: pass NULL socket");
return ncclInvalidArgument; return ncclInvalidArgument;
} }
*closed = 0; *closed = 0;
while (offset < size) { // Block until connection closes or nbytes received
if (blocking) {
while (offset < size) {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
} else {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess; if (*closed) return ncclSuccess;
// If any bytes were received, block waiting for the rest
if (offset > 0) {
while (offset < size) {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
// No bytes were received, return ncclInProgress
} else {
return ncclInProgress;
}
} }
return ncclSuccess; return ncclSuccess;
} }
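With the new blocking flag, a caller that must not stall (for example a service loop multiplexing many peers) can poll as below, relying on ncclInProgress meaning no bytes were available yet; pollPeerExample is illustrative, not an NCCL function.

ncclResult_t pollPeerExample(struct ncclSocket* sock, void* buf, int size, int* gotMessage) {
  int closed = 0;
  *gotMessage = 0;
  ncclResult_t ret = ncclSocketTryRecv(sock, buf, size, &closed, /*blocking=*/false);
  if (ret == ncclInProgress) return ncclSuccess;  // nothing arrived yet; try again later
  if (ret != ncclSuccess) return ret;             // real error
  if (closed) return ncclSuccess;                 // peer closed the connection
  *gotMessage = 1;  // once any byte arrives, TryRecv blocks until all `size` bytes are read
  return ncclSuccess;
}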

View File

@ -25,8 +25,10 @@
extern "C" { extern "C" {
#endif #endif
#include <limits.h>
/* Opaque handle to communicator */ /* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t; typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL
#define NCCL_UNIQUE_ID_BYTES 128 #define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
@ -42,15 +44,22 @@ typedef enum { ncclSuccess = 0,
ncclInProgress = 7, ncclInProgress = 7,
ncclNumResults = 8 } ncclResult_t; ncclNumResults = 8 } ncclResult_t;
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
/* Communicator configuration. Users can assign value to attributes to specify the /* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */ * behavior of a communicator. */
typedef struct ncclConfig_v21400 { typedef struct ncclConfig_v21700 {
/* attributes that users should never touch. */ /* attributes that users should never touch. */
size_t size; size_t size;
unsigned int magic; unsigned int magic;
unsigned int version; unsigned int version;
/* attributes that users are able to customize. */ /* attributes that users are able to customize. */
int blocking; int blocking;
int cgaClusterSize;
int minCTAs;
int maxCTAs;
const char *netName;
} ncclConfig_t; } ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created. /* Config initializer must be assigned to initialize config structure when it is created.
@ -59,7 +68,11 @@ typedef struct ncclConfig_v21400 {
sizeof(ncclConfig_t), /* size */ \ sizeof(ncclConfig_t), /* size */ \
0xcafebeef, /* magic */ \ 0xcafebeef, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
1 /* blocking */ \ NCCL_CONFIG_UNDEF_INT, /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR /* netName */ \
} }
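As a usage sketch (not part of this header), the new knobs are set on a config initialized with NCCL_CONFIG_INITIALIZER and passed to ncclCommInitRankConfig; fields left at NCCL_CONFIG_UNDEF_INT / NCCL_CONFIG_UNDEF_PTR keep their internal defaults. The specific values below are placeholders:

// Sketch: create a communicator with the new 2.17 config options.
ncclResult_t initWithConfig(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.blocking       = 1;         // keep blocking init/collective behavior
  config.cgaClusterSize = 4;         // CGA cluster size hint (placeholder value)
  config.minCTAs        = 8;         // minimum CTAs per collective kernel (placeholder)
  config.maxCTAs        = 32;        // maximum CTAs per collective kernel (placeholder)
  config.netName        = "Socket";  // select a network by name (placeholder)
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}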
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.


@ -176,14 +176,8 @@ ncclResult_t ncclNetPluginInit() {
} }
void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == nullptr) { if (netPluginLib == nullptr) {
// dlopen does not guarantee to set errno, but dlerror only gives us a INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
// string, so checking errno doesn't hurt to try to provide a better INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
// error message
if (errno == ENOENT) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
} else {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
return ncclSuccess; return ncclSuccess;
} }
@ -264,9 +258,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
ncclResult_t ncclNetInit(struct ncclComm* comm) { ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network // Initialize main communication network
char* netName = getenv("NCCL_NET"); char* netName;
bool ok = false; bool ok = false;
netName = comm->netName;
for (int i=0; i<3; i++) { for (int i=0; i<3; i++) {
if (ncclNets[i] == nullptr) continue; if (ncclNets[i] == nullptr) continue;
enum ncclNetState state; enum ncclNetState state;
@ -324,9 +319,26 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
ncclResult_t ret; ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET; ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1); NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); bool connected;
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); connected = false;
while (!connected) {
// If we're aborting now, skip to cleanup
if (*comm->abortFlag) {
goto cleanup2;
}
if (sComm == NULL)
NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
if (rComm == NULL)
NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
connected = (rComm != NULL) && (sComm != NULL);
}
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@ -335,11 +347,11 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
} }
ncclDebugNoWarn = 0; ncclDebugNoWarn = 0;
CUDACHECK(cudaFree(gpuPtr)); CUDACHECK(cudaFree(gpuPtr));
cleanup4:
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
cleanup3:
NCCLCHECK(ncclNetCloseSend(comm, sComm));
cleanup2: cleanup2:
if (rComm != NULL)
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
if (sComm != NULL)
NCCLCHECK(ncclNetCloseSend(comm, sComm));
NCCLCHECK(ncclNetCloseListen(comm, lComm)); NCCLCHECK(ncclNetCloseListen(comm, lComm));
cleanup1: cleanup1:
break; break;


@ -14,6 +14,7 @@
#include "timer.h" #include "timer.h"
#include <sys/syscall.h> #include <sys/syscall.h>
#include <assert.h>
enum { proxyRecv=0, proxySend=1 }; enum { proxyRecv=0, proxySend=1 };
@ -37,6 +38,155 @@ struct ncclProxyPool {
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
}; };
static void expectedProxyResponseFree(struct ncclProxyState* state) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
while (elem) {
prev = elem;
elem = elem->next;
free(prev->respBuff);
free(prev);
}
}
static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
while (elem) {
if (elem->opId == opId) {
if (respSize != elem->respSize) {
WARN("Mismatched response size for opId=%p", opId);
return ncclInternalError;
}
if (elem->done) {
WARN("Storing response for already completed opId=%p", opId);
return ncclInternalError;
}
memcpy(elem->respBuff, respBuff, respSize);
elem->done = true;
return ncclSuccess;
}
elem = elem->next;
}
WARN("Proxy response for opId=%p doesn't match any expected response", opId);
return ncclInternalError;
}
static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize, void* respData, int respDataSize) {
struct ncclExpectedProxyResponse* ex;
NCCLCHECK(ncclCalloc(&ex, 1));
ex->opId = opId;
// Pre-alloc response buffer
ex->respBuff = malloc(respSize);
ex->respSize = respSize;
ex->done = false;
if (respData) {
memcpy(ex->respBuff, respData, respDataSize);
ex->done = true;
}
// Enqueue
struct ncclExpectedProxyResponse* list = state->expectedResponses;
if (list == NULL) {
state->expectedResponses = ex;
return ncclSuccess;
}
while (list->next) list = list->next;
list->next = ex;
return ncclSuccess;
}
static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, void* opId, void* respBuff, int* found) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
*found = 0;
while (elem) {
if ((elem->opId == opId) && elem->done) {
if (prev == NULL) {
state->expectedResponses = elem->next;
} else {
prev->next = elem->next;
}
memcpy(respBuff, elem->respBuff, elem->respSize);
free(elem->respBuff);
free(elem);
*found = 1;
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
return ncclSuccess;
}
static ncclResult_t expectedProxyResponseRemove(struct ncclProxyState* state, void* opId) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
while (elem) {
if (elem->opId == opId) {
if (prev == NULL) {
state->expectedResponses = elem->next;
} else {
prev->next = elem->next;
}
free(elem->respBuff);
free(elem);
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
WARN("Couldn't find opId=%p", opId);
return ncclInternalError;
}
static ncclResult_t asyncProxyOpEnqueue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
ncclProxyAsyncOp* list = peer->asyncOps;
if (list == NULL) {
peer->asyncOps = op;
return ncclSuccess;
}
while (list->next) list = list->next;
list->next = op;
return ncclSuccess;
}
static ncclResult_t asyncProxyOpDequeue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
struct ncclProxyAsyncOp* elem = peer->asyncOps;
struct ncclProxyAsyncOp* prev = NULL;
while (elem) {
if (elem->opId == op->opId) {
if (prev == NULL) {
peer->asyncOps = elem->next;
} else {
prev->next = elem->next;
}
if (elem->reqBuff) {
free(elem->reqBuff);
}
if (elem->respBuff) {
free(elem->respBuff);
}
free(elem);
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
if (op) {
WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
} else {
WARN("Attempting to dequeue null operation");
}
return ncclInternalError;
}
static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) { static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
struct ncclProxyArgs* elem; struct ncclProxyArgs* elem;
if (state->pool == NULL) { if (state->pool == NULL) {
@ -86,7 +236,7 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState*
pool = pool->next; pool = pool->next;
p++; p++;
} }
WARN("Could not find pool of op %p\n", op); WARN("Could not find pool of op %p", op);
return ncclInternalError; return ncclInternalError;
} }
@ -140,7 +290,7 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
nextOp->state |= OP_SEEN; nextOp->state |= OP_SEEN;
printf("\n"); printf("\n");
if (nextOp->next) { if (nextOp->next) {
WARN("Inactive op has next set!\n"); WARN("Inactive op has next set!");
} }
nextOp = nextOp->nextPeer; nextOp = nextOp->nextPeer;
} }
@ -337,7 +487,7 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
} }
} }
if (lastOp == -1) { if (lastOp == -1) {
WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount); WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
return ncclInternalError; return ncclInternalError;
} }
// Cut chain at lastOp // Cut chain at lastOp
@ -770,19 +920,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
return ncclSuccess; return ncclSuccess;
} }
struct ncclProxyAsyncOp {
int type;
struct ncclProxyConnection* connection;
int reqSize, respSize;
char *reqBuff, *respBuff;
};
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
struct ncclProxyAsyncOp asyncOps;
};
#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 #define NCCL_PROXY_CONN_POOL_SIZE_POW2 7
#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) #define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2))
#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) #define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1)
@ -790,7 +927,6 @@ struct ncclProxyConnectionPool {
struct ncclProxyConnection** pools; struct ncclProxyConnection** pools;
int banks; int banks;
int offset; int offset;
struct ncclProxyAsyncOp* ops;
}; };
static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
@ -888,26 +1024,137 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
return ncclSuccess; return ncclSuccess;
} }
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" }; const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock; struct ncclSocket* sock;
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
void* respData = NULL;
int respDataSize = 0;
struct ncclComm* comm = proxyConn->comm;
struct ncclIpcSocket ipcSock = { 0 };
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError; if (*comm->abortFlag != 0) {
sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank; WARN("ncclProxyCallAsync() - Saw abortFlag while waiting for proxyThread response");
return ncclInternalError;
}
if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
sock = comm->proxyState.peerSocks + proxyConn->localRank;
if (sock == NULL) return ncclInternalError; if (sock == NULL) return ncclInternalError;
if (type == ncclProxyMsgConvertFd) {
// cuMem API support
// Create a UDS socket to receive the converted fd
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, (uint64_t)proxyConn->connection, comm->abortFlag));
}
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
if (type == ncclProxyMsgConvertFd) {
// cuMem API support
int recvFd = -1;
if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
// Receive converted fd over UDS
NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, &recvFd));
TRACE(NCCL_NET, "UDS: ConvertFd rank %d returned %p %d", proxyConn->localRank, &recvFd, recvFd);
assert(recvFd != -1);
respData = &recvFd;
respDataSize = sizeof(recvFd);
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
} else {
// Send opId to proxy
NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error);
}
// Add proxyOp to expected response queue
NCCLCHECK(expectedProxyResponseEnqueue(&comm->proxyState, opId, respSize, respData, respDataSize));
return ncclSuccess; return ncclSuccess;
error: error:
WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]); NCCLCHECK(ncclIpcSocketClose(&ipcSock));
WARN("Proxy Call to rank %d failed (%s)", comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
return ret; return ret;
} }
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
struct ncclComm* comm = proxyConn->comm;
// Receive the connection pointer from the Proxy
if (*comm->abortFlag) {
WARN("Comm %p is in abort state", comm);
return ncclInternalError;
}
if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
// Check response queue
int found = 0;
NCCLCHECK(expectedProxyResponseDequeue(&comm->proxyState, opId, respBuff, &found));
if (found == 0) {
// Attempt to read in a new response header from the proxy thread
struct ncclSocket* sock = comm->proxyState.peerSocks + proxyConn->localRank;
void* recvOpId;
int offset = 0;
if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
WARN("Socket recv failed while polling for opId=%p", opId);
return ncclInternalError;
}
if (offset == 0) {
return ncclInProgress;
// If we've returned a partial response, block to receive the rest of it
} else if (offset < sizeof(recvOpId)) {
while (offset < sizeof(recvOpId))
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
}
INFO(NCCL_PROXY, "ncclPollProxyResponse Recieved new opId=%p", recvOpId);
// Now do a blocking recv of the response size
int respSize = 0;
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
// If there's a respSize to recv
if (respSize > 0) {
NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
}
if (recvOpId == opId) {
INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
NCCLCHECK(expectedProxyResponseRemove(&comm->proxyState, recvOpId));
return ncclSuccess;
} else {
INFO(NCCL_PROXY, "Queing opId=%p", recvOpId);
// Store the result and mark response as completed
NCCLCHECK(expectedProxyResponseStore(&comm->proxyState, recvOpId, respBuff, respSize));
return ncclInProgress;
}
} else {
INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
}
return ncclSuccess;
}
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
// Alloc some memory to act as a handle
void* opId = malloc(1);
NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
ncclResult_t res = ncclInProgress;
while (res == ncclInProgress) {
res = ncclPollProxyResponse(proxyConn, respBuff, opId);
}
free(opId);
return res;
}
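Transports that cannot block (such as the connect paths further below) drive the same pair incrementally: issue ncclProxyCallAsync once with a stable, unique opId, then keep calling ncclPollProxyResponse until it stops returning ncclInProgress. A minimal sketch, with connectStep and the started flag as illustrative placeholders:

// Sketch: one non-blocking step of a proxy connect. 'opId' must stay unique and
// stable for the lifetime of the operation (the net transport uses the
// ncclConnector address for this).
static ncclResult_t connectStep(struct ncclProxyConnector* proxyConn,
                                void* reqBuff, int reqSize,
                                void* respBuff, int respSize,
                                void* opId, int* started) {
  if (!*started) {
    NCCLCHECK(ncclProxyCallAsync(proxyConn, ncclProxyMsgConnect, reqBuff, reqSize, respSize, opId));
    *started = 1;
  }
  // Returns ncclInProgress until the proxy thread has sent back a response for opId.
  return ncclPollProxyResponse(proxyConn, respBuff, opId);
}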
static ncclResult_t proxyProgressInit(struct ncclComm* comm) { static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
struct ncclProxyProgressState* state = &comm->proxyState.progressState; struct ncclProxyProgressState* state = &comm->proxyState.progressState;
if (state->opsPool == NULL) { if (state->opsPool == NULL) {
@ -998,16 +1245,55 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError; if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
int nChannels; int nChannels;
NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int))); NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
// Store opId for completion response
void* opId;
NCCLCHECK(ncclSocketRecv(sock, &opId, sizeof(opId)));
INFO(NCCL_PROXY, "proxyConnSharedInit received opId=%p", opId);
if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels)); if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
__atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE); __atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
// Send the opId for referencing async operation
INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(opId=%p)", opId);
NCCLCHECK(ncclSocketSend(connection->sock, &opId, sizeof(opId)));
// Send the response size
INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(op.respSize=%d)", respSize);
NCCLCHECK(ncclSocketSend(connection->sock, &respSize, sizeof(respSize)));
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) { // cuMem API support
static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, struct ncclComm* comm) {
struct ncclSocket* sock = &peer->sock;
uint64_t connection;
NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(uint64_t)));
int reqSize, respSize;
NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
int fd;
struct ncclIpcSocket ipcSock = { 0 };
NCCLCHECK(ncclSocketRecv(sock, &fd, sizeof(int)));
INFO(NCCL_NET, "UDS: proxyConvertFd received fd %d peer %d connection %lx", fd, peer->localRank, connection);
// Send back the converted fd using UDS
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, connection^1, comm->abortFlag));
NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->localRank, connection));
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
return ncclSuccess;
}
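On the caller side, the fd conversion rides on the regular proxy call path with type ncclProxyMsgConvertFd: the request payload is the local fd, and the converted fd comes back through the UDS socket created in ncclProxyCallAsync rather than through the TCP response. A minimal sketch, with convertFdExample as an illustrative wrapper:

// Sketch: ask the proxy thread to convert a local fd (cuMem API support).
static ncclResult_t convertFdExample(struct ncclProxyConnector* proxyConn, int localFd, int* convertedFd) {
  return ncclProxyCallBlocking(proxyConn, ncclProxyMsgConvertFd,
                               &localFd, sizeof(int), convertedFd, sizeof(int));
}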
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount, struct ncclProxyLocalPeer* peer) {
int done = 1; int done = 1;
if (op->type == ncclProxyMsgSetup) { if (op->type == ncclProxyMsgSetup) {
INFO(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else if (op->type == ncclProxyMsgConnect) { } else if (op->type == ncclProxyMsgConnect) {
INFO(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else return ncclInternalError; } else return ncclInternalError;
if (done) { if (done) {
@ -1015,31 +1301,38 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
__atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE); __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
else if (op->type == ncclProxyMsgConnect) else if (op->type == ncclProxyMsgConnect)
__atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE); __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
/* if setup or connect is done, we should not return any error at this point since /* if setup or connect is done, we should not return any error at this point since
* ncclSocketSend might already send the respBuff to the requester. If we still choose * ncclSocketSend might already send the respBuff to the requester. If we still choose
* to abort and close the connection, it can cause segfault if the requester is using * to abort and close the connection, it can cause segfault if the requester is using
* the respBuff. */ * the respBuff. */
if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
if (op->reqBuff) { // Send the opId for referencing async operation
free(op->reqBuff); NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
op->reqBuff = NULL;
// Send the response size
NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
if (op->respSize) {
// Send the response
NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
} }
if (op->respBuff) {
free(op->respBuff); asyncProxyOpDequeue(peer, op);
op->respBuff = NULL;
}
op->type = 0;
(*asyncOpCount)--; (*asyncOpCount)--;
return ncclSuccess;
} else if (*comm->abortFlag != 0) { } else if (*comm->abortFlag != 0) {
return ncclInternalError; return ncclInternalError;
} }
return ncclSuccess; return ncclInProgress;
} }
static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) { static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
struct ncclSocket* sock = &peer->sock; struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps; struct ncclProxyAsyncOp* asyncOp;
NCCLCHECK(ncclCalloc(&asyncOp, 1));
asyncOp->type = type; asyncOp->type = type;
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*))); NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
@ -1049,9 +1342,16 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
} }
// Store opId for completion response
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
asyncProxyOpEnqueue(peer, asyncOp);
(*asyncOpCount)++; (*asyncOpCount)++;
NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount)); NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount, peer));
return ncclSuccess; return ncclSuccess;
} }
@ -1081,7 +1381,7 @@ void* ncclProxyService(void* _args) {
pollfds[s].events = POLLHUP|POLLIN; pollfds[s].events = POLLHUP|POLLIN;
} }
if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
WARN("[Proxy Service] Get listenSock fd fails\n"); WARN("[Proxy Service] Get listenSock fd fails");
return NULL; return NULL;
}; };
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
@ -1113,14 +1413,14 @@ void* ncclProxyService(void* _args) {
} }
if (maxnpeers < s+1) maxnpeers = s+1; if (maxnpeers < s+1) maxnpeers = s+1;
if (ncclSocketInit(&peers[s].sock) != ncclSuccess) { if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
WARN("[Service thread] Initialize peers[%d].sock fails\n", s); WARN("[Service thread] Initialize peers[%d].sock fails", s);
return NULL; return NULL;
} }
if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) { if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
WARN("[Service thread] Accept failed %s", strerror(errno)); WARN("[Service thread] Accept failed %s", strerror(errno));
} else { } else {
if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) { if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
WARN("[Service thread] Get peers[%d].sock fd fails\n", s); WARN("[Service thread] Get peers[%d].sock fd fails", s);
return NULL; return NULL;
} }
npeers++; npeers++;
@ -1130,25 +1430,37 @@ void* ncclProxyService(void* _args) {
for (int s=0; s<maxnpeers; s++) { for (int s=0; s<maxnpeers; s++) {
struct ncclProxyLocalPeer* peer = peers+s; struct ncclProxyLocalPeer* peer = peers+s;
struct ncclSocket* sock = &peer->sock; struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* op = &peer->asyncOps;
int closeConn = 0; int closeConn = 0;
int type = 0; int type = 0;
ncclResult_t res = ncclSuccess; ncclResult_t res = ncclSuccess;
if (pollfds[s].fd == -1) continue; if (pollfds[s].fd == -1) continue;
if (op->type != 0) {
res = proxyProgressAsync(op, comm, &asyncOpCount); // Progress all ops for this ncclProxyLocalPeer
ncclProxyAsyncOp* op = peer->asyncOps;
while (op != nullptr) {
type = op->type; type = op->type;
if (res != ncclSuccess) closeConn = 1; res = proxyProgressAsync(op, comm, &asyncOpCount, peer);
} else if (pollfds[s].revents & POLLIN) { if (res == ncclSuccess || res == ncclInProgress) {
op = op->next;
} else {
// Res is a bad result
closeConn = 1;
WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", ncclProxyMsgTypeStr[type], res);
break;
}
}
// Check for additional ops coming in
if (pollfds[s].revents & POLLIN) {
int closed; int closed;
if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) { res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
WARN("[Service thread] Could not receive type from localRank %d", peer->localRank); if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->localRank, res, closed);
closeConn = 1; closeConn = 1;
} else if (closed) { } else if (closed) {
INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
closeConn = 1; closeConn = 1;
} else { } else if (res == ncclSuccess) { // We received something from the sock
if (type == ncclProxyMsgStop) { if (type == ncclProxyMsgStop) {
stop = 1; stop = 1;
closeConn = 1; closeConn = 1;
@ -1159,30 +1471,32 @@ void* ncclProxyService(void* _args) {
} else if (type == ncclProxyMsgSharedInit) { } else if (type == ncclProxyMsgSharedInit) {
res = proxyConnSharedInit(peers+s, &connectionPool, comm); res = proxyConnSharedInit(peers+s, &connectionPool, comm);
} else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) { } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
INFO(NCCL_PROXY, "proxyConnSetupConnect for peer->localRank %d,", peer->localRank);
res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount); res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
} else if (type == ncclProxyMsgConvertFd) {
res = proxyConvertFd(peers+s, comm); // cuMem API support
} else { } else {
WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank); WARN("[Service thread] Unknown command %d from localRank %d", type, peer->localRank);
closeConn = 1; closeConn = 1;
} }
INFO(NCCL_PROXY, "Received and initiated operation=%s res=%d", ncclProxyMsgTypeStr[type], res);
} }
} else if (pollfds[s].revents & POLLHUP) { } else if (pollfds[s].revents & POLLHUP) {
closeConn = 1; closeConn = 1;
} }
if (res != ncclSuccess) { if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res); WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
closeConn = 1; closeConn = 1;
} }
if (closeConn) { if (closeConn) {
ncclSocketClose(sock); ncclSocketClose(sock);
if (op->reqBuff) {
free(op->reqBuff); if (op != nullptr) {
op->reqBuff = NULL; asyncProxyOpDequeue(peer, op);
asyncOpCount--;
} }
if (op->respBuff) {
free(op->respBuff);
op->respBuff = NULL;
}
op->type = 0;
pollfds[s].fd = -1; pollfds[s].fd = -1;
npeers--; npeers--;
} }
@ -1250,6 +1564,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
free(state->peerSocks); free(state->peerSocks);
free(state->proxyOps); free(state->proxyOps);
free(state->sharedDevMems); free(state->sharedDevMems);
expectedProxyResponseFree(state);
} }
return ncclSuccess; return ncclSuccess;
} }


@ -69,9 +69,12 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_P2P; // track highest transport type int highestType = TRANSPORT_P2P; // track highest transport type
struct ncclConnect data[2*MAXCHANNELS]; struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect
struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail); NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) { for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@ -79,22 +82,28 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
uint64_t recvMask = comm->connectRecv[recvPeer]; uint64_t recvMask = comm->connectRecv[recvPeer];
uint64_t sendMask = comm->connectSend[sendPeer]; uint64_t sendMask = comm->connectSend[sendPeer];
struct ncclConnect* recvData = data; // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer
// This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers
// The first N entries contain recvData, connection information for recv connections
// The next M entries contain sendData, connection information for send connections
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
recvData[i] = data[i];
int sendChannels = 0, recvChannels = 0; int sendChannels = 0, recvChannels = 0;
int type; int type;
TIME_START(0); TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) { for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) { if (recvMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type; if (type > highestType) highestType = type;
} }
} }
TIME_STOP(0); TIME_STOP(0);
TIME_START(1); TIME_START(1);
struct ncclConnect* sendData = recvData+recvChannels; sendData[i] = recvData[i]+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) { for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) { if (sendMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type; if (type > highestType) highestType = type;
} }
} }
@ -103,42 +112,82 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
TIME_START(2); TIME_START(2);
if (sendPeer == recvPeer) { if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) { if (recvChannels+sendChannels) {
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData = data; sendData[i] = data[i];
recvData = data+sendChannels; recvData[i] = data[i]+sendChannels;
} }
} else { } else {
if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail); if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail); if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
} }
TIME_STOP(2); TIME_STOP(2);
TIME_START(3);
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(3);
TIME_START(4);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(4);
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
} }
// Loop until all channels with all ranks have been connected
bool allChannelsConnected;
allChannelsConnected = false;
while (!allChannelsConnected) {
allChannelsConnected = true;
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint64_t recvMask = comm->connectRecv[recvPeer];
uint64_t sendMask = comm->connectSend[sendPeer];
int sendDataOffset = 0;
int recvDataOffset = 0;
for (int c=0; c<MAXCHANNELS; c++) {
TIME_START(3);
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
}
}
TIME_STOP(3);
// Start with recv channels
TIME_START(4);
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
}
}
TIME_STOP(4);
}
}
}
// Clear all connect masks and free each connectInfo array
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
free(data[i]);
}
free(data);
free(sendData);
free(recvData);
if (highestTransportType != NULL) *highestTransportType = highestType; if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect"); TIME_PRINT("P2P Setup/Connect");
exit: exit:
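The packed per-peer layout described in the comments above can be summarized by a small helper; layoutPeerData is illustrative only and not part of this change:

// Sketch: within data[i], the first recvChannels entries hold recv connect info
// and the next sendChannels entries hold send connect info (the two halves are
// swapped when sendPeer == recvPeer, since the exchanged buffer then starts
// with the peer's send entries).
static void layoutPeerData(struct ncclConnect* base, int recvChannels,
                           struct ncclConnect** recvPart, struct ncclConnect** sendPart) {
  *recvPart = base;                 // recv entries at the start of the block
  *sendPart = base + recvChannels;  // send entries immediately after
}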


@ -152,13 +152,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank; int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not // Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : ""); req.useGdr ? "/GDRDMA" : "");
@ -171,12 +171,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank; int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : ""); req.useGdr ? "/GDRDMA" : "");
@ -221,7 +221,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct. // We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map; struct connectMap* map;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p // If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError; if (map == NULL) return ncclSystemError;
@ -247,7 +247,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct. // We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map; struct connectMap* map;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p // If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError; if (map == NULL) return ncclSystemError;
@ -410,7 +410,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
} }
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
@ -426,7 +426,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
if (resources->collNetComm == NULL) { if (resources->collNetComm == NULL) {
*((struct connectMap**)respBuff) = NULL; *((struct connectMap**)respBuff) = NULL;
return ncclSuccess; return ncclSuccess;
@ -484,7 +484,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
} }
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct recvResources* resources = (struct recvResources*)(connection->transportResources); struct recvResources* resources = (struct recvResources*)(connection->transportResources);
@ -494,7 +494,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
if (resources->collNetComm == NULL) { if (resources->collNetComm == NULL) {
*((struct connectMap**)respBuff) = NULL; *((struct connectMap**)respBuff) = NULL;
return ncclSuccess; return ncclSuccess;
@ -553,7 +553,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
info->mhandles[p] = resources->mhandles[p]; info->mhandles[p] = resources->mhandles[p];
if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
*((struct connectMap**)respBuff) = &resources->map; *((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess; return ncclSuccess;
} }


@ -172,13 +172,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank; int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
req.rank = myInfo->rank; req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank; req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
if (proxyRank == myInfo->rank) { if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
@ -218,8 +218,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.rank = myInfo->rank; req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank; req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
return ncclSuccess; return ncclSuccess;
@ -264,11 +263,28 @@ static ncclResult_t netDumpMap(struct connectMap* map) {
} }
static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers struct connectMap* map = (connectMap*) send->transportResources;
struct connectMap* map;
NCCLCHECK(ncclCalloc(&map, 1)); void* opId;
send->transportResources = map;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap))); // map isn't allocated thus this op hasn't been submitted yet
if (!map) {
// Setup device pointers
NCCLCHECK(ncclCalloc(&map, 1));
send->transportResources = map;
opId = send;
INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
} else {
opId = send;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);
if (map->sameProcess) { if (map->sameProcess) {
if (map->cudaDev != comm->cudaDev) { if (map->cudaDev != comm->cudaDev) {
@ -315,10 +331,26 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
/* Connect to this peer */ /* Connect to this peer */
static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct connectMap* map; struct connectMap* map = (connectMap*) recv->transportResources;
NCCLCHECK(ncclCalloc(&map, 1)); void* opId;
recv->transportResources = map; if (!map) {
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap))); NCCLCHECK(ncclCalloc(&map, 1));
recv->transportResources = map;
// Use recv connector as unique identifier
opId = recv;
INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
opId, &recv->proxyConn, connectInfo);
NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
} else {
opId = recv;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId);
//NCCLCHECK(netDumpMap(map)); //NCCLCHECK(netDumpMap(map));
struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
@ -490,12 +522,14 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm)); NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
*done = 1; *done = 1;
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources); struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
ncclResult_t ret = ncclSuccess;
if (resources->shared) { if (resources->shared) {
// Shared buffers // Shared buffers
@ -515,21 +549,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
} }
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId)); if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
resources->netSendComm = comms->sendComm[resources->channelId]; resources->netSendComm = comms->sendComm[resources->channelId];
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
} else { } else {
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
} }
} else { } else {
// Connect to remote peer // Connect to remote peer
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
connection->proxyAppendPtr = &connection->proxyAppend; connection->proxyAppendPtr = &connection->proxyAppend;
} }
NCCLCHECK(ret);
if (resources->netSendComm == NULL) { if (resources->netSendComm == NULL) {
*done = 0; *done = 0;
return ncclSuccess; return ncclInProgress;
} }
*done = 1; *done = 1;
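On the proxy side, sendProxyConnect now treats an unfinished ncclNetConnect as progress rather than completion: it leaves *done at 0 and returns ncclInProgress so it can be called again, and only reports *done = 1 once netSendComm is established. A sketch of how such an op would be re-driven, assuming the proxy service loop retries operations that report ncclInProgress (the arguments are the same ones the op received):

// Hedged sketch (assumption about the proxy loop, which is not shown here):
int done = 0;
ncclResult_t ret;
do {
  ret = sendProxyConnect(connection, comm, reqBuff, reqSize, respBuff, respSize, &done);
} while (ret == ncclInProgress && done == 0);   // retry until the net connect lands
NCCLCHECK(ret);

recvProxyConnect below adopts the same contract around ncclNetAccept.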
@ -630,6 +665,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (reqSize != sizeof(int)) return ncclInternalError; if (reqSize != sizeof(int)) return ncclInternalError;
struct recvResources* resources = (struct recvResources*)(connection->transportResources); struct recvResources* resources = (struct recvResources*)(connection->transportResources);
resources->proxyRank = *(int*)reqBuff; resources->proxyRank = *(int*)reqBuff;
ncclResult_t ret = ncclSuccess;
// Finish connection establishment from remote peer // Finish connection establishment from remote peer
if (resources->shared) { if (resources->shared) {
@ -650,23 +686,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
} }
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId)); if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
resources->netRecvComm = comms->recvComm[resources->channelId]; resources->netRecvComm = comms->recvComm[resources->channelId];
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
} else { } else {
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
} }
} else { } else {
// Connect to remote peer // Connect to remote peer
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
connection->proxyAppendPtr = &connection->proxyAppend; connection->proxyAppendPtr = &connection->proxyAppend;
} }
NCCLCHECK(ret);
if (resources->netRecvComm == NULL) { if (resources->netRecvComm == NULL) {
*done = 0; *done = 0;
return ncclSuccess; return ncclInProgress;
} }
*done = 1; *done = 1;
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm)); NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
// Create structures // Create structures

@ -363,7 +363,9 @@ enum ncclIbCommState {
ncclIbCommStateAccept = 3, ncclIbCommStateAccept = 3,
ncclIbCommStateSend = 4, ncclIbCommStateSend = 4,
ncclIbCommStateRecv = 5, ncclIbCommStateRecv = 5,
ncclIbCommStateConnected = 6, ncclIbCommStateConnecting = 6,
ncclIbCommStateConnected = 7,
ncclIbCommStatePendingReady = 8,
}; };
struct ncclIbCommStage { struct ncclIbCommStage {
@ -599,8 +601,10 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
int ready; int ready;
*sendComm = NULL; *sendComm = NULL;
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStateConnecting) goto ib_connect;
if (stage->state == ncclIbCommStateConnected) goto ib_send_ready;
if (stage->state != ncclIbCommStateStart) { if (stage->state != ncclIbCommStateStart) {
WARN("Error: trying to connect already connected sendComm"); WARN("Error: trying to connect already connected sendComm");
return ncclInternalError; return ncclInternalError;
@ -664,11 +668,37 @@ ib_connect_check:
ib_send: ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset)); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
if (stage->offset != sizeof(qpInfo)) if (stage->offset != sizeof(qpInfo)) return ncclSuccess;
return ncclSuccess;
stage->state = ncclIbCommStateConnecting;
stage->offset = 0;
// Clear the staging buffer for re-use
memset(stage->buffer, 0, sizeof(qpInfo));
ib_connect:
struct ncclIbQpInfo remQpInfo;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset));
if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
comm->ready = 1;
stage->state = ncclIbCommStateConnected;
stage->offset = 0;
ib_send_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset));
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer); free(stage->buffer);
stage->state = ncclIbCommStateConnected; stage->state = ncclIbCommStateStart;
*sendComm = comm; *sendComm = comm;
return ncclSuccess; return ncclSuccess;
} }
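ncclIbConnect is now a fully non-blocking state machine: Start, then Connect, then Send (local QP info), then Connecting (receive the remote QP info and move the QPs to RTR/RTS), then Connected (send the ready flag), and back to Start once *sendComm is published. A short sketch of how a caller might drive it, assuming it is simply re-invoked until a sendComm is returned:

// Hedged sketch: resuming the connect state machine until it completes.
// 'dev' and 'handle' are the arguments already passed on the first attempt.
void* sendComm = NULL;
do {
  NCCLCHECK(ncclIbConnect(dev, handle, &sendComm));  // resumes from stage->state
} while (sendComm == NULL);                          // NULL means "come back later"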
@ -685,8 +715,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
if (stage->state == ncclIbCommStateRecv) goto ib_recv; if (stage->state == ncclIbCommStateRecv) goto ib_recv;
if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready;
if (stage->state != ncclIbCommStateStart) { if (stage->state != ncclIbCommStateStart) {
WARN("Listencomm in unknown state %d\n", stage->state); WARN("Listencomm in unknown state %d", stage->state);
return ncclInternalError; return ncclInternalError;
} }
@ -704,10 +735,10 @@ ib_accept_check:
stage->state = ncclIbCommStateRecv; stage->state = ncclIbCommStateRecv;
stage->offset = 0; stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo))); NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
ib_recv: ib_recv:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset)); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
if (stage->offset != sizeof(remQpInfo)) if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
return ncclSuccess;
/* copy back the received info */ /* copy back the received info */
memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
@ -780,10 +811,18 @@ ib_recv:
if (stage->buffer) free(stage->buffer); if (stage->buffer) free(stage->buffer);
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo))); NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo)); memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
ib_send: ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset)); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess; if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
stage->offset = 0;
stage->state = ncclIbCommStatePendingReady;
ib_recv_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer); free(stage->buffer);
*recvComm = rComm; *recvComm = rComm;
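The accept side mirrors this: after sending its own QP info it parks in ncclIbCommStatePendingReady and waits for the peer's ready flag, so *recvComm also stays NULL until the whole handshake has finished. Driving it looks the same, under the same assumption:

// Hedged sketch: re-drive accept until the sender's ready flag arrives.
void* recvComm = NULL;
do {
  NCCLCHECK(ncclIbAccept(listenComm, &recvComm));
} while (recvComm == NULL);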
@ -815,36 +854,6 @@ ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
return ncclSuccess; return ncclSuccess;
} }
ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
struct ncclIbQpInfo remQpInfo;
// Do not block on this receive, return if not ready.
int bytes = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
comm->ready = 1;
// Block until this is done. It *should* not block indefinitely.
NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
return ncclSuccess;
}
ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
return ncclSuccess;
}
ncclResult_t ncclIbTest(void* request, int* done, int* size); ncclResult_t ncclIbTest(void* request, int* done, int* size);
/* DMA-BUF support */ /* DMA-BUF support */
@ -1020,7 +1029,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm)); if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; }
if (comm->ready == 0) { *request = NULL; return ncclSuccess; } if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
struct ibv_mr* mr = (struct ibv_mr*)mhandle; struct ibv_mr* mr = (struct ibv_mr*)mhandle;
@ -1153,7 +1162,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm)); if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; }
if (comm->ready == 0) { *request = NULL; return ncclSuccess; } if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
src/transport/nvls.cc Normal file (373 lines added)
/*************************************************************************
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
// Implementation of the NVLink SHARP (NVLS) transport
#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "proxy.h"
#if CUDART_VERSION >= 12010
// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
#define USE_POSIX_FD 1
#if USE_POSIX_FD
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
#else
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
#endif
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
}
ncclResult_t nvlsSendFree(struct ncclConnector* send) {
return ncclSuccess;
}
ncclResult_t nvlsRecvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
struct ncclTransport nvlsTransport = {
"NVLS",
nvlsCanConnect,
{ NULL, NULL, nvlsSendFree, NULL, NULL, NULL, NULL, NULL },
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};
#define NVLS_HANDLE_SIZE 64
struct nvlsResources {
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
};
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
CUmulticastObjectProp* prop = &resources->properties;
memset(prop, 0, sizeof(*prop));
prop->size = size;
prop->numDevices = nranks;
prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
prop->flags = 0;
// Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved
CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
ALIGN_SIZE(size, resources->granularity);
prop->size = resources->size = size;
memset(&resources->accessDesc, 0, sizeof(resources->accessDesc));
resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
resources->accessDesc.location.id = dev;
resources->dev = dev;
return ncclSuccess;
}
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
size_t size = resources->size;
// Create a Multicast group
CUmulticastObjectProp* prop = &resources->properties;
INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
CUCHECK(cuMulticastCreate(&resources->mcHandle, prop));
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
// Get a handle to pass to other ranks
CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
}
else {
memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle));
}
INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank);
return ncclSuccess;
}
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
int dev = resources->dev;
size_t size = resources->size;
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// cuMem UDS support
int fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
struct ncclProxyConnector proxyConn;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
} else {
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type));
} else {
memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle));
}
}
return ncclSuccess;
}
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size = resources->size;
size_t granularity;
CUdeviceptr ptr = 0;
CUmemAllocationProp prop;
memset(&prop, 0, sizeof(prop));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = resources->dev;
prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
// Map a VA for UC memory
CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0));
// Alloc local physical mem for this NVLS group
CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0));
CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0));
CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1));
CUDACHECK(cudaMemset((void*)ptr, 0, size));
resources->ucBuff = (char*)ptr;
INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size);
// Bind physical memory to the Multicast group
// NB: It will block until all ranks have been added to the Group
INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size);
CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/));
return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size = resources->size;
CUdeviceptr ptr = 0;
// Create a VA for the NVLS
CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0));
// Map the VA locally
CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0));
resources->mcBuff = (char*)ptr;
INFO(NCCL_NVLS, "NVLS Mapped MC buffer at %p size %zi", resources->mcBuff, size);
// Having completed the BindMem we can now call SetAccess
// NB: It will block until all ranks have bound to the Group
CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size;
CUdeviceptr ptr;
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff);
// Release the UC memory and mapping
ptr = (CUdeviceptr)resources->ucBuff;
size = resources->size;
CUCHECK(cuMemUnmap(ptr, size));
CUCHECK(cuMemAddressFree(ptr, size));
CUCHECK(cuMemRelease(resources->ucHandle));
// Release the MC memory and mapping
ptr = (CUdeviceptr)resources->mcBuff;
size = resources->size;
CUCHECK(cuMemUnmap(ptr, size));
CUCHECK(cuMemAddressFree(ptr, size));
CUCHECK(cuMemRelease(resources->mcHandle));
return ncclSuccess;
}
#include "bootstrap.h"
#include "channel.h"
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess;
CUdevice dev;
int driverVersion;
if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
CUDACHECK(cudaDriverGetVersion(&driverVersion));
comm->nvlsSupport = 0;
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 0) return ncclSuccess;
int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
int rank = comm->localRank, nranks = comm->localRanks;
for (int c=0; c<nChannels; c++) {
NCCLCHECK(initChannel(comm, c));
}
ncclResult_t res = ncclSuccess;
struct nvlsResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
size_t nvlsTotalSize = nvlsPerRankSize*nranks;
INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* nvlsShareableHandle = NULL;
NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
if (rank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nranks;
for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
channel->nvls.down = comm->nRanks+1+comm->localRank;
channel->nvls.out = -1; // Network not yet implemented.
channel->nvls.headRank = comm->localRank; // Network not yet implemented.
}
for (int r=0; r<nranks; r++) {
int nvlsPeer = comm->nRanks+1+r;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.up[r] = nvlsPeer;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers+nvlsPeer;
// Reduce UC -> MC
mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;
// Broadcast MC -> UC
mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
}
}
free(nvlsShareableHandle);
return res;
cleanup:
comm->nvlsSupport = 0;
free(nvlsShareableHandle);
return res;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
if (resources == NULL) return ncclSuccess;
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
return ncclSuccess;
}
#else
/*
* Pre CUDA 12.1 stubs
*/
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
return ncclSuccess;
}
#endif /* CUDA_VERSION >= 12010 */
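ncclNvlsSetup carves the unicast (UC) and multicast (MC) mappings of the same physical memory into per-rank, per-channel slots: each slot is buffSize+memSize bytes, slot index r*2*nChannels+c backs the reduce path (send[0]/recv[1]) and ((r*2+1)*nChannels+c) backs the broadcast path (recv[0]/send[1]), with head at offset buffSize and tail at buffSize+memSize/2 inside the trailing memSize region. A small sketch of that address arithmetic, reusing the names from the code above:

// Hedged sketch of the NVLS slot layout computed in ncclNvlsSetup().
size_t slotSize  = buffSize + memSize;                              // one FIFO plus its head/tail area
size_t reduceOff = (size_t)(r*2*nChannels + c) * slotSize;          // peer->send[0] (UC) / peer->recv[1] (MC)
size_t bcastOff  = (size_t)((r*2 + 1)*nChannels + c) * slotSize;    // peer->recv[0] (UC) / peer->send[1] (MC)
char* reduceUc = resources->ucBuff + reduceOff;                     // unicast mapping of the reduce slot
char* reduceMc = resources->mcBuff + reduceOff;                     // multicast mapping of the same slot
uint64_t* reduceHead = (uint64_t*)(reduceUc + buffSize);
uint64_t* reduceTail = (uint64_t*)(reduceUc + buffSize + memSize/2);
// nvlsPerRankSize = nChannels * 2 * slotSize, and the group allocates that for every
// local rank, which is where nvlsTotalSize = nvlsPerRankSize * nranks comes from.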
@ -239,11 +239,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) { if (intermediateRank == -1) {
info->rank = myInfo->rank; info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else { } else {
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : ""); channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "");
} }
@ -256,11 +256,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
if (useMemcpy) { if (useMemcpy) {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize; info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
} else { } else {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
} }
@ -290,16 +290,16 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) { if (intermediateRank == -1) {
info->rank = myInfo->rank; info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else { } else {
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
} }
} else { } else {
info->rank = intermediateRank; info->rank = intermediateRank;
} }
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc)); NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
return ncclSuccess; return ncclSuccess;
@ -330,7 +330,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo; send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
send->conn.head = &resources->proxyInfo.devShm->sendMem.head; send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
// Send SIMPLE buff to proxy, and replace it by local buffer // Send SIMPLE buff to proxy, and replace it by local buffer
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
} else { } else {
send->conn.tail = &remDevMem->tail; send->conn.tail = &remDevMem->tail;
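The P2P setup and connect paths (and the SHM ones below) keep a synchronous flavor but switch to ncclProxyCallBlocking, which presumably amounts to issuing the async call and polling the response in place; its actual body is not part of this excerpt. A sketch of what such a wrapper could look like under that assumption:

// Hedged sketch: what ncclProxyCallBlocking is assumed to do (hypothetical helper).
static ncclResult_t proxyCallBlockingSketch(struct ncclProxyConnector* proxyConn, int type,
                                            void* reqBuff, int reqSize, void* respBuff, int respSize) {
  void* opId = proxyConn;                          // hypothetical unique op identifier
  NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
  ncclResult_t ret;
  do {
    ret = ncclPollProxyResponse(proxyConn, respBuff, opId);
  } while (ret == ncclInProgress);                 // spin until the proxy thread answers
  return ret;
}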
@ -157,7 +157,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpySend) { if (useMemcpySend) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
send->conn.tail = &proxyInfo.ceRecvMem->tail; send->conn.tail = &proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo; send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@ -187,7 +187,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpyRecv) { if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
recv->conn.tail = &proxyInfo.ceRecvMem->tail; recv->conn.tail = &proxyInfo.ceRecvMem->tail;
} }