From e11238b3029795d33f958b5868d47c90c4f22628 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Wed, 8 Sep 2021 13:56:25 -0700 Subject: [PATCH] 2.11.4-1 Add new API for creating a reduction operation which multiplies the input by a rank-specific scalar before doing an inter-rank summation (see: ncclRedOpCreatePreMulSum). Improve CollNet (SHARP) performance of ncclAllReduce when captured in a CUDA Graph via user buffer registration. Add environment variable NCCL_NET_PLUGIN="" to allow user to choose among multiple NCCL net plugins by substituting into "libnccl-net-.so". Fix memory leak of NVB connections. Fix topology detection of IB Virtual Functions (SR-IOV). --- makefiles/version.mk | 4 +- src/bootstrap.cc | 24 +- src/collectives/device/Makefile | 9 +- src/collectives/device/all_gather.h | 10 +- src/collectives/device/all_reduce.h | 72 +-- src/collectives/device/broadcast.h | 10 +- src/collectives/device/common.h | 115 +++-- src/collectives/device/common_kernel.h | 71 ++- src/collectives/device/functions.cu | 135 ++--- src/collectives/device/gen_rules.sh | 2 +- src/collectives/device/onerank_reduce.cu | 60 +++ src/collectives/device/prims_ll.h | 6 +- src/collectives/device/prims_ll128.h | 6 +- src/collectives/device/prims_simple.h | 266 +++++++--- src/collectives/device/reduce.h | 10 +- src/collectives/device/reduce_kernel.h | 153 ++++-- src/collectives/device/reduce_scatter.h | 10 +- src/collectives/device/sendrecv.h | 17 +- src/debug.cc | 2 +- src/enqueue.cc | 626 ++++++++++++++++------- src/graph/topo.cc | 5 +- src/graph/xml.cc | 2 +- src/graph/xml.h | 5 + src/include/alloc.h | 7 +- src/include/bootstrap.h | 3 +- src/include/collectives.h | 131 +++-- src/include/comm.h | 63 ++- src/include/debug.h | 7 +- src/include/devcomm.h | 33 +- src/include/enqueue.h | 59 +++ src/include/gdrwrap.h | 1 + src/include/info.h | 2 + src/include/param.h | 1 + src/include/transport.h | 2 +- src/init.cc | 80 ++- src/misc/argcheck.cc | 8 +- src/misc/ibvwrap.cc | 2 +- src/nccl.h.in | 49 +- src/transport.cc | 28 +- src/transport/net_ib.cc | 2 + src/transport/p2p.cc | 33 +- 41 files changed, 1532 insertions(+), 599 deletions(-) create mode 100644 src/collectives/device/onerank_reduce.cu diff --git a/makefiles/version.mk b/makefiles/version.mk index 833ab99..22bddce 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 10 -NCCL_PATCH := 3 +NCCL_MINOR := 11 +NCCL_PATCH := 4 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 021a49a..ae9da9b 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -202,7 +202,7 @@ struct unexConn { struct remAllocState { int cudaDev; int listenFd; - int stop; + volatile int stop; }; struct extState { @@ -257,7 +257,7 @@ void* ncclRemoteMemAllocationService(void* args) { for (int s=0; slistenFd; pollfds[MAX_SEGMENTS].events = POLLIN; @@ -285,7 +285,7 @@ void* ncclRemoteMemAllocationService(void* args) { } } for (int s=0; s %s\n" $< $@ + mkdir -p `dirname $@` + $(NVCC) $(NVCUFLAGS) -dc $< -o $@ + # ... and create the device-side linked object with all those. 
$(DEVOBJ) : $(LIBOBJ) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 3d781af..83b0da9 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -10,7 +10,7 @@ namespace { template - __device__ void runRing(ncclWorkElem *args) { + __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -27,7 +27,7 @@ namespace { T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 1, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf); + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -78,7 +78,7 @@ namespace { template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -86,14 +86,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index bd088e9..c3171bf 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -10,7 +10,7 @@ namespace { template - __device__ void runRing(ncclWorkElem *args) { + __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -31,7 +31,7 @@ namespace { } Primitives, 1, Proto> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff); + (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -95,7 +95,7 @@ namespace { } template - __device__ void runTreeUpDown(ncclWorkElem *args) { + __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -115,7 +115,7 @@ namespace { { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) Primitives, /*Direct=*/0, Proto> prims - (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff); + (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -141,7 +141,7 @@ namespace { { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto> prims - (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff); + (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -167,7 +167,7 @@ namespace { } template - __device__ void runTreeSplit(ncclWorkElem *args) { + __device__ 
__forceinline__ void runTreeSplit(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -199,7 +199,7 @@ namespace { if (tree->up == -1) { // Reduce and broadcast. Max number of recv is 3, max number of send is 3 Primitives, /*Direct=*/1, Proto> - prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff); + prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); @@ -216,7 +216,7 @@ namespace { * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ Primitives, /*Direct=*/1, Proto> - prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, 0*Proto::MaxGroupWidth); + prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -235,7 +235,7 @@ namespace { else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto> - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, 1*Proto::MaxGroupWidth); + prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -256,7 +256,7 @@ namespace { template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -264,7 +264,7 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { #if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800 runTreeUpDown>(args); #else @@ -275,7 +275,7 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { static constexpr int COLLNET_COPY_THREADS = 96; const int tid = threadIdx.x; const int bid = args->coll.bid; @@ -299,27 +299,37 @@ struct RunWorkElement= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter - Primitives, /*Direct=*/0, Proto> - prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, 2*Proto::MaxGroupWidth); + int group = (2*Proto::MaxGroupWidth) | (1<<16); + Primitives, /*Direct=*/1, Proto> + prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; int nelem = min(tree->nHeads*chunkSize, size-offset); - prims.scatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + if (args->regUsed) { + prims.directScatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + } else { + prims.scatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + } } } else if (tid >= tidStartReduce && tree->out != -1) { + int group = 
(3*Proto::MaxGroupWidth) | (1<<16); if (hasDn) { // Reduce, send to network - Primitives, /*Direct=*/0, Proto> - prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, 3*Proto::MaxGroupWidth); + Primitives, /*Direct=*/1, Proto> + prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - prims.recvReduceSend(offset, nelem); + if (args->regUsed) { + prims.directRecvReduceSend(offset, offset, nelem); + } else { + prims.recvReduceSend(offset, nelem); + } } } else { // Directly send to network Primitives, /*Direct=*/0, Proto> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, 3*Proto::MaxGroupWidth); + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -328,27 +338,29 @@ struct RunWorkElement, /*Direct=*/0, Proto> - prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, 0*Proto::MaxGroupWidth); + int group = (0*Proto::MaxGroupWidth) | (0<<16); + Primitives, /*Direct=*/1, Proto> + prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; int nelem = min(tree->nHeads*chunkSize, size-offset); - prims.gather(offset, nelem, chunkSize, tree->headRank, tree->shift); + prims.directGather(offset, nelem, chunkSize, tree->headRank, tree->shift); } } else if (tid >= tidStartBcast && tid < tidStartScatter && tree->out != -1) { + int group = (1*Proto::MaxGroupWidth) | (0<<16); if (hasDn) { // Recv from network, broadcast - Primitives, /*Direct=*/0, Proto> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, 1*Proto::MaxGroupWidth); + Primitives, /*Direct=*/1, Proto> + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - prims.recvCopySend(offset, nelem, /*postOp=*/true); + prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true); } } else { // Recv from network (no post thread needed) Primitives, /*Direct=*/0, Proto> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, 1*Proto::MaxGroupWidth); + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->coll.redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -361,28 +373,28 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ void 
run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runTreeSplit(args); } }; template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runTreeSplit(args); } }; diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index f867315..61c60b9 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -10,7 +10,7 @@ namespace { template - __device__ void runRing(ncclWorkElem *args) { + __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -27,7 +27,7 @@ namespace { T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf); + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -61,7 +61,7 @@ namespace { template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -69,14 +69,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 2b5d516..ff410d7 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -67,9 +67,18 @@ struct RunWorkElement { } }; +#if CUDART_VERSION >= 11030 +__device__ constexpr int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] = +#else +static __device__ __constant__ int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] = +#endif +{/*Tree*/1, /*Ring and P2P*/1, /*CollNet*/NCCL_REG_ELEM_FACTOR}; + template struct RunWork { - __device__ void run(ncclWork *w) { + // This __forceinline__ is necessary. The compiler was inserting a function call + // here from the LL ncclKernel. + __device__ __forceinline__ void run(ncclWork *w) { int tid = threadIdx.x; /* Some invariants that must hold: * 1. All elems[] have same funcIndex. @@ -78,23 +87,21 @@ struct RunWork { * for all elems[]. * * If (1) isn't true then we might be in the wrong function since dispatch - * on ncclFuncs[w->elems[0].funcIndex] is how we got here. + * on ncclFuncs[w->funcIndex] is how we got here. * * If (2) or (3) aren't true, then threads from different work elements * could race for barrier resources (barrier numbers 0...15) which is fatal. * - * Important, to ensure (3), implementations of - * `RunWorkElement::run()` may only use values which - * are the same for all elems[] when deciding how to map threads to groups, - * such as the following: + * IMPORTANT!!! To ensure (3), implementations of + * `RunWorkElement::run()` may only use the following + * when deciding how to map threads to groups: * Fn, T, RedOp, Algo, Proto, nThreads * - * This last one is difficult to enforce and diagnosing it is a headeache. 
- * Device-side developers, consider yourselves warned. + * This last one is difficult to enforce so I hope everyone reads this. */ if (tid < w->elems[0].nThreads) { #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e+=ncclWorkElemFactors[Algo]) RunWorkElement().run(&w->elems[e]); } } @@ -108,6 +115,7 @@ struct ncclShmemGroup { ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY]; void* srcs[NCCL_MAX_DIRECT_ARITY+1]; void* dsts[NCCL_MAX_DIRECT_ARITY+1]; + int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK]; }; struct ncclShmemData { @@ -115,6 +123,7 @@ struct ncclShmemData { uint64_t ll128warp[NCCL_LL128_MAX_NTHREADS/WARP_SIZE][NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE]; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; }; + uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1]; ncclDevComm comm; ncclChannel channel; ncclWork work; @@ -135,7 +144,10 @@ __device__ void ncclKernel(ncclWorkElem first) { // To optimize for latency, (only) the first operation is passed as argument. if (bid == 0 && first.active != 0) { turn = copyToShmem(&ncclShmem.work.elems[0], &first, turn); - if (tid == 0) ncclShmem.work.elems[1].active = 0; + if (1 <= tid && tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) { + ncclShmem.work.elems[tid].active = 0; + ncclShmem.work.elems[tid].redOpArgIsPtr = 0; + } } __syncthreads(); // publish ncclShmem @@ -161,6 +173,29 @@ __device__ void ncclKernel(ncclWorkElem first) { if (tid == 0) channel->index = workFifoIx; // write back to real channel, not shmem shadow + if (tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) { + ncclWorkElem *we = &ncclShmem.work.elems[tid]; + if (we->redOpArgIsPtr && we->active != 0) { + /* redOpArg is a pointer to the scalar value, so we'll dereference it + * here so that redOpArg holds the bits of the scalar going forward. + * The tricky thing is we don't know its type T since that's encoded in + * the funcIndex. Because it would be difficult to get sizeof(T) from + * funcIndex, we'll cheat and just dereference the largest possible size + * given the alignment of the pointer. We might be reading in more bytes + * than we need but that's harmless. 
+ */ + if (we->coll.redOpArg%2 != 0) + we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); + else if (we->coll.redOpArg%4 != 0) + we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); + else if (we->coll.redOpArg%8 != 0) + we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); + else + we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); + } + } + __syncthreads(); + if (ncclShmem.work.elems[0].funcIndex == FnIndex) RunWork().run(&ncclShmem.work); else @@ -174,52 +209,52 @@ __device__ void ncclKernel(ncclWorkElem first) { // Only generate kernels for SUM #if NCCL_OP == 0 -#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \ -__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(ncclWorkElem first) { \ - ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(first); \ +#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ +__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem first) { \ + ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(first); \ } #else -#define IMPL_COLL_KERN(func, algo, proto, redop, type, fInded) +#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded) #endif // Examples : AllReduce, RING, LL, Sum, uint8 -#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \ -__device__ void NCCL_FUNC_NAME(func, algo, proto, redop, type)() { \ - RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ +#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ +__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ + RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ } // Only generate inline kernels for LL -#define IMPL_COLL4(func, algo, redop, type, ncclType) \ - IMPL_COLL_FUNC(func, algo, LL, redop, type) \ - IMPL_COLL_FUNC(func, algo, LL128, redop, type) \ - IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \ - IMPL_COLL_KERN(func, algo, LL, redop, type, FUNC_INDEX(ncclFunc##func, nccl##redop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \ +#define IMPL_COLL4(func, algo, devredop, type, ncclType) \ + IMPL_COLL_FUNC(func, algo, LL, devredop, type) \ + IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \ + IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \ + IMPL_COLL_KERN(func, algo, LL, devredop, type, FUNC_INDEX(ncclFunc##func, ncclDev##devredop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \ -#define IMPL_COLL3(func, redop, type, ncclType) \ - IMPL_COLL4(func, TREE, redop, type, ncclType) \ - IMPL_COLL4(func, RING, redop, type, ncclType) \ - IMPL_COLL4(func, COLLNET, redop, type, ncclType) +#define IMPL_COLL3(func, devredop, type, ncclType) \ + IMPL_COLL4(func, TREE, devredop, type, ncclType) \ + IMPL_COLL4(func, RING, devredop, type, ncclType) \ + IMPL_COLL4(func, COLLNET, devredop, type, ncclType) #if NCCL_TYPE == 0 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int8_t, ncclInt8) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8) #elif NCCL_TYPE == 1 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint8_t, ncclUint8) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint8_t, ncclUint8) #elif NCCL_TYPE == 2 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int32_t, ncclInt32) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int32_t, ncclInt32) #elif NCCL_TYPE == 3 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint32_t, ncclUint32) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint32_t, ncclUint32) #elif NCCL_TYPE == 4 
-#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int64_t, ncclInt64) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int64_t, ncclInt64) #elif NCCL_TYPE == 5 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint64_t, ncclUint64) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint64_t, ncclUint64) #elif NCCL_TYPE == 6 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, half, ncclFloat16) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, half, ncclFloat16) #elif NCCL_TYPE == 7 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, float, ncclFloat32) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, float, ncclFloat32) #elif NCCL_TYPE == 8 -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, double, ncclFloat64) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, double, ncclFloat64) #elif NCCL_TYPE == 9 && defined(__CUDA_BF16_TYPES_EXIST__) -#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, __nv_bfloat16, ncclBfloat16) +#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, __nv_bfloat16, ncclBfloat16) #endif // Reduction define all functions @@ -232,7 +267,13 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, redop, type)() { \ #elif NCCL_OP == 3 #define IMPL_COLL_R(func) IMPL_COLL2(func, Max); #elif NCCL_OP == 4 -#define IMPL_COLL_R(func) IMPL_COLL2(func, Avg); +#define IMPL_COLL_R(func) IMPL_COLL2(func, PreMulSum); +#elif NCCL_OP == 5 + #if NCCL_TYPE < 6 + #define IMPL_COLL_R(func) IMPL_COLL2(func, SumPostDiv); + #else + #define IMPL_COLL_R(func) // skip SumPostDiv for floating point + #endif #endif #if NCCL_OP == 0 && NCCL_TYPE == 0 diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index c90988c..dcf1f66 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -26,7 +26,6 @@ typedef uint64_t PackType; template struct FuncTraits /*{ - __device__ static Fn make(); __device__ static T preOp(Fn, T); __device__ static T postOp(Fn, T); }*/; @@ -487,12 +486,12 @@ inline __device__ void Store128(Pack128* p, Pack128& v) { asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory"); } -template +template __device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t, - FUNC fn, bool preOpSrc0, bool postOp, int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem + uint64_t* redOpArgs, bool postOp, int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const Int Nelem ) { - const int inc = nw * UNROLL * WARP_SIZE; - int offset = w * UNROLL * WARP_SIZE + t; + const Int inc = nw * UNROLL * WARP_SIZE; + Int offset = w * UNROLL * WARP_SIZE + t; const T* srcs[MAXSRCS]; for (int i=0; i().preOp(fn, vals[u]); } #pragma unroll for (int i=1; i().preOp(fn, vals2[u]); + } for (int u = 0; u < UNROLL; ++u) vals[u] = fn(vals[u], vals2[u]); } #pragma unroll for (int i=MINSRCS; i().preOp(fn, vals2[u]); + } for (int u = 0; u < UNROLL; ++u) vals[u] = fn(vals[u], vals2[u]); } } if (postOp) { + FUNC fn(redOpArgs[0]); #pragma unroll for (int u = 0; u < UNROLL; ++u) vals[u] = FuncTraits().postOp(fn, vals[u]); } @@ -544,12 +553,12 @@ __device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const } } -template +template __device__ __forceinline__ void ReduceCopy128bMulti(const int w, const int nw, const int t, - FUNC fn, bool preOpSrc0, bool postOp, int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const 
int Npack + uint64_t* redOpArgs, bool postOp, int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const Int Npack ) { - const int inc = nw * UNROLL * WARP_SIZE; - int offset = w * UNROLL * WARP_SIZE + t; + const Int inc = nw * UNROLL * WARP_SIZE; + Int offset = w * UNROLL * WARP_SIZE + t; const Pack128* srcs[MAXSRCS]; for (int i=0; i().preOp(fn, vals[u]); } #pragma unroll for (int i=1; i().preOp(fn, vals2[u]); + } for (int u = 0; u < UNROLL; ++u) MULTI128()(fn, vals[u], vals2[u]); } #pragma unroll for (int i=MINSRCS; i().preOp(fn, vals2[u]); + } for (int u = 0; u < UNROLL; ++u) MULTI128()(fn, vals[u], vals2[u]); } } if (postOp) { + FUNC fn(redOpArgs[0]); #pragma unroll for (int u = 0; u < UNROLL; ++u) MULTI128().postOp(fn, vals[u]); } @@ -606,11 +625,11 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } #define PACKELEMS (sizeof(Pack128) / sizeof(T)) -template +template __device__ __forceinline__ void ReduceOrCopyMulti( - const int tid, const int nthreads, FUNC fn, bool preOpSrc0, bool postOp, int nsrcs, const T** srcs, int ndsts, T** dsts, int N + const int tid, const int nthreads, uint64_t* redOpArgs, bool postOp, int nsrcs, const T** srcs, int ndsts, T** dsts, Int N ) { - int Nrem = N; + Int Nrem = N; if (Nrem <= 0) return; int w = tid / WARP_SIZE; // Warp number @@ -626,17 +645,17 @@ __device__ __forceinline__ void ReduceOrCopyMulti( for (int i=0; i - (w, nw, t, fn, preOpSrc0, postOp, nsrcs, srcs, ndsts, dsts, offset, Npack); + ReduceCopy128bMulti + (w, nw, t, redOpArgs, postOp, nsrcs, srcs, ndsts, dsts, offset, Npack); Nrem -= Nelem; if (Nrem == 0) return; @@ -646,8 +665,8 @@ __device__ __forceinline__ void ReduceOrCopyMulti( Npack = Nrem / PACKELEMS; Nelem = Npack * PACKELEMS; - ReduceCopy128bMulti - (w, nw, t, fn, preOpSrc0, postOp, nsrcs, srcs, ndsts, dsts, offset, Npack); + ReduceCopy128bMulti + (w, nw, t, redOpArgs, postOp, nsrcs, srcs, ndsts, dsts, offset, Npack); Nrem -= Nelem; if (Nrem == 0) return; @@ -655,18 +674,18 @@ __device__ __forceinline__ void ReduceOrCopyMulti( } // unrolled, by-type (mostly for unaligned buffers) - int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down + Int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down - ReduceCopyMulti - (w, nw, t, fn, preOpSrc0, postOp, nsrcs, srcs, ndsts, dsts, offset, Nelem); + ReduceCopyMulti + (w, nw, t, redOpArgs, postOp, nsrcs, srcs, ndsts, dsts, offset, Nelem); Nrem -= Nelem; if (Nrem == 0) return; offset += Nelem; // no unroll, by type. Should finish what's remaining. 
- ReduceCopyMulti - (w, nw, t, fn, preOpSrc0, postOp, nsrcs, srcs, ndsts, dsts, offset, Nrem); + ReduceCopyMulti + (w, nw, t, redOpArgs, postOp, nsrcs, srcs, ndsts, dsts, offset, Nrem); } #endif // COMMON_KERNEL_H_ diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 15d7a6e..c7060f3 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -10,95 +10,100 @@ __shared__ ncclShmemData ncclShmem; -#define NCCL_FUNC5(func, algo, redop, type) \ - NCCL_FUNC_NAME(func, algo, LL, redop, type), \ - NCCL_FUNC_NAME(func, algo, LL128, redop, type), \ - NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type) +#define NCCL_FUNC5(func, algo, devredop, type, nullify) \ + MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ + MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ + MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) -#define NCCL_FUNC4(func, redop, type) \ - NCCL_FUNC5(func, TREE, redop, type), \ - NCCL_FUNC5(func, RING, redop, type), \ - NCCL_FUNC5(func, COLLNET, redop, type) +#define NCCL_FUNC4(func, devredop, type, nullify) \ + NCCL_FUNC5(func, TREE, devredop, type, nullify), \ + NCCL_FUNC5(func, RING, devredop, type, nullify), \ + NCCL_FUNC5(func, COLLNET, devredop, type, nullify) #if defined(__CUDA_BF16_TYPES_EXIST__) // Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, redop) \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, uint8_t), \ - NCCL_FUNC4(func, redop, int32_t), \ - NCCL_FUNC4(func, redop, uint32_t), \ - NCCL_FUNC4(func, redop, int64_t), \ - NCCL_FUNC4(func, redop, uint64_t), \ - NCCL_FUNC4(func, redop, half), \ - NCCL_FUNC4(func, redop, float), \ - NCCL_FUNC4(func, redop, double), \ - NCCL_FUNC4(func, redop, __nv_bfloat16) -#define NCCL_FUNCS3B(func, redop) \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t) +#define NCCL_FUNCS3A(func, devredop, nullForFloat) \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, uint8_t, 0), \ + NCCL_FUNC4(func, devredop, int32_t, 0), \ + NCCL_FUNC4(func, devredop, uint32_t, 0), \ + NCCL_FUNC4(func, devredop, int64_t, 0), \ + NCCL_FUNC4(func, devredop, uint64_t, 0), \ + NCCL_FUNC4(func, devredop, half, nullForFloat), \ + NCCL_FUNC4(func, devredop, float, nullForFloat), \ + NCCL_FUNC4(func, devredop, double, nullForFloat), \ + NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat) +#define NCCL_FUNCS3B(func, devredop) \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0) #else // Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, redop) \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, uint8_t), \ - NCCL_FUNC4(func, redop, int32_t), \ - NCCL_FUNC4(func, redop, uint32_t), \ - NCCL_FUNC4(func, redop, int64_t), \ - NCCL_FUNC4(func, redop, 
uint64_t), \ - NCCL_FUNC4(func, redop, half), \ - NCCL_FUNC4(func, redop, float), \ - NCCL_FUNC4(func, redop, double) -#define NCCL_FUNCS3B(func, redop) \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t), \ - NCCL_FUNC4(func, redop, int8_t) +#define NCCL_FUNCS3A(func, devredop, nullForFloat) \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, uint8_t, 0), \ + NCCL_FUNC4(func, devredop, int32_t, 0), \ + NCCL_FUNC4(func, devredop, uint32_t, 0), \ + NCCL_FUNC4(func, devredop, int64_t, 0), \ + NCCL_FUNC4(func, devredop, uint64_t, 0), \ + NCCL_FUNC4(func, devredop, half, nullForFloat), \ + NCCL_FUNC4(func, devredop, float, nullForFloat), \ + NCCL_FUNC4(func, devredop, double, nullForFloat) +#define NCCL_FUNCS3B(func, devredop) \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0), \ + NCCL_FUNC4(func, devredop, int8_t, 0) #endif // Must be consistent with ncclRedOp_t #define NCCL_FUNCS2A(func) \ - NCCL_FUNCS3A(func, Sum ), \ - NCCL_FUNCS3A(func, Prod), \ - NCCL_FUNCS3A(func, Max ), \ - NCCL_FUNCS3A(func, Min ), \ - NCCL_FUNCS3A(func, Avg) + NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \ + NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \ + NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \ + NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \ + NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \ + NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1) #define NCCL_FUNCS2B(func) \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ + NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum) -// Must be consistent with ncclFunc_t -#define NCCL_FUNCS() { \ - NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\ - NCCL_FUNCS2B(Broadcast), \ - NCCL_FUNCS2A(Reduce), \ - NCCL_FUNCS2B(AllGather), \ - NCCL_FUNCS2A(ReduceScatter), \ - NCCL_FUNCS2A(AllReduce) } - // Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { +__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. 
#if __CUDA_ARCH__ NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, half), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, float), + NCCL_ONERANK_REDUCE_NAME(PreMulSum, double), + #if defined(__CUDA_BF16_TYPES_EXIST__) + NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16), + #endif NCCL_FUNCS2B(Broadcast), NCCL_FUNCS2A(Reduce), NCCL_FUNCS2B(AllGather), diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh index e99dc61..aaf3685 100755 --- a/src/collectives/device/gen_rules.sh +++ b/src/collectives/device/gen_rules.sh @@ -17,7 +17,7 @@ targets="GENOBJS := \\\\\n" for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do opn=0 - for op in sum prod min max avg; do + for op in sum prod min max premulsum sumpostdiv; do dtn=0 # Order must match that of the ncclDataType_t enum for dt in ${datatypes}; do diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu new file mode 100644 index 0000000..f451582 --- /dev/null +++ b/src/collectives/device/onerank_reduce.cu @@ -0,0 +1,60 @@ +/************************************************************************* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "devcomm.h" +#include "collectives.h" +#include "reduce_kernel.h" +#include "common.h" + +namespace { + template + __device__ __forceinline__ void oneRankReduce() { + ncclWork *w = &ncclShmem.work; + int tid = threadIdx.x; + int tn = blockDim.x; + #pragma unroll 1 + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) { + ncclWorkElem *we = &w->elems[e]; + intptr_t eltN = we->coll.count; + int bid = we->coll.bid; + int bn = we->coll.nChannels; + T const *src = (T const*)we->sendbuff; + T *dst = (T*)we->recvbuff; + + // each block/channel gets a roughly equal segment of 16 byte packs + constexpr int EltPerPack = 16/sizeof(T); + intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; + intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); + intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); + i0 *= EltPerPack; + i0 = i0 < eltN ? i0 : eltN; + i1 *= EltPerPack; + i1 = i1 < eltN ? 
i1 : eltN; + src += i0; + dst += i0; + ReduceOrCopyMulti + (tid, tn, &(we->coll.redOpArg), true, 1, &src, 1, &dst, i1-i0); + } + } +} + +#define INSTANTIATE(devredop, type) \ + __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ + oneRankReduce>(); \ + } + +INSTANTIATE(PreMulSum, int8_t) +INSTANTIATE(PreMulSum, uint8_t) +INSTANTIATE(PreMulSum, int32_t) +INSTANTIATE(PreMulSum, uint32_t) +INSTANTIATE(PreMulSum, int64_t) +INSTANTIATE(PreMulSum, uint64_t) +INSTANTIATE(PreMulSum, half) +#if defined(__CUDA_BF16_TYPES_EXIST__) +INSTANTIATE(PreMulSum, __nv_bfloat16) +#endif +INSTANTIATE(PreMulSum, float) +INSTANTIATE(PreMulSum, double) diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index 507cfba..8fa84e5 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -218,7 +218,7 @@ class Primitives: } template - __device__ void LLGenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { + __device__ __forceinline__ void LLGenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; constexpr int DST = DstBuf != -1 ? 1 : 0; T *srcElts = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx; @@ -316,9 +316,9 @@ class Primitives: public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, - void const *inputBuf, void *outputBuf, int group=0 + void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0 ): - redOp(FuncTraits().make(ncclShmem.comm.nRanks)), + redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 439072e..3c049d1 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -277,7 +277,7 @@ class Primitives: static constexpr int DataEltPerSlice = (WireWordPerSlice - WireWordPerSlice/NCCL_LL128_LINEELEMS)*(sizeof(uint64_t)/sizeof(T)); template - __device__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { + __device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; constexpr int DST = DstBuf != -1 ? 
1 : 0; static_assert(-1<=SrcBuf && SrcBuf < 2, "Uhoh"); @@ -354,9 +354,9 @@ class Primitives: public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, - void const *inputBuf, void *outputBuf, int group=0 + void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0 ): - redOp(FuncTraits().make(ncclShmem.comm.nRanks)), + redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), group(group), stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index 9238d63..c30ff40 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -20,14 +20,14 @@ class Primitives< Aborted = 0x40, PtrsFifoEnabled = 0x80, SizesFifoEnabled = 0x100, - DirectEnabled = 0x200, - ThreadsSynced = 0x400; + DirectWrite = 0x200, + DirectRead = 0x400, + ThreadsSynced = 0x800; const int tid; int nthreads; int nworkers; const int stepSize; Fan fan; - RedOp const redOp; int index; // Peer index I'm responsible for int flags; int group; @@ -69,16 +69,21 @@ class Primitives< } template - inline __device__ void waitPeer(intptr_t dstIx, intptr_t remoteOutIx, int offset, int nelts) { - if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { - bool const isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; + __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) { + const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; + const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input + const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write + if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || + ((flags & (Send*RoleWaitSend)) && !noSendWait)) { int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = *connStepPtr; if (checkAbort(spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } + } + if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { if (isSendNotRecv && (flags & SizesFifoEnabled)) connSizesFifoPtr[step%NCCL_STEPS] = nelts*sizeof(T); @@ -86,10 +91,26 @@ class Primitives< : (ncclShmem.groups[group].srcs + Src); if (flags & PtrsFifoEnabled) loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]); - else if ((isSendNotRecv ? DirectSend : DirectRecv) && (flags & DirectEnabled)) - ptrs[index] = directBuff + (isSendNotRecv ? 
remoteOutIx : dstIx) + offset; - else + else if (isSendNotRecv && DirectSend) { + if (flags & DirectWrite) { + ptrs[index] = directBuff + remoteIx + offset; + } else if (flags & DirectRead) { // empty send + ptrs[index] = nullptr; + } else { + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + } + } else if (!isSendNotRecv && DirectRecv) { + if (flags & DirectRead) { + ptrs[index] = directBuff + remoteIx + offset; + } else if (flags & DirectWrite) { + ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer + } else { + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + } + } + else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + } step += StepPerSlice; } } @@ -103,8 +124,8 @@ class Primitives< } template - inline __device__ void genericOp( - intptr_t srcIx, intptr_t dstIx, intptr_t remoteOutIx, int nelem, bool postOp + __device__ __forceinline__ void genericOp( + intptr_t srcIx, intptr_t dstIx, intptr_t remoteIx, int nelem, bool postOp ) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; @@ -153,21 +174,30 @@ class Primitives< ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset; if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput))) ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset; - waitPeer(dstIx, remoteOutIx, offset, sliceSize); + waitPeer(dstIx, remoteIx, offset, sliceSize); subBarrier(); if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0). - ReduceOrCopyMulti - (tid, nworkers, redOp, false, false, + ReduceOrCopyMulti + (tid, nworkers, nullptr, false, 1, (T const**)ncclShmem.groups[group].srcs, fan.nsend(), (T**)ncclShmem.groups[group].dsts+1, sliceSize); } + } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) { + // For broadcast in CollNet to do empty send + ReduceOrCopyMulti + (tid, nworkers, ncclShmem.redOpArgs, postOp, + Recv, (T const**)ncclShmem.groups[group].srcs, + Dst, (T**)ncclShmem.groups[group].dsts, + sliceSize); } else { - ReduceOrCopyMulti - (tid, nworkers, redOp, SrcBuf==Input, postOp, + constexpr int PreOpN = SrcBuf != Input ? 0 : + DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; + ReduceOrCopyMulti + (tid, nworkers, ncclShmem.redOpArgs, postOp, Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs, Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts, sliceSize); @@ -201,10 +231,12 @@ class Primitives< } } - // Scatter and gather do not support Direct - template - inline __device__ void + // Scatter/Gather generic op + template + __device__ __forceinline__ void ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) { + constexpr int DirectRecv = 1 && Direct && DirectRecv1; + constexpr int DirectSend = 1 && Direct && DirectSend1; int offset = 0; // slice offset int sliceSize = stepSize*StepPerSlice; int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size @@ -213,12 +245,14 @@ class Primitives< for (int slice=0; slice(0, 0, 0, realSize); - subBarrier(); if (Send) { + // Scatter pre-scales data of input buffer only in non-Direct case + constexpr int PreOpN = DirectSend ? 
0 : 1; + if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset; + if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence + // realSize is not accurate here; but intra-node does not rely on sizes FIFO + waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize); + subBarrier(); #pragma unroll for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); - if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, redOp, true, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize); + if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { + ReduceOrCopyMulti(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize); + if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize; + } } } else if (Recv) { - #pragma unroll - for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; - T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset; - int realPeerSize = min(realSize, totalElem-peerOffset); - if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, redOp, false, postOp, 1, (T const**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); + if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset; + int peerOffset = index*peerElem; + if (skip >= 0 && index >= skip) peerOffset += peerElem; + // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer + waitPeer(outIx, outIx+peerOffset, offset, realSize); + subBarrier(); + if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { + // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv + // Do nothing + } else { + #pragma unroll + for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; + T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset; + int realPeerSize = min(realSize, totalElem-peerOffset); + if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); + } } } } barrier(); - if (Send && (flags & RolePostSend) && realSize > 0 && index == 0) __threadfence_system(); + if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0) + __threadfence_system(); __syncwarp(); postPeer(); offset += realSize; } } - __device__ __forceinline__ void loadRecvConn(ncclPeer *peer) { + __device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { - // For other colls: group <= 2, hence always use conn 0 - // For P2P: Direct is set to 1, hence always use conn 0 - // Ideally we should be accepting connIndex from the constructor! - const int connIndex = Direct ? 0 : group/4; auto *conn = &peer->recv[connIndex].conn; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); @@ -266,7 +311,25 @@ class Primitives< connStepPtr = conn->tail; connStepCache = *connStepPtr; flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; - flags |= (Direct && (conn->direct & NCCL_DIRECT_GPU)) ? 
DirectEnabled : 0; + if (Direct) { + // User buffers have been registered + if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { + if (connIndex == 1) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : + (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; + } + } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { + if (connIndex == 1) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + // direct read not allowed in non-register case + // otherwise, in one-to-multi send, we could mix empty send and intermediate send + flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0; + } + } + } if (flags & PtrsFifoEnabled) connPtrsFifoPtr = conn->ptrsFifo; else @@ -275,12 +338,8 @@ class Primitives< } } - __device__ __forceinline__ void loadSendConn(ncclPeer *peer) { + __device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitSend|RolePostSend)) { - // For other colls: group <= 2, hence always use conn 0 - // For P2P: Direct is set to 1, hence always use conn 0 - // Ideally we should be accepting connIndex from the constructor! - const int connIndex = Direct ? 0 : group/4; auto *conn = &peer->send[connIndex].conn; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); @@ -300,9 +359,25 @@ class Primitives< if (conn->sizesFifo != nullptr) { flags |= SizesFifoEnabled; connSizesFifoPtr = conn->sizesFifo; + } else if (Direct) { + // User buffers have been registered + if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { + if (connIndex == 1) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : + (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; + } + } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { + if (connIndex == 1) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + // direct read not allowed in non-register case + // otherwise, in one-to-multi send, we could mix empty send and intermediate send + flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0; + } + } } - else if (Direct && (conn->direct & NCCL_DIRECT_GPU)) - flags |= DirectEnabled; } } } @@ -310,16 +385,16 @@ class Primitives< public: __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, - void const *inputBuf, void *outputBuf, int group=0 + void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr ): tid(tid), - stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)), - redOp(FuncTraits::make(ncclShmem.comm.nRanks)) { + stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nthreads = nthreads; this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? 
WARP_SIZE : 0); - this->group = group; + this->group = group & (uint16_t)0xFFFF; + int connIndex = group >> 16; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; @@ -349,10 +424,10 @@ class Primitives< if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; - loadRecvConn(&ncclShmem.channel.devPeers[peer]); - loadSendConn(&ncclShmem.channel.devPeers[peer]); + loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e); + loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e); - setDataPtrs(inputBuf, outputBuf); + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e); } __device__ ~Primitives() { @@ -369,10 +444,19 @@ class Primitives< barrier(); } - __device__ void setDataPtrs(void const *inputBuf, void *outputBuf) { - if (flags & RoleInput) userBuff = (T*)inputBuf; + __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) { + if (flags & RoleInput) { + userBuff = (T*)inputBuf; + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input + } if (flags & RoleOutput) userBuff = (T*)outputBuf; - if (Direct && flags == (flags|RoleWaitRecv|DirectEnabled)) { + bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); + bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite); + bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) + bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer + int regUsed = e != nullptr ? e->elem.regUsed : 0; + + if (Direct && recvProvider) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; // Wait for consumer to consume previous value before trampling it. @@ -381,9 +465,9 @@ class Primitives< // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. - *slot = reinterpret_cast(reinterpret_cast(outputBuf) ^ reinterpret_cast(slot)); + *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } - if (Direct && flags == (flags|RoleWaitSend|DirectEnabled)) { + if (Direct && sendAcceptor) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; void *ptr; @@ -391,7 +475,51 @@ class Primitives< ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } - directBuff = reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); + directBuff = regUsed ? (T*)(e->dnOutputs[index]) : + reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); + *slot = nullptr; + } + if (Direct && sendProvider) { + int spins = 0; + void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; + volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; + volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1; + // Wait for consumer to consume previous value before trampling it. + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); + // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) + // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) + directBuff = MaxRecv == 0 ? 
(T*)inputBuf : (T*)outputBuf; + // Exchange pre-scalers for use in direct pull + *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; + *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); + // Encode pointer by XOR'ing against some address they definitely wouldn't send + // since we want to allow them sending us nullptr while not colliding with + // the empty slot value. + *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); + } + if (Direct && recvAcceptor) { + int spins = 0; + void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; + volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; + volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1; + void *ptr; + while (true) { + ptr = *slot; + if (ptr != nullptr || checkAbort(spins)) break; + } + directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) : + reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); + if (MaxSend != 0) { // reduce group rather than gather group + // Store scalers for remote inputs + uint64_t arg0, arg1; + while (true) { + arg0 = *argSlot0; + arg1 = *argSlot1; + if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + } + ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff); + } + *argSlot0 = 0; *argSlot1 = 0; *slot = nullptr; } } @@ -434,6 +562,9 @@ class Primitives< __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) { genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, false); } + __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { + genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, postOp); + } __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, -1, eltN, postOp); @@ -442,6 +573,9 @@ class Primitives< __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, -1, eltN, postOp); } + __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, intptr_t remoteInpIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, remoteInpIx, eltN, postOp); + } __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp); @@ -453,11 +587,19 @@ class Primitives< __device__ __forceinline__ void scatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { - ScatterGatherOp<0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); + ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); + } + __device__ __forceinline__ void + directScatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { + ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void gather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp=false) { - ScatterGatherOp<1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp); + ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp); + } + __device__ __forceinline__ void + directGather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift) { 
+ ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, /*postOp=*/false); } }; diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 1ce4c2e..fbc5be9 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -10,7 +10,7 @@ namespace { template - __device__ void runRing(ncclWorkElem *args) { + __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -26,7 +26,7 @@ namespace { const int root = args->coll.root; Primitives, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff); + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg); auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { int realChunkSize; @@ -70,7 +70,7 @@ namespace { template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -78,14 +78,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h index 87a6823..878ec79 100644 --- a/src/collectives/device/reduce_kernel.h +++ b/src/collectives/device/reduce_kernel.h @@ -14,6 +14,7 @@ template struct FuncNull { + __device__ FuncNull(uint64_t opArg=0) {} __device__ T operator()(const T x, const T y) const { return 0; } @@ -21,6 +22,7 @@ struct FuncNull { template struct FuncSum { + __device__ FuncSum(uint64_t opArg=0) {} __device__ T operator()(const T x, const T y) const { return x + y; } @@ -28,6 +30,7 @@ struct FuncSum { template struct FuncProd { + __device__ FuncProd(uint64_t opArg=0) {} __device__ T operator()(const T x, const T y) const { return x * y; } @@ -35,6 +38,7 @@ struct FuncProd { template struct FuncMax { + __device__ FuncMax(uint64_t opArg=0) {} __device__ T operator()(const T x, const T y) const { return (x < y) ? y : x; } @@ -42,6 +46,7 @@ struct FuncMax { template struct FuncMin { + __device__ FuncMin(uint64_t opArg=0) {} __device__ T operator()(const T x, const T y) const { return (x < y) ? 
x : y; } @@ -52,7 +57,6 @@ struct FuncTraits { // generic implementation for FuncSum,Prod,Min,Max static constexpr bool IsPreOpIdentity = true; static constexpr bool IsPostOpIdentity = true; - __device__ static Fn make(int rankN) { return Fn(); } template __device__ static T preOp(Fn, T x) { return x; } template @@ -74,6 +78,7 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) { template<> struct FuncSum { + __device__ FuncSum(uint64_t opArg=0) {} __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -89,6 +94,7 @@ struct FuncSum { }; template<> struct FuncSum { + __device__ FuncSum(uint64_t opArg=0) {} __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -118,6 +124,7 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { template<> struct FuncProd { + __device__ FuncProd(uint64_t opArg=0) {} __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { return mulChar4(x, y); } @@ -127,6 +134,7 @@ struct FuncProd { }; template<> struct FuncProd { + __device__ FuncProd(uint64_t opArg=0) {} __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { return mulChar4(x, y); } @@ -137,6 +145,7 @@ struct FuncProd { template<> struct FuncMax { + __device__ FuncMax(uint64_t opArg=0) {} union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) @@ -160,6 +169,7 @@ struct FuncMax { }; template<> struct FuncMax { + __device__ FuncMax(uint64_t opArg=0) {} union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) @@ -184,6 +194,7 @@ struct FuncMax { template<> struct FuncMin { + __device__ FuncMin(uint64_t opArg=0) {} union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) @@ -207,6 +218,7 @@ struct FuncMin { }; template<> struct FuncMin { + __device__ FuncMin(uint64_t opArg=0) {} union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) @@ -231,6 +243,7 @@ struct FuncMin { template<> struct FuncSum { + __device__ FuncSum(uint64_t opArg=0) {} __device__ half2 operator()(const half2 x, const half2 y) const { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 return __hadd2(x, y); @@ -255,6 +268,7 @@ struct FuncSum { #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct FuncSum<__nv_bfloat16> { + __device__ FuncSum(uint64_t opArg=0) {} __device__ __nv_bfloat162 operator()(const __nv_bfloat162 x, const __nv_bfloat162 y) const { #if __CUDA_ARCH__ >= 800 return __hadd2(x, y); @@ -279,6 +293,7 @@ struct FuncSum<__nv_bfloat16> { template<> struct FuncProd { + __device__ FuncProd(uint64_t opArg=0) {} __device__ half2 operator()(const half2 x, const half2 y) const { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 return __hmul2(x, y); @@ -303,6 +318,7 @@ struct FuncProd { #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct FuncProd<__nv_bfloat16> { + __device__ FuncProd(uint64_t opArg=0) {} __device__ __nv_bfloat162 operator()(const __nv_bfloat162 x, const __nv_bfloat162 y) const { #if 
__CUDA_ARCH__ >= 800 return __hmul2(x, y); @@ -327,6 +343,7 @@ struct FuncProd<__nv_bfloat16> { template<> struct FuncMax { + __device__ FuncMax(uint64_t opArg=0) {} __device__ half2 operator()(const half2 x, const half2 y) const { float2 fx, fy, fr; fx = __half22float2(x); @@ -347,6 +364,7 @@ struct FuncMax { #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct FuncMax<__nv_bfloat16> { + __device__ FuncMax(uint64_t opArg=0) {} __device__ __nv_bfloat162 operator()(const __nv_bfloat162 x, const __nv_bfloat162 y) const { #if __CUDA_ARCH__ >= 800 return __hmax2(x, y); @@ -374,6 +392,7 @@ struct FuncMax<__nv_bfloat16> { template<> struct FuncMin { + __device__ FuncMin(uint64_t opArg=0) {} __device__ half2 operator()(const half2 x, const half2 y) const { float2 fx, fy, fr; fx = __half22float2(x); @@ -394,6 +413,7 @@ struct FuncMin { #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct FuncMin<__nv_bfloat16> { + __device__ FuncMin(uint64_t opArg=0) {} __device__ __nv_bfloat162 operator()(const __nv_bfloat162 x, const __nv_bfloat162 y) const { #if __CUDA_ARCH__ >= 800 return __hmin2(x, y); @@ -421,12 +441,14 @@ struct FuncMin<__nv_bfloat16> { template<> struct FuncMax { + __device__ FuncMax(uint64_t opArg=0) {} __device__ float operator()(float x, float y) const { return fmaxf(x, y); } }; template<> struct FuncMin { + __device__ FuncMin(uint64_t opArg=0) {} __device__ float operator()(float x, float y) const { return fminf(x, y); } @@ -434,71 +456,98 @@ struct FuncMin { template<> struct FuncMax { + __device__ FuncMax(uint64_t opArg=0) {} __device__ double operator()(double x, double y) const { return fmax(x, y); } }; template<> struct FuncMin { + __device__ FuncMin(uint64_t opArg=0) {} __device__ double operator()(double x, double y) const { return fmin(x, y); } }; template -struct FuncAvg: FuncSum { - static_assert(!std::is_floating_point::value, "Uhoh"); +struct IsFloatingPoint: std::false_type {}; +template<> +struct IsFloatingPoint: std::true_type {}; +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; +#endif +template<> +struct IsFloatingPoint: std::true_type {}; +template<> +struct IsFloatingPoint: std::true_type {}; + +template::value> +struct FuncSumPostDiv; + +template +struct FuncSumPostDiv: FuncSum { static constexpr bool IsPreOpIdentity = true; static constexpr bool IsPostOpIdentity = false; int n; + __device__ FuncSumPostDiv(uint64_t opArg): n(opArg) {} + // inherits FuncSum::operator() + __device__ T preOp(T x) const { return x; } + __device__ T postOp(T x) const { return T(x/n); } +}; - template - __device__ FuncAvg(int n): n(n) {} +template +struct FuncSumPostDiv { + static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types."); +}; - __device__ T preOp(T x) const { - return x; - } - __device__ T postOp(T x) const { - return T(x/n); - } +template +struct FuncPreMulSum: FuncSum { // integral T since all floats are specialized below + static constexpr bool IsPreOpIdentity = false; + static constexpr bool IsPostOpIdentity = true; + T scale; + __device__ FuncPreMulSum(uint64_t opArg) { scale = *(T*)&opArg; } + // inherits FuncSum::operator() + __device__ T preOp(T x) const { return x*scale; } + __device__ T postOp(T x) const { return x; } }; template<> -struct FuncAvg: FuncSum { +struct FuncPreMulSum: FuncSum { static constexpr bool IsPreOpIdentity = false; static constexpr bool IsPostOpIdentity = true; - double rcp; - __device__ FuncAvg(int n) { - rcp = 
__drcp_rn(double(n)); + double scale; + __device__ FuncPreMulSum(uint64_t opArg) { + scale = *(double*)&opArg; } // inherits FuncSum::operator() __device__ double preOp(double x) const { - return IsPreOpIdentity ? x : x*rcp; + return IsPreOpIdentity ? x : x*scale; } __device__ double postOp(double x) const { - return IsPostOpIdentity ? x : x*rcp; + return IsPostOpIdentity ? x : x*scale; } }; template<> -struct FuncAvg: FuncSum { +struct FuncPreMulSum: FuncSum { static constexpr bool IsPreOpIdentity = false; static constexpr bool IsPostOpIdentity = true; - float rcp; - __device__ FuncAvg(int n) { - rcp = __frcp_rn(float(n)); + float scale; + __device__ FuncPreMulSum(uint64_t opArg) { + scale = *(float*)&opArg; } // inherits FuncSum::operator() __device__ float preOp(float x) const { - return IsPreOpIdentity ? x : x*rcp; + return IsPreOpIdentity ? x : x*scale; } __device__ float postOp(float x) const { - return IsPostOpIdentity ? x : x*rcp; + return IsPostOpIdentity ? x : x*scale; } }; template<> -struct FuncAvg: FuncSum { +struct FuncPreMulSum: FuncSum { // Change these to switch between all prescale, all postscale, or both by sqrt(N). // Obviously, the only invalid combination is both true. An improvement would be // make this parameterized as a build time setting and passed here through @@ -508,11 +557,8 @@ struct FuncAvg: FuncSum { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 half2 scale; - __device__ FuncAvg(int n) { - if (!IsPreOpIdentity && !IsPostOpIdentity) - scale.x = __float2half(__frsqrt_rn(float(n))); - else - scale.x = __float2half(__frcp_rn(float(n))); + __device__ FuncPreMulSum(uint64_t opArg) { + scale.x = *(half*)&opArg; scale.y = scale.x; } // inherits FuncSum::operator() @@ -530,11 +576,8 @@ struct FuncAvg: FuncSum { } #else float scale; - __device__ FuncAvg(int n) { - if (!IsPreOpIdentity && !IsPostOpIdentity) - scale = __frsqrt_rn(float(n)); - else - scale = __frcp_rn(float(n)); + __device__ FuncPreMulSum(uint64_t opArg) { + scale = __half2float(*(half*)&opArg); } // inherits FuncSum::operator() __device__ half preOp(half x) const { @@ -568,7 +611,7 @@ struct FuncAvg: FuncSum { #if defined(__CUDA_BF16_TYPES_EXIST__) template<> -struct FuncAvg<__nv_bfloat16>: FuncSum<__nv_bfloat16> { +struct FuncPreMulSum<__nv_bfloat16>: FuncSum<__nv_bfloat16> { // Change these to switch between all prescale, all postscale, or both by sqrt(N). // Obviously, the only invalid combination is both true. 
An improvement would be // make this parameterized as a build time setting and passed here through @@ -578,11 +621,8 @@ struct FuncAvg<__nv_bfloat16>: FuncSum<__nv_bfloat16> { #if __CUDA_ARCH__ >= 800 __nv_bfloat162 scale; - __device__ FuncAvg(int n) { - if (!IsPreOpIdentity && !IsPostOpIdentity) - scale.x = __float2bfloat16(__frsqrt_rn(float(n))); - else - scale.x = __float2bfloat16(__frcp_rn(float(n))); + __device__ FuncPreMulSum(uint64_t opArg) { + scale.x = *(__nv_bfloat16*)&opArg; scale.y = scale.x; } // inherits FuncSum::operator() @@ -600,11 +640,8 @@ struct FuncAvg<__nv_bfloat16>: FuncSum<__nv_bfloat16> { } #else float scale; - __device__ FuncAvg(int n) { - if (!IsPreOpIdentity && !IsPostOpIdentity) - scale = __frsqrt_rn(float(n)); - else - scale = __frcp_rn(float(n)); + __device__ FuncPreMulSum(uint64_t opArg) { + scale = *(__nv_bfloat16*)&opArg; } // inherits FuncSum::operator() __device__ __nv_bfloat16 preOp(__nv_bfloat16 x) const { @@ -638,21 +675,31 @@ struct FuncAvg<__nv_bfloat16>: FuncSum<__nv_bfloat16> { #endif template -struct FuncTraits> { - static constexpr bool IsPreOpIdentity = FuncAvg::IsPreOpIdentity; - static constexpr bool IsPostOpIdentity = FuncAvg::IsPostOpIdentity; +struct FuncTraits> { + static constexpr bool IsPreOpIdentity = FuncPreMulSum::IsPreOpIdentity; + static constexpr bool IsPostOpIdentity = FuncPreMulSum::IsPostOpIdentity; - __device__ static FuncAvg make(int rankN) { - return FuncAvg(rankN); - } template - __device__ static U preOp(FuncAvg fn, U x) { + __device__ static U preOp(FuncPreMulSum fn, U x) { return fn.preOp(x); } template - __device__ static U postOp(FuncAvg fn, U x) { + __device__ static U postOp(FuncPreMulSum fn, U x) { return fn.postOp(x); } }; +template +struct FuncTraits> { + static constexpr bool IsPreOpIdentity = FuncSumPostDiv::IsPreOpIdentity; + static constexpr bool IsPostOpIdentity = FuncSumPostDiv::IsPostOpIdentity; + template + __device__ static U preOp(FuncSumPostDiv fn, U x) { + return fn.preOp(x); + } + template + __device__ static U postOp(FuncSumPostDiv fn, U x) { + return fn.postOp(x); + } +}; #endif // REDUCE_KERNEL_H_ diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index c61a028..0334448 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -10,7 +10,7 @@ namespace { template - __device__ void runRing(ncclWorkElem *args) { + __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->coll.bid; @@ -25,7 +25,7 @@ namespace { const ssize_t size = args->coll.count; Primitives, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff); + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -68,7 +68,7 @@ namespace { template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -76,14 +76,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git 
a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index e5e948f..76f49c0 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -10,7 +10,7 @@ template struct RunWork { - __device__ void run(ncclWork *work) { + __device__ __forceinline__ void run(ncclWork *work) { int tid = threadIdx.x; int group = 0; const int rank = ncclShmem.comm.rank; @@ -38,16 +38,7 @@ struct RunWork { if (delta == 0) { if (sendbuff != recvbuff) { - // local copy : ReduceOrCopyMulti takes an int as number of elements, - // so we split it in blocks of 1G elements. - int blockSize = 1<<30; - for (size_t offset=0; offset(tid, nThreadsSegment, RedOp(), false, false, 1, &sendbuff, 1, &recvbuff, blockSize); - sendbuff += blockSize; - recvbuff += blockSize; - } + ReduceOrCopyMulti(tid, nThreadsSegment, nullptr, false, 1, &sendbuff, 1, &recvbuff, sendCount); } } else { @@ -57,7 +48,7 @@ struct RunWork { int const nt = nThreadsSplit; int const chunkSize = args->p2p.recvChunkSize/sizeof(T); Primitives, 1, Proto> prims - (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, groupRecv); + (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, /*redOpArg(ignored)=*/0, groupRecv); ssize_t offset = 0; do { int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T))); @@ -73,7 +64,7 @@ struct RunWork { int const nt = nThreadsSegment - nThreadsSplit; int const chunkSize = args->p2p.sendChunkSize/sizeof(T); Primitives, 1, Proto> prims - (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, groupSend); + (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, /*redOpArg(ignored)=*/0, groupSend); ssize_t offset = 0; do { int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T))); diff --git a/src/debug.cc b/src/debug.cc index a47ceaf..795c401 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -138,7 +138,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file int cudaDev; cudaGetDevice(&cudaDev); int pid = getpid(); - int tid = gettid(); + int tid = syscall(SYS_gettid); char buffer[1024]; size_t len = 0; diff --git a/src/enqueue.cc b/src/enqueue.cc index df09166..e863b82 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -8,83 +8,99 @@ #include "argcheck.h" #include "coll_net.h" #include "gdrwrap.h" +#include "bootstrap.h" // Only generate inline kernels for LL -#define NCCL_FUNC5(func, algo, redop, dtype) \ - (void*)NCCL_KERN_NAME(func, algo, LL, redop, dtype), \ - (void*)NCCL_KERN_NAME(func, algo, LL, redop, dtype), \ - (void*)NCCL_KERN_NAME(func, algo, LL, redop, dtype) +#define NCCL_FUNC5(func, algo, devredop, dtype) \ + (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ + (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ + (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype) -#define NCCL_FUNC4(func, redop, type) \ - (void*)NCCL_FUNC5(func, TREE, redop, type), \ - (void*)NCCL_FUNC5(func, RING, redop, type), \ - (void*)NCCL_FUNC5(func, COLLNET, redop, type) +#define NCCL_FUNC4(func, devredop, type) \ + (void*)NCCL_FUNC5(func, TREE, devredop, type), \ + (void*)NCCL_FUNC5(func, RING, devredop, type), \ + (void*)NCCL_FUNC5(func, COLLNET, devredop, type) #if defined(__CUDA_BF16_TYPES_EXIST__) // Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, redop) \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, uint8_t), \ - (void*)NCCL_FUNC4(func, redop, int32_t), \ - (void*)NCCL_FUNC4(func, redop, uint32_t), \ - (void*)NCCL_FUNC4(func, redop, int64_t), \ - (void*)NCCL_FUNC4(func, redop, uint64_t), \ - 
(void*)NCCL_FUNC4(func, redop, half), \ - (void*)NCCL_FUNC4(func, redop, float), \ - (void*)NCCL_FUNC4(func, redop, double), \ - (void*)NCCL_FUNC4(func, redop, __nv_bfloat16) -#define NCCL_FUNCS3B(func, redop) \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t) +#define NCCL_FUNCS3A(func, devredop) \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, uint8_t), \ + (void*)NCCL_FUNC4(func, devredop, int32_t), \ + (void*)NCCL_FUNC4(func, devredop, uint32_t), \ + (void*)NCCL_FUNC4(func, devredop, int64_t), \ + (void*)NCCL_FUNC4(func, devredop, uint64_t), \ + (void*)NCCL_FUNC4(func, devredop, half), \ + (void*)NCCL_FUNC4(func, devredop, float), \ + (void*)NCCL_FUNC4(func, devredop, double), \ + (void*)NCCL_FUNC4(func, devredop, __nv_bfloat16) +#define NCCL_FUNCS3B(func, devredop) \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t) #else // Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, redop) \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, uint8_t), \ - (void*)NCCL_FUNC4(func, redop, int32_t), \ - (void*)NCCL_FUNC4(func, redop, uint32_t), \ - (void*)NCCL_FUNC4(func, redop, int64_t), \ - (void*)NCCL_FUNC4(func, redop, uint64_t), \ - (void*)NCCL_FUNC4(func, redop, half), \ - (void*)NCCL_FUNC4(func, redop, float), \ - (void*)NCCL_FUNC4(func, redop, double) -#define NCCL_FUNCS3B(func, redop) \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t), \ - (void*)NCCL_FUNC4(func, redop, int8_t) +#define NCCL_FUNCS3A(func, devredop) \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, uint8_t), \ + (void*)NCCL_FUNC4(func, devredop, int32_t), \ + (void*)NCCL_FUNC4(func, devredop, uint32_t), \ + (void*)NCCL_FUNC4(func, devredop, int64_t), \ + (void*)NCCL_FUNC4(func, devredop, uint64_t), \ + (void*)NCCL_FUNC4(func, devredop, half), \ + (void*)NCCL_FUNC4(func, devredop, float), \ + (void*)NCCL_FUNC4(func, devredop, double) +#define NCCL_FUNCS3B(func, devredop) \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t), \ + (void*)NCCL_FUNC4(func, devredop, int8_t) #endif -// Must be consistent with 
ncclRedOp_t -- but we only generate kernel for sums. +// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums. #define NCCL_FUNCS2A(func) \ - NCCL_FUNCS3A(func, Sum), \ - NCCL_FUNCS3A(func, Sum), \ - NCCL_FUNCS3A(func, Sum), \ - NCCL_FUNCS3A(func, Sum), \ - NCCL_FUNCS3A(func, Sum) + NCCL_FUNCS3A(func, Sum), /*Sum*/ \ + NCCL_FUNCS3A(func, Sum), /*Prod*/ \ + NCCL_FUNCS3A(func, Sum), /*Max*/ \ + NCCL_FUNCS3A(func, Sum), /*Min*/ \ + NCCL_FUNCS3A(func, Sum), /*PreMulSum*/ \ + NCCL_FUNCS3A(func, Sum) /*SumPostDiv*/ #define NCCL_FUNCS2B(func) \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum) + NCCL_FUNCS3B(func, Sum), /*Sum*/ \ + NCCL_FUNCS3B(func, Sum), /*Prod*/ \ + NCCL_FUNCS3B(func, Sum), /*Max*/ \ + NCCL_FUNCS3B(func, Sum), /*Min*/ \ + NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \ + NCCL_FUNCS3B(func, Sum) /*SumPostDiv*/ // Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { +static void* const ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + // We don't bake special kernels for the one-rank reductions + /*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + /*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + #if defined(__CUDA_BF16_TYPES_EXIST__) + /*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + #endif NCCL_FUNCS2B(Broadcast), NCCL_FUNCS2A(Reduce), NCCL_FUNCS2B(AllGather), @@ -145,7 +161,6 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor // Initialize with work elem if provided if (base) memcpy(e, base, sizeof(struct ncclWorkElem)); e->active = 1; - e->index = opIndex; channel->workFifoTail++; channel->workCount++; if (work) *work = w; @@ -183,10 +198,11 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph if (c == 0) { // As we inline the first coll directly, we can free it immediately. - // Except P2P or aggregation cases + // Except P2P or aggregation or registration cases struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS); struct ncclWorkElem* elem = work->elems; - if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1) elem->active = 0; + if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1 && elem->regUsed == 0) + elem->active = 0; } if (channel->gdrMemDesc) { @@ -370,7 +386,7 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) { static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) { if (info->comm->collNetSupport > 0) { - ncclRedOp_t netOp = info->op == ncclAvg ? ncclSum : info->op; + ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? 
ncclSum : info->op; NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport)); } else { *collNetTypeSupport = 0; @@ -380,30 +396,35 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) { struct ncclComm* comm = info->comm; - float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. - // Find algorithm / protocol. - info->algorithm = -1; - info->protocol = -1; - if (comm->nRanks == 1) return ncclSuccess; - int nAlgos = NCCL_NUM_ALGORITHMS; - for (int a=0; a= 0 && time < minTime) { - info->algorithm = a; - info->protocol = p; - minTime = time; + if (comm->nRanks == 1) { + info->algorithm = NCCL_ALGO_RING; + info->protocol = NCCL_PROTO_SIMPLE; + } + else { + float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. + // Find algorithm / protocol. + info->algorithm = -1; + info->protocol = -1; + int nAlgos = NCCL_NUM_ALGORITHMS; + for (int a=0; a= 0 && time < minTime) { + info->algorithm = a; + info->protocol = p; + minTime = time; + } } } + if (info->algorithm == -1 || info->protocol == -1) { + WARN("Error : no algorithm/protocol available"); + return ncclInternalError; + } + //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); + TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); } - if (info->algorithm == -1 || info->protocol == -1) { - WARN("Error : no algorithm/protocol available"); - return ncclInternalError; - } - //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); - TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); int nc = (info->nChannels > 0) ? info->nChannels : comm->nChannels; int nt = comm->maxThreads[info->algorithm][info->protocol]; @@ -494,8 +515,16 @@ comp_next: work->coll.count = info->count; work->coll.nChannels = info->nChannels; work->nThreads = info->nThreads; + work->coll.redOpArg = info->opFull.scalarArg; + work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; - work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol); + if (info->comm->nRanks == 1) { + // one-rank reduce index + work->funcIndex = 1 + int(info->datatype); + return ncclSuccess; + } + + work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; @@ -519,6 +548,8 @@ comp_next: while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + // Set direct direction for broadcast-gather (read or write) + work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? 
NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; } else if (info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; @@ -548,7 +579,7 @@ comp_next: proxyArgs->protocol = info->protocol; proxyArgs->dtype = info->datatype; proxyArgs->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet - info->op == ncclAvg ? ncclSum : // Network sees avg as sum + info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum info->op; proxyArgs->pattern = info->pattern; proxyArgs->root = info->root; @@ -574,12 +605,61 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) { return ncclSuccess; } +struct ncclBuffRegHandle { + cudaIpcMemHandle_t sendBuffIpc; + cudaIpcMemHandle_t recvBuffIpc; + ssize_t sendBuffOffset; + ssize_t recvBuffOffset; +}; + +// Register input and output buffers +// Exchange with ranks on the same host +static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuffRegInfo* regInfo) { + ncclComm_t comm = info->comm; + if (comm->localRanks == 1) return ncclSuccess; + if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old + + struct ncclBuffRegHandle regHandles[NCCL_MAX_INTRA_RANKS]; + // Get IPC handles + // Note: the handle only corresponds to the base address of the allocation + CUDACHECK(cudaIpcGetMemHandle(&regHandles[comm->intraNodeRank].sendBuffIpc, (void*)info->sendbuff)); + CUDACHECK(cudaIpcGetMemHandle(&regHandles[comm->intraNodeRank].recvBuffIpc, (void*)info->recvbuff)); + // Get offset of user buffer within allocation + void* baseAddr; + size_t size; + CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff)); + regHandles[comm->intraNodeRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; + CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff)); + regHandles[comm->intraNodeRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr; + TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->intraNodeRank].recvBuffOffset); + + // Exchange handles within node + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle))); + // Open handles at local process + for (int i=0; i<comm->localRanks; i++) { + if (i == comm->intraNodeRank) { + regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL; + continue; + } + CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess)); + // Get real address of buffer + regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset; + regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset; + } + regInfo->nBuffs = comm->localRanks; + TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs); + return ncclSuccess; +} + // Compute enqueue element, save it in list // Compute CUDA launch parameters // Capture time code in view of CUDA graph static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { ncclComm_t comm = info->comm; - if (comm->nRanks == 1) { + if (comm->nRanks == 1 && + // User-defined reduction ops may
need to alter the data even for unitary reductions + info->op < ncclNumOps) { if (info->sendbuff != info->recvbuff) CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); return ncclSuccess; @@ -607,6 +687,19 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode } + // Register and exchange input and output buffers + if (comm->usingCudaGraph && // only in CUDA graph mode + comm->graphRegister == 1 && // when registration is enabled + info->algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now + comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other + comm->intraRanks == 1) { // only in multi-process mode + NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo)); + // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo + // because the registered addresses are in ncclWork + if (eqElem->buffRegInfo.nBuffs > 0) comm->args.active = 0; + comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs; + } + return ncclSuccess; } @@ -623,41 +716,15 @@ static inline int findShortestChannel(ncclComm_t comm) { return minC; } -static inline ncclResult_t getNextChannel(ncclComm_t comm, int* nextChannel) { - if (comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) { - *nextChannel = findShortestChannel(comm); +static inline int getNextChannel(ncclComm_t comm, int aggMode) { + int nextChannel = 0; + if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) { + nextChannel = findShortestChannel(comm); } else { - *nextChannel = comm->lastChannel % comm->nChannels; + nextChannel = comm->lastChannel % comm->nChannels; comm->lastChannel++; } - return ncclSuccess; -} - -// Dynamic enqueue code -static ncclResult_t ncclEnqueueCollKernel(ncclComm_t comm, struct ncclQueueElem* eqElem) { - struct ncclWorkElem* work = &eqElem->work; - struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; - - int nChannels = work->coll.nChannels; - for (int bid=0; bid<nChannels; bid++) { - int channelId = comm->lastChannel % comm->nChannels; - struct ncclChannel* channel = comm->channels+channelId; - - // Proxy - proxyArgs->subs[0].channel = channel; - proxyArgs->opCount = comm->collOpCount; - proxyArgs->commOpCount = comm->opCount; - - if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks)); - - comm->lastChannel++; - work->coll.bid = bid % nChannels; - NCCLCHECK(getNextOp(channel, NULL, work)); - //INFO(NCCL_COLL, "Host enqueue: bid %d channel %d index %ld nThreads %d funcIndex %d count %ld nChannels %d", - // work->coll.bid, channelId, channel->workFifoTail, work->nThreads, work->funcIndex, work->coll.count, work->coll.nChannels); - } - comm->collOpCount++; - return ncclSuccess; + return nextChannel; } ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { @@ -689,7 +756,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { channelUsed += info->nChannels; // We can use fast path if all collectives are the same homogeneous &= info->coll == comm->asyncOps[0].coll && - info->op == comm->asyncOps[0].op && + info->opFull.op == comm->asyncOps[0].opFull.op && info->datatype == comm->asyncOps[0].datatype; if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport)); } @@ -766,13 +833,22 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { return ncclSuccess; } -enum { COLL_SEGMENT=0, P2P_SEGMENT=1 }; +enum { RingTree_Segment=0, P2P_Segment=1, CollNet_Segment=2 };
static int getSegment(int type, int delta, struct ncclWork* work) { - if (type == P2P_SEGMENT) { // P2P + // Current ncclWork is full + if (work->elems[NCCL_MAX_WORK_ELEMENTS-1].active != 0) return -1; + + if (type == P2P_Segment) { // P2P + // Do not mix P2P and collective ops + if (work->elems[0].funcIndex != FUNC_INDEX_P2P) return -1; for (int s=0; selems[s].p2p.delta != delta; s++) { if (work->elems[s].active == 0) return s; } - } else { // aggregation + } else if (type == CollNet_Segment) { // CollNet + for (int s=0; selems[s].active == 0) return s; + } + } else { // Ring or Tree for (int s=0; selems[s].active == 0) return s; } @@ -794,13 +870,14 @@ static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct return ncclSuccess; } -static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s) { +static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s, + struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) { // Copy element into corresponding segment of ncclWork memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); work->elems[s].active = 1; // Determine nThreads at dynamic time - if (type == P2P_SEGMENT) { + if (type == P2P_Segment) { const int nsegments = s+1; int nThreads = 512; while (nsegments*nThreads > 512) nThreads /= 2; @@ -808,6 +885,33 @@ static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */ for (int i=0; ielems[i].p2p.nThreads = nThreads; } + // Copy registered buffer addresses into ncclWork + if (regInfo->nBuffs > 0) { + struct ncclWorkRegElem* regElem = (struct ncclWorkRegElem*)(work->elems+s); + // For CollNet + for (int i=0; icollTree.down[i]; + if (peer == -1) break; + int j = comm->rankToIntraNodeRank[peer]; + if (j < 0) { + WARN("Invalid intra-node rank %d for peer %d", j, peer); + return ncclInternalError; + } + regElem->dnInputs[i] = regInfo->sendbuffs[j]; + regElem->dnOutputs[i] = regInfo->recvbuffs[j]; + } + for (int i=0; icollTree.up[i]; + if (peer == -1) break; + int j = comm->rankToIntraNodeRank[peer]; + if (j < 0) { + WARN("Invalid intra-node rank %d for peer %d", j, peer); + return ncclInternalError; + } + regElem->upOutputs[i] = regInfo->recvbuffs[j]; + } + work->elems[s].regUsed = 1; + } return ncclSuccess; } @@ -820,9 +924,9 @@ ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* e int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; struct ncclWork* w = channel->workFifo+opIndex; int segment = -1; - if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].active == 0) { + if (channel->workCount) { // Try to pack more segments into a single operation - segment = getSegment(P2P_SEGMENT, workElem->p2p.delta, w); + segment = getSegment(P2P_Segment, workElem->p2p.delta, w); } if (segment == -1) { NCCLCHECK(getNextOp(channel, &w, NULL)); @@ -831,7 +935,7 @@ ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* e // store work element into FIFO NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs)); - NCCLCHECK(enqueueSegOp(P2P_SEGMENT, workElem, w, segment)); + NCCLCHECK(enqueueSegOp(P2P_Segment, workElem, w, segment, &eqElem->buffRegInfo, channel, comm)); return ncclSuccess; } @@ -861,15 +965,18 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { return ncclSuccess; } -ncclResult_t ncclEnqueueAsyncKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) { +// 
Dynamic enqueue function for collective kernels +// Supports both aggregated and non-aggregated modes +ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) { struct ncclWorkElem* work = &eqElem->work; struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; int nChannels = work->coll.nChannels; size_t channelSize = work->coll.count*ncclTypeSize(proxyArgs->dtype)/work->coll.nChannels; + int segmentType = proxyArgs->redOp == ncclNumOps ? RingTree_Segment : CollNet_Segment; // redOp is only set when using CollNet + for (int bid=0; bidchannels+channelId; // Proxy @@ -878,18 +985,19 @@ ncclResult_t ncclEnqueueAsyncKernel(struct ncclComm* comm, struct ncclQueueElem* proxyArgs->commOpCount = comm->opCount; if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks)); - // Try to reuse last work if not full yet work->coll.bid = bid % nChannels; - int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; - struct ncclWork* w = channel->workFifo+opIndex; + struct ncclWork* w = NULL; int segment = -1; - if (channel->workCount && w->elems[NCCL_MAX_WORK_ELEMENTS-1].active == 0 && - // All elems in work must have same (funcIndex,nThreads), - // see "src/collectives/device/common.h" - w->elems[0].funcIndex == work->funcIndex && - w->elems[0].nThreads == work->nThreads) { + if (aggMode && channel->workCount) { // Try to pack more segments into a single operation - segment = getSegment(COLL_SEGMENT, 0, w); + int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; + w = channel->workFifo+opIndex; + // All elems in work must have same (funcIndex,nThreads), + // see "src/collectives/device/common.h" + if (w->elems[0].funcIndex == work->funcIndex && + w->elems[0].nThreads == work->nThreads) { + segment = getSegment(segmentType, 0, w); + } } if (segment == -1) { NCCLCHECK(getNextOp(channel, &w, NULL)); @@ -897,7 +1005,7 @@ ncclResult_t ncclEnqueueAsyncKernel(struct ncclComm* comm, struct ncclQueueElem* } // store work element into FIFO - NCCLCHECK(enqueueSegOp(COLL_SEGMENT, work, w, segment)); + NCCLCHECK(enqueueSegOp(segmentType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); channel->totalSize += channelSize; } comm->collOpCount++; @@ -909,17 +1017,15 @@ void CUDART_CB ncclEnqueueHostSetup(void* arg) { ncclResult_t ret; struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg; ncclComm_t comm = eqInfo->comm; + int aggMode = eqInfo->elemList->count() > 1 ? 
1 : 0; // Iterate through the element list struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); while (eqElem != NULL) { if (eqElem->work.funcIndex == FUNC_INDEX_P2P) { NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end); - } else if (eqInfo->elemList->count() > 1) { - // We have more than one operation, hence aggregating - NCCLCHECKGOTO(ncclEnqueueAsyncKernel(comm, eqElem), ret, cb_end); } else { - NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem), ret, cb_end); + NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end); } eqElem = eqInfo->elemList->getNext(); } @@ -937,6 +1043,41 @@ cb_end: template void CUDART_CB ncclEnqueueHostSetup<0>(void*); template void CUDART_CB ncclEnqueueHostSetup<1>(void*); +void* graphHelperFunc(void *args) { + struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args; + if (res == NULL) { + WARN("CUDA Graph helper resource is null"); + return NULL; + } + int dev = res->comm->cudaDev; + CUDACHECKIGNORE(cudaSetDevice(dev)); + INFO(NCCL_COLL, "CUDA Graph helper thread created for device %d", dev); + + volatile enum helperThreadState* state = &res->threadState; + volatile int* ipcTail = &res->ipcTail; + while (1) { + int ipcTailMark = *ipcTail; + int ipcCount = 0; + while (res->ipcHead != ipcTailMark) { + if (res->ipcBases[res->ipcHead] != NULL) + CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead])); + res->ipcBases[res->ipcHead] = NULL; + res->ipcHead = (res->ipcHead+1)%NCCL_IPC_POOL_SIZE; + ipcCount++; + } + TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount); + pthread_mutex_lock(&res->threadLock); + while (res->ipcHead == *ipcTail && *state != ThreadStop) { + pthread_cond_wait(&res->threadCond, &res->threadLock); + } + pthread_mutex_unlock(&res->threadLock); + if (*state == ThreadStop) { + INFO(NCCL_COLL, "CUDA Graph helper thread for device %d returning", dev); + return NULL; + } + } +} + ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { comm->usingCudaGraph = 0; #if CUDART_VERSION >= 11030 @@ -961,6 +1102,15 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { } if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH; comm->usingCudaGraph = 1; + + // Create helper thread that closes IPC handles during graph destruction + // Only create this thread when buffer registration is enabled + if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) { + pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL); + pthread_cond_init(&comm->graphHelperResources->threadCond, NULL); + comm->graphHelperResources->threadState = ThreadStart; + pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources); + } } #endif return ncclSuccess; @@ -990,18 +1140,92 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) { #endif } -ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { - // Launch asynchronously if needed - if (ncclAsyncMode()) { - ncclResult_t ret = ncclSuccess; - int savedDev = -1; - // Check arguments - NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); - if (info->comm->checkPointers) { - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); - CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); +static ncclResult_t hostToDevRedOp( + ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm + ) { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + 
uint64_t u64; + half f16; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + float f32; + double f64; + void *ptr; + }; + u64 = 0; + opFull->scalarArgIsPtr = false; + switch (int(op)) { + case ncclSum: opFull->op = ncclDevSum; break; + case ncclProd: opFull->op = ncclDevProd; break; + case ncclMax: opFull->op = ncclDevMax; break; + case ncclMin: opFull->op = ncclDevMin; break; + case ncclAvg: + switch ((int)datatype) { + case ncclInt8: case ncclInt32: case ncclInt64: + case ncclUint8: case ncclUint32: case ncclUint64: + opFull->op = ncclDevSumPostDiv; + u64 = comm->nRanks; + break; + case ncclFloat16: + opFull->op = ncclDevPreMulSum; + f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x + break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + opFull->op = ncclDevPreMulSum; + bf16 = __float2bfloat16(float(1.0/comm->nRanks)); + break; + #endif + case ncclFloat32: + opFull->op = ncclDevPreMulSum; + f32 = float(1.0/comm->nRanks); + break; + case ncclFloat64: + opFull->op = ncclDevPreMulSum; + f64 = 1.0/comm->nRanks; + break; } - NCCLCHECKGOTO(ArgsCheck(info), ret, end); + opFull->scalarArgIsPtr = false; + opFull->scalarArg = u64; + break; + default: // user created + int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); + ncclUserRedOp *user = &comm->userRedOps[ix]; + if (datatype != user->datatype) { + WARN("Data type supplied to user-created ncclRedOp_t does not match type " + "given to reduction operation"); + return ncclInvalidArgument; + } + *opFull = user->opFull; + break; + } + return ncclSuccess; +} + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + ncclResult_t ret = ncclSuccess; + bool isAsync = ncclAsyncMode(); + int savedDev = -1; + // Check arguments + NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); + if (isAsync && info->comm->checkPointers) { + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); + } + NCCLCHECKGOTO(ArgsCheck(info), ret, end); + + // Copy reduction op state from op handle into info struct here since the + // op handle may be destroyed before ncclGroupEnd(). + NCCLCHECKGOTO(hostToDevRedOp(&info->opFull, info->op, info->datatype, info->comm), ret, end); + + // Launch asynchronously if needed + if (isAsync) { // Always register comm even in case of error to make sure ncclGroupEnd // cleans it up. 
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); @@ -1016,14 +1240,8 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { } else { NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end); } -end: - if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); - ncclAsyncErrCheck(ret); - return ret; } else { - NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); - NCCLCHECK(ArgsCheck(info)); - NCCLCHECK(checkSetStream(info)); + NCCLCHECKGOTO(checkSetStream(info), ret, end); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, @@ -1032,24 +1250,82 @@ end: // Check whether we are in cuda graph mode cudaGraph_t graph; ncclComm_t comm = info->comm; - NCCLCHECK(ncclGetCudaGraph(comm, &graph)); + NCCLCHECKGOTO(ncclGetCudaGraph(comm, &graph), ret, end); // Common part between graph mode and non-graph mode - NCCLCHECK(ncclSetupCollKernel(info)); + NCCLCHECKGOTO(ncclSetupCollKernel(info), ret, end); // Host setup if (comm->usingCudaGraph) { - NCCLCHECK(ncclCudaGraphHostSetup(comm, graph)); + NCCLCHECKGOTO(ncclCudaGraphHostSetup(comm, graph), ret, end); } else { ncclEnqueueHostSetup<0>(comm->enqueueInfo); - NCCLCHECK(comm->enqueueInfo->ret); + NCCLCHECKGOTO(comm->enqueueInfo->ret, ret, end); } // Common part between graph mode and non-graph mode - NCCLCHECK(ncclLaunchBarrier(comm)); - NCCLCHECK(ncclLaunchKernel(comm)); - NCCLCHECK(ncclRecordEvents(comm)); - NCCLCHECK(ncclLaunchReset(comm)); - return ncclSuccess; + NCCLCHECKGOTO(ncclLaunchBarrier(comm), ret, end); + NCCLCHECKGOTO(ncclLaunchKernel(comm), ret, end); + NCCLCHECKGOTO(ncclRecordEvents(comm), ret, end); + NCCLCHECKGOTO(ncclLaunchReset(comm), ret, end); } +end: + if (isAsync && savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); + if (isAsync) ncclAsyncErrCheck(ret); + return ret; +} + +NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { + if (comm->userRedOpFreeHead == comm->userRedOpCapacity) { + // double capacity and resize + int cap = 2*comm->userRedOpCapacity; + if (cap < 4) cap = 4; + ncclUserRedOp *ops = new ncclUserRedOp[cap]; + std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp)); + for(int ix=comm->userRedOpCapacity; ix < cap; ix++) + ops[ix].freeNext = ix + 1; + delete[] comm->userRedOps; + comm->userRedOps = ops; + comm->userRedOpCapacity = cap; + } + // pop from free list + int ix = comm->userRedOpFreeHead; + ncclUserRedOp *user = &comm->userRedOps[ix]; + comm->userRedOpFreeHead = user->freeNext; + + user->freeNext = -1; // allocated + user->datatype = datatype; + user->opFull.op = ncclDevPreMulSum; + if (residence == ncclScalarHostImmediate) { + user->opFull.scalarArgIsPtr = false; + std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype)); + } else { + user->opFull.scalarArgIsPtr = true; + user->opFull.scalarArg = reinterpret_cast(scalar); + } + *op = ncclRedOp_t(int(ncclNumOps) + ix); + *op = ncclUserRedOpMangle(comm, *op); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm); +ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { + if (0 <= int(op) && int(op) < int(ncclNumOps)) { + WARN("ncclRedOpDestroy : operator is a NCCL 
builtin."); + return ncclInvalidArgument; + } + if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) { + WARN("ncclRedOpDestroy : operator is garbage."); + return ncclInvalidArgument; + } + int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); + if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) { + WARN("ncclRedOpDestroy : operator unknown to this communicator."); + return ncclInvalidArgument; + } + // push to free list + comm->userRedOps[ix].freeNext = comm->userRedOpFreeHead; + comm->userRedOpFreeHead = ix; + return ncclSuccess; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 135f569..1d34286 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -368,7 +368,10 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s } struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; -struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane +struct kvDict kvDictPciGen[] = { + { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, /* Kernel 5.6 and earlier */ + { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, + { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { const char* str; diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 29e8f00..8f50301 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -625,7 +625,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm)); struct ncclXmlNode* nvlNode = NULL; - NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode)); + NCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode)); if (nvlNode == NULL) { // NVML NVLink detection int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12; diff --git a/src/graph/xml.h b/src/graph/xml.h index 76f29b2..0c16b95 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -7,6 +7,11 @@ #ifndef XML_H_ #define XML_H_ +#include "nccl.h" +#include "debug.h" +#include "checks.h" +#include + // A few constraints to make the implementation easy #define MAX_STR_LEN 255 #define MAX_ATTR_COUNT 16 diff --git a/src/include/alloc.h b/src/include/alloc.h index 9488c90..0791592 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -11,6 +11,9 @@ #include "checks.h" #include "align.h" #include +#include +#include +#include template static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { @@ -27,7 +30,7 @@ static inline ncclResult_t ncclCudaHostFree(void* ptr) { } template -static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { +static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { void* p = malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); @@ -35,10 +38,8 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, } memset(p, 0, nelem*sizeof(T)); *ptr = (T*)p; - INFO(NCCL_ALLOC, "%s:%d Mem Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return ncclSuccess; } -#define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 9c2e4f6..77ac12b 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -16,7 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); -ncclResult_t bootstrapBarrier(void* commState, int *ranks, int tag, int rank, int nranks); +ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); +ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr); ncclResult_t bootstrapRemFree(int id, int rank, void* commState); ncclResult_t bootstrapClose(void* commState); diff --git a/src/include/collectives.h b/src/include/collectives.h index db073f0..5fde721 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -7,74 +7,104 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ +enum ncclDevRedOp_t { + ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin, + ncclDevPreMulSum, ncclDevSumPostDiv, + ncclNumDevRedOps +}; +struct ncclDevRedOpFull { + ncclDevRedOp_t op; + bool scalarArgIsPtr; + uint64_t scalarArg; +}; + #define FUNC_INDEX_P2P 0 -#define FUNC_INDEX(func, redop, ncclType, al, pr) (1+(((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) +#define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) -#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \ - ncclFunction_##func##_##algo##_##proto##_##redop##_##type +#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \ + ncclFunction_##func##_##algo##_##proto##_##devredop##_##type -#define NCCL_KERN_NAME(func, algo, proto, redop, type) \ - ncclKernel_##func##_##algo##_##proto##_##redop##_##type +#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \ + ncclFunction_OneRankReduce_##devredop##_##type + +#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \ + ncclKernel_##func##_##algo##_##proto##_##devredop##_##type #define NCCL_IMPL_NAME(func, algo, proto) \ nccl##func##algo##proto /* Declare all collective operations */ -#define DECL5(func, algo, proto, redop, type) \ - extern __device__ void NCCL_FUNC_NAME(func, algo, proto, redop, type)(); \ - extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(ncclWorkElem c); \ +#define DECL5(func, algo, proto, devredop, type) \ + extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ + extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem c); \ -#define DECL4(func, algo, redop, type) \ - DECL5(func, algo, SIMPLE, redop, type) \ - DECL5(func, algo, LL, redop, type) \ - DECL5(func, algo, LL128, redop, type) +#define CONCAT(a,b) a##b +#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) +#define MACRO_IF_0(t, f) f +#define MACRO_IF_1(t, f) t -#define DECL3(func, redop, type) \ - DECL4(func, RING, redop, type) \ - 
DECL4(func, TREE, redop, type) \ - DECL4(func, COLLNET, redop, type) +#define DECL4(func, algo, devredop, type, undef) \ + MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \ + MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \ + MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type)) + +#define DECL3(func, devredop, type, undef) \ + DECL4(func, RING, devredop, type, undef) \ + DECL4(func, TREE, devredop, type, undef) \ + DECL4(func, COLLNET, devredop, type, undef) #if defined(__CUDA_BF16_TYPES_EXIST__) -#define DECL2(func, redop) \ - DECL3(func, redop, int8_t) \ - DECL3(func, redop, uint8_t) \ - DECL3(func, redop, int32_t) \ - DECL3(func, redop, uint32_t) \ - DECL3(func, redop, int64_t) \ - DECL3(func, redop, uint64_t) \ - DECL3(func, redop, half) \ - DECL3(func, redop, float) \ - DECL3(func, redop, double) \ - DECL3(func, redop, __nv_bfloat16) +#define DECL2(func, devredop, undefForFloat) \ + DECL3(func, devredop, int8_t, /*undef=*/0) \ + DECL3(func, devredop, uint8_t, /*undef=*/0) \ + DECL3(func, devredop, int32_t, /*undef=*/0) \ + DECL3(func, devredop, uint32_t, /*undef=*/0) \ + DECL3(func, devredop, int64_t, /*undef=*/0) \ + DECL3(func, devredop, uint64_t, /*undef=*/0) \ + DECL3(func, devredop, half, /*undef=*/undefForFloat) \ + DECL3(func, devredop, float, /*undef=*/undefForFloat) \ + DECL3(func, devredop, double, /*undef=*/undefForFloat) \ + DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat) #else -#define DECL2(func, redop) \ - DECL3(func, redop, int8_t) \ - DECL3(func, redop, uint8_t) \ - DECL3(func, redop, int32_t) \ - DECL3(func, redop, uint32_t) \ - DECL3(func, redop, int64_t) \ - DECL3(func, redop, uint64_t) \ - DECL3(func, redop, half) \ - DECL3(func, redop, float) \ - DECL3(func, redop, double) +#define DECL2(func, devredop, undefForFloat) \ + DECL3(func, devredop, int8_t, /*undef=*/0) \ + DECL3(func, devredop, uint8_t, /*undef=*/0) \ + DECL3(func, devredop, int32_t, /*undef=*/0) \ + DECL3(func, devredop, uint32_t, /*undef=*/0) \ + DECL3(func, devredop, int64_t, /*undef=*/0) \ + DECL3(func, devredop, uint64_t, /*undef=*/0) \ + DECL3(func, devredop, half, /*undef=*/undefForFloat) \ + DECL3(func, devredop, float, /*undef=*/undefForFloat) \ + DECL3(func, devredop, double, /*undef=*/undefForFloat) #endif #define DECL(func) \ - DECL2(func, Sum) \ - DECL2(func, Prod) \ - DECL2(func, Min) \ - DECL2(func, Max) \ - DECL2(func, Avg) + DECL2(func, Sum, /*undefForFloat=*/0) \ + DECL2(func, Prod, /*undefForFloat=*/0) \ + DECL2(func, Min, /*undefForFloat=*/0) \ + DECL2(func, Max, /*undefForFloat=*/0) \ + DECL2(func, PreMulSum, /*undefForFloat=*/0) \ + DECL2(func, SumPostDiv, /*undefForFloat=*/1) -#define DECL_ALL \ - DECL2(Broadcast, Sum) \ - DECL(Reduce) \ - DECL2(AllGather, Sum) \ - DECL(ReduceScatter) \ - DECL(AllReduce) \ - DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \ +DECL2(Broadcast, Sum, /*undefForFloat=*/0) +DECL(Reduce) +DECL2(AllGather, Sum, /*undefForFloat=*/0) +DECL(ReduceScatter) +DECL(AllReduce) +DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) -DECL_ALL +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(); +extern __device__ void 
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(); +#if defined(__CUDA_BF16_TYPES_EXIST__) +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)(); +#endif +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) @@ -88,5 +118,6 @@ DECL_ALL #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define SENDRECV_SLICEFACTOR 4 +#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above #endif diff --git a/src/include/comm.h b/src/include/comm.h index 214d988..bcbc695 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -9,6 +9,7 @@ #include "transport.h" #include "p2p.h" +#include "collectives.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -30,13 +31,16 @@ struct cudaLaunchParams { #define NCCL_LL128_THREAD_THRESHOLD 8 #define NCCL_SIMPLE_THREAD_THRESHOLD 64 +#define NCCL_MAX_INTRA_RANKS 32 + struct ncclSendMem { union { struct { uint64_t head; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; - char pad2[CACHE_LINE_SIZE-sizeof(void*)]; + uint64_t redOpArgExchange[2]; + char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; }; char pad3[MEM_ALIGN]; }; @@ -56,6 +60,28 @@ struct ncclRecvMem { char buff[1]; // Actually larger than that }; +typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*); + +enum helperThreadState {ThreadStart, ThreadStop}; + +#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_INTRA_RANKS*NCCL_MAX_OPS) + +struct ncclGraphHelperResources { + ncclComm* comm; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; + enum helperThreadState threadState; + void* ipcBases[NCCL_IPC_POOL_SIZE]; + int ipcTail; + int ipcHead; +}; + +struct ncclUserRedOp { + int freeNext; // -1=allocated, otherwise index of next free entry in array + ncclDataType_t datatype; + ncclDevRedOpFull opFull; +}; + struct ncclComm { struct ncclChannel channels[MAXCHANNELS]; @@ -76,7 +102,12 @@ struct ncclComm { int node; int nNodes; + + // Intra-node rank info + int intraNodeGlobalRanks[NCCL_MAX_INTRA_RANKS]; int localRanks; + int intraNodeRank; + int8_t* rankToIntraNodeRank; enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode; cudaStream_t userStream; @@ -142,6 +173,7 @@ struct ncclComm { // Whether this communicator uses collNet int collNetSupport; + int intraHighestTransportType; // Store info of async operations struct ncclInfo* asyncOps; @@ -160,9 +192,38 @@ struct ncclComm { // Store info for cudaGraph int usingCudaGraph; // Only use it during capture time, not launch time struct ncclQueueInfo* enqueueInfo; + int nQueueInfoCreated; + int nQueueInfoDestroyed; cudaGraphNode_t lastSetupNode; unsigned long long lastCudaGraphId; int driverVersion; + pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange; + pthread_t graphHelperThread; + struct ncclGraphHelperResources* graphHelperResources; + int disableGraphHelper; + int graphRegister; + + // user-created reduction ops + int userRedOpCapacity, userRedOpFreeHead; + ncclUserRedOp *userRedOps; }; +// Scrambles the bits of non-builtin values of ncclRedOp_t according to the +// communicator memory address. Used to catch bugs so that integer handles +// associated with this communicator won't collide with handles of other +// communicators. This function is its own inverse. +static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { + // Preserve the built-in values.
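+  // Note (illustration): the mangling below is an XOR with a fixed comm-derived hash, and built-in values pass through unchanged, so applying the function twice restores the original handle: ncclUserRedOpMangle(comm, ncclUserRedOpMangle(comm, op)) == op.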
+ if(int(op) < int(ncclNumOps)) + return op; + uint64_t h = reinterpret_cast<uint64_t>(comm); + h ^= h >> 32; + h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant + h >>= 32; // h is now an excellent 32-bit hash of the comm pointer + h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 + int op1 = int(h) ^ int(op); + // Since builtin values are preserved, we also have to preserve their preimage. + return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1); +} + #endif diff --git a/src/include/debug.h b/src/include/debug.h index e7a152c..6ce90ee 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -7,17 +7,14 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ -#include "core.h" - +#include "nccl_net.h" #include #include #include #include #include -#include "nccl_net.h" - -#define gettid() (pid_t) syscall(SYS_gettid) +#include extern int ncclDebugLevel; extern uint64_t ncclDebugMask; diff --git a/src/include/devcomm.h b/src/include/devcomm.h index f172f38..676ffda 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -72,8 +72,11 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) -#define NCCL_DIRECT_GPU 0x01 -#define NCCL_DIRECT_NIC 0x10 +#define NCCL_DIRECT_WRITE 0x01 +#define NCCL_DIRECT_READ 0x02 +#define NCCL_DIRECT_NIC 0x04 +#define NCCL_IPC_WRITE 0x08 +#define NCCL_IPC_READ 0x10 struct ncclConnInfo { // Regular comm mechanism @@ -84,6 +87,7 @@ struct ncclConnInfo { int direct; // Direct communication int shared; // Buffers are shared void **ptrExchange; // Pointer exchange for direct communication + uint64_t* redOpArgExchange; // PreOp scalar exchange for direct pull case int *sizesFifo; // Sizes fifo from GPU to proxy void* *ptrsFifo; // Buffer fifo from proxy to GPU @@ -154,8 +158,9 @@ struct ncclWorkElem { struct ncclDevComm* comm; uint16_t nThreads; uint16_t funcIndex; - uint16_t index; - uint16_t active; + uint8_t regUsed; + uint8_t direct; + uint8_t active, redOpArgIsPtr; const void * sendbuff; void * recvbuff; @@ -168,6 +173,7 @@ struct ncclWorkElem { uint32_t root; uint8_t bid; uint8_t nChannels; + uint64_t redOpArg; } coll; struct { size_t sendCount; @@ -180,11 +186,24 @@ struct ncclWorkElem { uint64_t align[4]; }; }; -struct ncclWork { - struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; -}; static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size"); +struct ncclWorkRegElem { + struct ncclWorkElem elem; + void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; + void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; + void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; +}; +#define NCCL_REG_ELEM_FACTOR 4 +static_assert(sizeof(struct ncclWorkRegElem) == (NCCL_REG_ELEM_FACTOR*sizeof(struct ncclWorkElem)), "ncclWorkRegElem size must be pow2 times ncclWorkElem size"); + +struct ncclWork { + union { + struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; + struct ncclWorkRegElem regElems[NCCL_MAX_WORK_ELEMENTS/NCCL_REG_ELEM_FACTOR]; + }; +}; + struct ncclChannel { union { struct { diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 4632c9b..962896e 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -30,10 +30,19 @@ void CUDART_CB ncclEnqueueHostSetup(void* arg); ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); +struct ncclBuffRegInfo { + void*
sendbuffsBase[NCCL_MAX_INTRA_RANKS]; + void* recvbuffsBase[NCCL_MAX_INTRA_RANKS]; + void* sendbuffs[NCCL_MAX_INTRA_RANKS]; + void* recvbuffs[NCCL_MAX_INTRA_RANKS]; + int nBuffs; +}; + // Enqueue information (for kernel and proxy) for each operation struct ncclQueueElem { struct ncclWorkElem work; struct ncclProxyArgs proxyArgs; + struct ncclBuffRegInfo buffRegInfo; }; typedef ncclRecyclableList<struct ncclQueueElem> ncclQueueElemList; @@ -43,6 +52,7 @@ struct ncclQueueInfo { ncclComm_t comm; int maxChannels; // Dynamic version of gridDim ncclResult_t ret; // Return value of host setup call + int nRegBuffs; ncclQueueElemList* elemList; }; @@ -50,6 +60,7 @@ static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_ NCCLCHECK(ncclCalloc(eqInfo, 1)); (*eqInfo)->comm = comm; (*eqInfo)->elemList = new ncclQueueElemList(); + (*eqInfo)->comm->nQueueInfoCreated++; return ncclSuccess; } @@ -58,6 +69,7 @@ static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { if (eqInfo == NULL) return ncclInternalError; eqInfo->maxChannels = 0; eqInfo->ret = ncclSuccess; + eqInfo->nRegBuffs = 0; eqInfo->elemList->recycle(); return ncclSuccess; } @@ -67,7 +79,54 @@ static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { static void ncclDestroyQueueInfo(void* ptr) { if (ptr == NULL) return; struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr; + struct ncclComm* comm = eqInfo->comm; + // Close IPC mem handles for registered buffers + struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); +#if 0 + // Ideally, the deregistration should happen here + // but currently the destroy function of CUDA objects does not allow CUDA API calls + while (eqElem != NULL) { + for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) { + if (i == eqInfo->comm->intraNodeRank) continue; + CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); + CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); + } + eqElem = eqInfo->elemList->getNext(); + } +#else + // Instead, we push these pointers to a pool owned by ncclComm + // and ask a helper thread to close mem handles + struct ncclGraphHelperResources* res = comm->graphHelperResources; + int ipcTailOld = 0; + if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip; + + pthread_mutex_lock(&res->threadLock); + ipcTailOld = res->ipcTail; + while (eqElem != NULL) { + for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) { + if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) { + res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i]; + res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; + } + if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) { + res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i]; + res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; + } + } + eqElem = eqInfo->elemList->getNext(); + } + if (res->ipcTail != ipcTailOld) { + res->threadState = ThreadStart; + TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld); + pthread_cond_signal(&res->threadCond); + } + pthread_mutex_unlock(&res->threadLock); +#endif + +skip: delete eqInfo->elemList; free(eqInfo); + comm->nQueueInfoDestroyed++; + return; } #endif // End include guard diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h index a34193c..08bb2d8 100644 --- a/src/include/gdrwrap.h +++ b/src/include/gdrwrap.h @@ -10,6 +10,7 @@ #include "nccl.h" #include // for standard [u]intX_t types #include +#include // These can be used if the GDR library isn't thread safe
#include diff --git a/src/include/info.h b/src/include/info.h index 78a5297..2e99e9c 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -9,6 +9,7 @@ #include "nccl.h" #include "devcomm.h" +#include "collectives.h" typedef enum { ncclPatternRing, @@ -38,6 +39,7 @@ struct ncclInfo { int chunkSteps; int sliceSteps; // Computed later + ncclDevRedOpFull opFull; int algorithm; int protocol; ncclPattern_t pattern; diff --git a/src/include/param.h b/src/include/param.h index e4c11df..49c4606 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -7,6 +7,7 @@ #ifndef NCCL_PARAM_H_ #define NCCL_PARAM_H_ +#include #include #include #include diff --git a/src/include/transport.h b/src/include/transport.h index 115bdc5..e64dfbf 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -55,7 +55,7 @@ struct ncclTransport { }; ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); enum { collNetRecv=0, collNetSend=1 }; int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); diff --git a/src/init.cc b/src/init.cc index 6fb251f..1684cc9 100644 --- a/src/init.cc +++ b/src/init.cc @@ -67,13 +67,21 @@ ncclResult_t initCollNet(ncclCollNet_t* collnet) { } ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) { - void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); + char ncclNetPluginName[128]; + const char* envPluginName = getenv("NCCL_NET_PLUGIN"); + if (envPluginName && strlen(envPluginName)) { + snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName); + INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName); + } else { + sprintf(ncclNetPluginName, "libnccl-net.so"); + } + void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); if (netPluginLib == NULL) { // dlopen does not guarantee to set errno, but dlerror only gives us a // string, so checking errno doesn't hurt to try to provide a better // error message if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so), using internal implementation"); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName); } else { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); } @@ -185,6 +193,9 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + + delete[] comm->userRedOps; + free(comm->connectSend); free(comm->connectRecv); for (int peer=0; peernRanks; peer++) { @@ -216,8 +227,6 @@ static ncclResult_t commFree(ncclComm_t comm) { CUDACHECK(cudaStreamDestroy(comm->groupStream)); } - ncclDestroyQueueInfo(comm->enqueueInfo); - // Last rank frees shared resources between threads int isLast; NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); @@ -238,6 +247,8 @@ static ncclResult_t commFree(ncclComm_t comm) { } NCCL_PARAM(AggChannelSize, "AGG_CHANNEL_SIZE", -2); +NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); +NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); static ncclResult_t 
commAlloc(ncclComm_t* comret, int ndev, int rank) { if (ndev < 1) { @@ -294,11 +305,20 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->asyncAllocMode = ncclComm::ROUND_ROBIN; } + CUDACHECK(cudaDriverGetVersion(&comm->driverVersion)); + NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm)); comm->lastSetupNode = NULL; comm->lastCudaGraphId = -1; - - CUDACHECK(cudaDriverGetVersion(&comm->driverVersion)); + comm->disableGraphHelper = ncclParamDisableGraphHelper(); + comm->graphRegister = ncclParamGraphRegister(); +#if CUDART_VERSION >= 11030 + NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1)); + comm->graphHelperResources->comm = comm; + if (comm->driverVersion >= 11030) + // cudaGetDriverEntryPoint requires R465 or above (enhanced compat need) + CUDACHECK(cudaGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, cudaEnableDefault)); +#endif static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); @@ -309,6 +329,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks)); + // Create a map between global rank and intra-node rank + NCCLCHECK(ncclCalloc(&comm->rankToIntraNodeRank, comm->nRanks)); + memset(comm->rankToIntraNodeRank, -1, comm->nRanks*sizeof(comm->rankToIntraNodeRank[0])); + // Mark channels as non initialized. for (int c=0; cchannels[c].id = -1; @@ -528,13 +552,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0; int myCompCap = allGather1Data[rank].cudaCompCap; int minCompCap = myCompCap, maxCompCap = myCompCap; - int intraNodeGlobalRanks[256]; for (int i = 0; i < nranks; i++) { if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) { // Rank is on same node if (intraNodeRanks == 0) intraNodeRank0 = i; if (i == rank) intraNodeRank = intraNodeRanks; - intraNodeGlobalRanks[intraNodeRanks++] = i; + comm->intraNodeGlobalRanks[intraNodeRanks] = i; + comm->rankToIntraNodeRank[i] = intraNodeRanks; + intraNodeRanks++; if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) { // Rank is in same process if (intraProcRanks == 0) intraProcRank0 = i; @@ -563,6 +588,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm; uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash; + comm->intraNodeRank = intraNodeRank; free(allGather1Data); @@ -792,6 +818,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Check if we can setup CollNet if (comm->collNetSupport > 0) { int collNetSetupFail = 0; + int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P}; // Find all head ranks int nHeads = collNetGraph.nChannels; int *heads; @@ -817,16 +844,26 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); // Connect intra-node CollNet + int highestTransportType0, highestTransportType1; for (int c=0; cnChannels; c++) { struct ncclChannel* channelRecv = comm->channels+c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, 
NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup); for (int c=0; cnChannels; c++) { struct ncclChannel* channelSend = comm->channels+c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup); + + // Exchange highest intra-node transport type among ranks + // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer + comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int))); + for (int i=0; ilocalRanks; i++) { + if (highestTypes[i] > comm->intraHighestTransportType) + comm->intraHighestTransportType = highestTypes[i]; + } INFO(NCCL_INIT, "rank %d Connected CollNet", rank); collnet_cleanup: @@ -874,7 +911,7 @@ collnet_cleanup: NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, intraProcRank0Comm)); /* Local intra-node barrier */ - NCCLCHECK(bootstrapBarrier(comm->bootstrap, intraNodeGlobalRanks, (int)intraNodeRank0pidHash, intraNodeRank, intraNodeRanks)); + NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->intraNodeGlobalRanks, intraNodeRank, intraNodeRanks, (int)intraNodeRank0pidHash)); if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm)); @@ -974,6 +1011,22 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { return ncclSuccess; } +static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) { + auto res = comm->graphHelperResources; + if (comm->graphHelperThread && res) { + pthread_mutex_lock(&res->threadLock); + res->threadState = ThreadStop; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + pthread_join(comm->graphHelperThread, NULL); + } + if (res) { + free(res); + res = NULL; + } + return ncclSuccess; +} + static ncclResult_t commDestroy(ncclComm_t comm) { int savedDevice; CUDACHECK(cudaGetDevice(&savedDevice)); @@ -987,6 +1040,11 @@ static ncclResult_t commDestroy(ncclComm_t comm) { CUDACHECK(cudaStreamSynchronize(comm->groupStream)); NCCLCHECK(ncclProxyDestroy(comm)); + ncclDestroyQueueInfo(comm->enqueueInfo); +#if CUDART_VERSION >= 11030 + NCCLCHECK(ncclGraphHelperDestroy(comm)); +#endif + INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed); NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index c262f8c..fe4e760 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -51,10 +51,16 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { } if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank - if (info->op < 0 || info->op >= ncclNumOps) { + if (info->op < 0 || ncclMaxRedOp < info->op) { 
WARN("%s : invalid reduction operation %d", info->opName, info->op); return ncclInvalidArgument; } + int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); + if (ncclNumOps <= info->op && + (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { + WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); + return ncclInvalidArgument; + } if (info->comm->checkPointers) { if (info->coll == ncclFuncSendRecv) { diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index f47c141..439712e 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -60,7 +60,7 @@ ncclResult_t wrap_ibv_symbols(void) { if (!ibvhandle) { ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW); if (!ibvhandle) { - WARN("Failed to open libibverbs.so[.1]"); + INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]"); goto teardown; } } diff --git a/src/nccl.h.in b/src/nccl.h.in index a793cac..93a141c 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -19,7 +19,7 @@ #define NCCL_SUFFIX "${nccl:Suffix}" #define NCCL_VERSION_CODE ${nccl:Version} -#define NCCL_VERSION(X,Y,Z) (((X) >= 2 && (Y) >= 9) ? (X) * 10000 + (Y) * 100 + (Z) : (X) * 1000 + (Y) * 100 + (Z)) +#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #ifdef __cplusplus extern "C" { @@ -102,12 +102,23 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* Reduction operation selector */ +typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, ncclAvg = 4, - ncclNumOps = 5 } ncclRedOp_t; + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. */ + ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) + } ncclRedOp_t; /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, @@ -127,6 +138,40 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, #endif } ncclDataType_t; +/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ +typedef enum { + /* ncclScalarDevice: The scalar is in device-visible memory and will be + * dereferenced while the collective is running. */ + ncclScalarDevice = 0, + + /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be + * dereferenced before the ncclRedOpCreate***() function returns. */ + ncclScalarHostImmediate = 1 +} ncclScalarResidence_t; + +/* + * ncclRedOpCreatePreMulSum + * + * Creates a new reduction operator which pre-multiplies input values by a given + * scalar locally before reducing them with peer values via summation. For use + * only with collectives launched against *comm* and *datatype*. The + * *residence* argument indicates how/when the memory pointed to by *scalar* + * will be dereferenced. Upon return, the newly created operator's handle + * is stored in *op*. 
+ */ +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); + +/* + * ncclRedOpDestroy + * + * Destroys the reduction operator *op*. The operator must have been created by + * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be + * destroyed as soon as the last NCCL function which is given that operator returns. + */ +ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); +ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + /* * Collective communication operations * diff --git a/src/transport.cc b/src/transport.cc index c3a4176..d7eadcd 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -19,7 +19,7 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = { }; template <int type> -static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) { +static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) { struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : @@ -32,6 +32,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); + if (transportType) *transportType = t; return ncclSuccess; } } @@ -64,10 +65,11 @@ void dumpData(struct ncclConnect* data, int ndata) { } } -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) { +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; needed for P2P pre-connect + CUDA Graph cudaStream_t transportSetupStream; CUDACHECK(cudaStreamCreateWithFlags(&transportSetupStream, cudaStreamNonBlocking)); + int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect data[2*MAXCHANNELS]; for (int i=1; i<comm->nRanks; i++) { @@ -79,15 +81,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnect* recvData = data; int sendChannels = 0, recvChannels = 0; + int type; for (int c=0; c(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex)); + NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type)); + if (type > highestType) highestType = type; } } struct ncclConnect* sendData = recvData+recvChannels; for (int c=0; c(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex)); + NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type)); + if (type > highestType) highestType = type; } } @@ -125,6 +130,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } CUDACHECK(cudaStreamSynchronize(transportSetupStream)); CUDACHECK(cudaStreamDestroy(transportSetupStream)); + if (highestTransportType != NULL) *highestTransportType = highestType; return ncclSuccess; } @@ -218,22 +224,18 @@ cleanup: }
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { - int rank = comm->rank; - int nranks = comm->nRanks; // AllGather collNet setup results - int* allGatherFailures; - NCCLCHECK(ncclCalloc(&allGatherFailures, nranks)); - allGatherFailures[rank] = collNetSetupFail; - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int))); - for (int i=0; iintraNodeRank] = collNetSetupFail; + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int))); + for (int i=0; ilocalRanks; i++) { if (allGatherFailures[i] != 0) { collNetSetupFail = 1; break; } } - free(allGatherFailures); if (collNetSetupFail) { - if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); + if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); return ncclSystemError; } return ncclSuccess; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 5b9f01e..db27eae 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -88,6 +88,8 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) } else { // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; + // Also merge virtual functions (VF) into the same device + p[strlen(p)-3] = '0'; // And keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dbusId); + *ret = 0; + } + CUDACHECK(cudaFree(dummy)); + return ncclSuccess; + } + if (p2p == 0) { INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)", cudaDev1, info1->busId, cudaDev2, info2->busId); @@ -164,10 +181,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize)); info.rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - if (info.read == 0) send->conn.direct |= NCCL_DIRECT_GPU; + send->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { + send->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE; CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr)); INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); @@ -212,8 +230,9 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize)); info.rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - if (info.read == 0) recv->conn.direct |= NCCL_DIRECT_GPU; + recv->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { + recv->conn.direct |= info.read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr)); } } else { @@ -235,7 +254,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co struct ncclRecvMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr)); + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr)); int offset = 0; for (int p=0; pconn.tail = &remDevMem->tail; send->conn.head = &resources->devMem->head; send->conn.ptrExchange = &resources->devMem->ptrExchange; + send->conn.redOpArgExchange = resources->devMem->redOpArgExchange; return ncclSuccess; } @@ -259,7 +279,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr)); + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr)); int offset = 0; for (int p=0; pconn.tail = &resources->devMem->tail; recv->conn.head = &remDevMem->head; recv->conn.ptrExchange = &remDevMem->ptrExchange; + recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; return ncclSuccess; } @@ -281,6 +302,8 @@ ncclResult_t p2pSendFree(void* resources) { struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; if (sendRes->ipcPtr) CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); + if (sendRes->remIpcPtr) + CUDACHECK(cudaIpcCloseMemHandle(sendRes->remIpcPtr)); if (sendRes->remoteId != -1) { NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap)); sendRes->devMem = NULL; @@ -294,6 +317,8 @@ ncclResult_t p2pRecvFree(void* resources) { struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; if (recvRes->ipcPtr) CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); + if (recvRes->remIpcPtr) + CUDACHECK(cudaIpcCloseMemHandle(recvRes->remIpcPtr)); if (recvRes->remoteId != -1) { NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap)); recvRes->devMem = NULL;