diff --git a/makefiles/common.mk b/makefiles/common.mk
index 82164ab..1b1bb86 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -39,14 +39,20 @@ endif
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
+CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \
+                 -gencode=arch=compute_120,code=sm_120

 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
+CUDA13_PTX = -gencode=arch=compute_120,code=compute_120

-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
 # Include Hopper support if we're using CUDA11.8 or above
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 2523009..b02cf90 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 24
-NCCL_PATCH := 3
+NCCL_MINOR := 25
+NCCL_PATCH := 1
 NCCL_SUFFIX :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index 2c5d9e8..b66ebef 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
 INCEXPORTS := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \
diff --git a/src/collectives.cc b/src/collectives.cc
index 479d4c5..03122f8 100644
--- a/src/collectives.cc
+++ b/src/collectives.cc
@@ -8,6 +8,7 @@
 #include "collectives.h"
 #include "enqueue.h"
 #include "nccl.h"
+#include "nvtx_payload_schemas.h"

 const char* ncclFuncToString(ncclFunc_t fn) {
   switch (fn) {
@@ -78,11 +79,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
   // Just pass the size of one message and not the total bytes sent/received.
-  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
-  };
-  size_t msgsize = sendcount * ncclTypeSize(datatype);
-  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
+  NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
+    NVTX3_PAYLOAD(comm ?
comm->commHash : 0, sendcount * ncclTypeSize(datatype))); struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ @@ -94,18 +92,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsAllReduce { - size_t bytes; - ncclRedOp_t op; - }; - // Just pass the size of one message and not the total bytes sent/received. - static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsAllReduce, op)} - }; - NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op)); struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ @@ -117,16 +105,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsBroadcast { - size_t bytes; - int root; - }; - constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} - }; - NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; - NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root)); struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ @@ -145,19 +125,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsReduce { - size_t bytes; - int root; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduce, op)} - }; - NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; - NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), root, op)); struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ @@ -169,17 +138,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsReduceScatter { - size_t bytes; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduceScatter, op)} - }; - NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) + NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op)); struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ @@ -187,21 +147,12 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv return ncclEnqueueCheck(&info); } -struct NvtxParamsSendRecv { - size_t bytes; - int peer; -}; -constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} -}; - NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer)); struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ @@ -213,8 +164,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer)); struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ diff --git a/src/device/Makefile b/src/device/Makefile index 1e9311f..3562563 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -5,7 +5,7 @@ # SHELL := /usr/bin/env bash -MAKEFALGS += -r +MAKEFLAGS += -r .SUFFIXES: .SECONDARY: diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index c6c1315..2161597 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -436,7 +436,7 @@ struct RunWorkCollregUsed ? 
0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } - } else if (tid < tidEndReduce) { + } else if (tid < tidEndReduce && nvls->headRank != -1) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> diff --git a/src/enqueue.cc b/src/enqueue.cc index 285e17f..23f4633 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -21,19 +21,21 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) { ncclResult_t result = ncclSuccess; + int print = 0; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); + int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); for (int k=0; k < ncclDevKernelCount; k++) { void* fn = ncclDevKernelList[k]; + cudaFuncAttributes attr = {0}; if (fn == nullptr) continue; + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (maxStackSize) { - cudaFuncAttributes attr = {0}; - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } @@ -43,9 +45,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { result, ignore1); ignore1:; } - if (ncclShmemDynamicSize(cudaArch) != 0) { + if (ncclMaxSharedMem != 0) { + int sharedMemSize = ncclMaxSharedMem; + if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { + if (print++ == 0) + INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + // Reduce requested MaxDynamicSharedMemorySize attribute + sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + } CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), + cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), result, next_kernel); } next_kernel:; @@ -1445,7 +1455,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; - unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; + unsigned int clusterSize = (compCap >= 90) ? 
comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; CUlaunchAttribute launchAttrs[3]; @@ -1597,7 +1607,7 @@ static ncclResult_t updateCollCostTable( if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue; /* Tree reduceScatter doesn't support scaling yet */ if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 3f639a0..64fc1c5 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -19,7 +19,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->topo->nodes[GPU].count; - int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks; int nChannels = comm->nChannels; topoRanks->nvlsHeadNum = 0; @@ -74,7 +73,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs // Get nvls heads and the number of heads. Duplicate head is not allowed. for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) { bool addHead = true; - int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks; + int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks; for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) { if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) { @@ -259,8 +258,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead channel->nvls.out = -1; // NVLS+SHARP not yet implemented. 
channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; - channel->nvls.node = comm->node; - channel->nvls.nNodes = comm->nNodes; if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; @@ -466,7 +463,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } // Use 4 compute channels per search channel to reach peak BW on <8 PPN - if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { + if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 6e93568..587a8b2 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -828,14 +828,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr return ncclSuccess; } -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { - int minPath = PATH_DIS; +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) { + int minPath = PATH_SYS; for (int i=0; inodes[GPU].count; i++) { - struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU]; - for (int j=0; jnodes[GPU].count; j++) { - if (i == j) continue; + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; minPath = std::min(minPath, paths[j].type); } } - return minPath >= PATH_PIX ? 0 : 1; + *min = minPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) { + int maxPath = PATH_LOC; + for (int i=0; inodes[GPU].count; i++) { + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; + maxPath = std::max(maxPath, paths[j].type); + } + } + *max = maxPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) { + int maxPath; + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath)); + *allNvLink = maxPath >= PATH_PIX ? 0 : 1; + return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9b72ac1..0185b3f 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -937,6 +937,11 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) +float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 }; +float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) +#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) + ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int crossNic = (system->nodes[NET].count > 1) && @@ -946,8 +951,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->crossNic = crossNic == 1 ? 
1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; - graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; - graph->typeInter = PATH_PIX; + int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX; + int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS; + if (ngpus > 1) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); + } + if (system->nodes[NET].count > 0) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); + maxTypeIntra = maxTypeInter; + } + + graph->typeIntra = minTypeIntra; + graph->typeInter = minTypeInter; graph->nChannels = 0; int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1; graph->sameChannels = trySameChannels; @@ -972,14 +989,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; // NVLS and COLLNET_DIRECT search must have ngpus heads at most. - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) - graph->maxChannels = system->nodes[GPU].count; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = std::min(NCCL_MAX_NVLS_ARITY, system->nodes[GPU].count); + if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = std::min(NCCL_MAX_DIRECT_ARITY+1, system->nodes[GPU].count); if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. - graph->minChannels = graph->maxChannels = system->nodes[GPU].count; + graph->minChannels = graph->maxChannels; } struct ncclTopoGraph tmpGraph; @@ -989,11 +1006,11 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { - nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA; - speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra; + nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); + speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra); } else { - nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER; - speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter; + nspeeds = ccMin >= 100 ? NSPEEDSINTER_SM100 : (ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER); + speedArray = ccMin >= 100 ? sm100SpeedArrayInter : (ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter); } int pass = 1; int speedIndex = 0; @@ -1048,18 +1065,18 @@ search: } tmpGraph.pattern = graph->pattern; - int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; - if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { + int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra; + if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; goto search; } - tmpGraph.typeIntra = ngpus == 1 ? 
PATH_LOC : PATH_NVL; + tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } - tmpGraph.typeInter = PATH_PIX; + tmpGraph.typeInter = minTypeInter; if (crossNic == 2 && tmpGraph.crossNic == 0 && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index d758ac9..ba82caf 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -1357,11 +1357,11 @@ fail: goto exit; } -ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { +static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, + int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; int count = 0; - NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; if (paths == NULL) { *localCount = 0; return ncclSuccess; } for (int i=0; inodes[resultType].count; i++) { @@ -1371,7 +1371,15 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index if (pathType) *pathType = minType; count = 0; } - if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; + if (paths[i].bw == maxBw && paths[i].type == minType) { + if (count == NCCL_TOPO_MAX_NODES) { + WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal." 
+ " Filled %d of type %d, starting from index %d of type %d.", + NCCL_TOPO_MAX_NODES, resultType, index, type); + return ncclInternalError; + } + locals[count++] = i; + } } *localCount = count; return ncclSuccess; @@ -1379,7 +1387,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) { int localNetCount = 0, netCountByBw = 0; - int* localNets; + int localNets[NCCL_TOPO_MAX_NODES]; float totalNetBw = 0, gpuBw = 0; for (int l=0; lnodes[GPU].nodes[gpu].nlinks; l++) { @@ -1391,54 +1399,55 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c } } - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) { totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw; } *count = netCountByBw; - free(localNets); return ncclSuccess; } ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { - ncclResult_t ret = ncclSuccess; int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); - int* localNets; + + int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); - int* localGpus = NULL; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); + if (localNetCount==0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; - int net; - NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail); - net = system->nodes[GPU].nodes[gpu].gpu.dev; + NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + + int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); net += channelId%(DIVUP(localNetCount,localGpuCount)); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; -exit: - free(localNets); - if (localGpus) free(localGpus); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex)); - int* localGpus = NULL; + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL)); + int foundGpu = -1; - NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].nodes+g; int64_t id; - NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); if (netId == id) { foundGpu = g; goto exit; @@ -1447,8 +1456,6 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i } exit: *gpuIndex = foundGpu; -fail: - free(localGpus); return ret; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 8e7cda5..2be029b 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -16,6 +16,7 @@ #define SM80_NVLINK_BW 
20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 +#define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 #define AMD_BW 16.0 @@ -91,7 +92,8 @@ struct ncclTopoLink { float bw; struct ncclTopoNode* remNode; }; -#define NCCL_TOPO_MAX_LINKS 128 +// Allows for up to 32 NICs per node on GB200-NVL72 +#define NCCL_TOPO_MAX_LINKS 576 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) struct ncclTopoLinkList { @@ -172,6 +174,8 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 @@ -230,6 +234,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id // Returns NVLink bw in GB/s static float ncclTopoNVLinkBw(int cudaCompCap) { return + cudaCompCap >= 100 ? SM100_NVLINK_BW : cudaCompCap >= 90 ? SM90_NVLINK_BW : cudaCompCap == 86 ? SM86_NVLINK_BW : cudaCompCap >= 80 ? SM80_NVLINK_BW : diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index f5f2e11..8da4aeb 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -145,28 +145,33 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = #define VOLTA_COMPCAP_IDX 0 #define AMPERE_COMPCAP_IDX 1 #define HOPPER_COMPCAP_IDX 2 +#define BLACKWELL_COMPCAP_IDX 3 // LL128 max BW per channel -static const double llMaxBws[3][3] = { +static const double llMaxBws[][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, - /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0} + /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}, + /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, }; -static const double perChMaxRingLL128Bws[3][3] = { +static const double perChMaxRingLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7}, }; -static const double perChMaxTreeLL128Bws[3][3] = { +static const double perChMaxTreeLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0}, }; -static const double perChMaxTreeBws[3][3] = { +static const double perChMaxTreeBws[][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, + /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, }; NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); @@ -207,7 +212,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; + int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? 
HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : @@ -418,7 +423,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); @@ -427,6 +432,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom case 70: pEnable &= 1; break; case 80: pEnable &= 1; break; case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; + case 100: pEnable &= 1; break; + case 120: pEnable &= 1; break; default: pEnable &= 0; break; } } diff --git a/src/include/alloc.h b/src/include/alloc.h index 7744119..021c91f 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -204,14 +204,13 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { return result; } -static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, CUmemAllocationHandleType type, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; CUdevice currentDev; CUmemAllocationProp prop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; - CUmemAllocationHandleType type = ncclCuMemHandleType; int cudaDev; int flag = 0; CUDACHECK(cudaGetDevice(&cudaDev)); @@ -260,7 +259,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { extern int ncclCuMemEnable(); -static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } @@ -288,7 +287,7 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } @@ -312,7 +311,7 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } @@ -336,7 +335,7 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } diff --git a/src/include/device.h b/src/include/device.h index 
0c861f5..3f918ab 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -59,8 +59,8 @@ union ncclLLFifoLine { }; #define WARP_SIZE 32 -#define MAXCHANNELS 32 -#define NCCL_MAX_LOCAL_RANKS 64 +#define MAXCHANNELS 64 +#define NCCL_MAX_LOCAL_RANKS 72 #define NCCL_MAX_NTHREADS 640 #define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS 512 @@ -187,8 +187,6 @@ struct ncclNvls { int down; int treeUp; int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; - int node; - int nNodes; }; #if __CUDA_ARCH__ >= 900 diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 3eb6c07..5337eeb 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -17,7 +17,7 @@ #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) #define NCCL_BYTES_ALIGNMENT 16 -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); diff --git a/src/include/graph.h b/src/include/graph.h index 602cc8c..a22b62b 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -28,7 +28,8 @@ void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink); + ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology diff --git a/src/include/mnnvl.h b/src/include/mnnvl.h new file mode 100644 index 0000000..dedbefe --- /dev/null +++ b/src/include/mnnvl.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_MNNVL_H_ +#define NCCL_MNNVL_H_ + +#include "nccl.h" +#include "comm.h" + +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm); + +#endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 14b317f..5d00f07 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -30,6 +30,7 @@ #define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 +#define NVTX_SID_CommFinalize 14 // Define static schema ID for the reduction operation. #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START @@ -38,11 +39,13 @@ extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; +/// @brief Register an NVTX payload schema for static-size payloads. 
class payload_schema { public: - explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept + explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, + const uint64_t schemaId, const size_t size) noexcept { - schema_attr.name = schemaName; + schema_attr.payloadStaticSize = size; schema_attr.entries = entries; schema_attr.numEntries = numEntries; schema_attr.schemaId = schemaId; @@ -63,26 +66,84 @@ class payload_schema { NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, - nullptr, + nullptr, /* schema name is not needed */ NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, nullptr, 0, 0, 0, 0, nullptr}; }; +// Convenience macro to give the payload parameters a scope. +#define NVTX3_PAYLOAD(...) __VA_ARGS__ + // Create NVTX push/pop range with parameters -// @param name of the operation (see `NVTX_SID_*`) -// @param N schema name -// @param S schema (entries) -// @param P payload (struct) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - static const payload_schema schema{S, std::extent::value, \ - NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ +// @param N NCCL API name without the `nccl` prefix. +// @param T name of the used NVTX payload schema without "Schema" suffix. +// @param P payload parameters/entries +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{T##Schema, std::extent::value - 1, \ + schemaId, sizeof(T)}; \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; +/// @brief Creates an NVTX range with extended payload using the RAII pattern. +/// @tparam PayloadType Data type of the payload. +template +class ncclNvtxRange { + public: + explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept { + nvtxDomainRangePushEx(nvtx3::domain::get(), evtAttr); + } + + ~ncclNvtxRange() noexcept { + if (payloadData.payload) { + nvtxRangePopPayload(nvtx3::domain::get(), &payloadData, 1); + } else { + nvtxDomainRangePop(nvtx3::domain::get()); + } + } + + void setPayloadData(const uint64_t schemaId) noexcept + { + payloadData = {schemaId, sizeof(PayloadType), &payload}; + } + + ncclNvtxRange() = delete; + ncclNvtxRange(ncclNvtxRange const&) = default; + ncclNvtxRange& operator=(ncclNvtxRange const&) = default; + ncclNvtxRange(ncclNvtxRange&&) = default; + ncclNvtxRange& operator=(ncclNvtxRange&&) = default; + + // Holds the payload data. + PayloadType payload{}; + + private: + nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL}; +}; + +// Create an NVTX range with the function name as the range name. Use RAII pattern. +// @param T Type ID of the NVTX payload (pointer for variable-size payloads). 
+#define NVTX3_RANGE(T) \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ncclNvtxRange nvtx3_range__{nvtx3_func_attr__.get()}; + +// Add static-size payload to the NVTX range created with `NVTX3_RANGE()`, +// which must be in this or an outer scope. +// @param N NCCL API name without the `nccl` prefix. +// @param S name of the used NVTX payload schema. +// @param P payload parameters/entries +#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{S, std::extent::value - 1, schema_id, \ + sizeof(nvtx3_range__.payload)}; \ + nvtx3_range__.payload = {P}; \ + nvtx3_range__.setPayloadData(schema_id); \ +} while (0) + extern void initNvtxRegisteredEnums(); #endif diff --git a/src/include/nvtx3/nvToolsExtPayloadHelper.h b/src/include/nvtx3/nvToolsExtPayloadHelper.h index 304d5d6..0f0c87d 100644 --- a/src/include/nvtx3/nvToolsExtPayloadHelper.h +++ b/src/include/nvtx3/nvToolsExtPayloadHelper.h @@ -11,7 +11,7 @@ /* This is just an empty marker (for readability), which can be omitted. */ /* TODO: Fix issue with trailing comma at end of entry list. */ -#define NVTX_PAYLOAD_ENTRIES +#define NCCL_NVTX_PAYLOAD_ENTRIES /** @@ -32,7 +32,7 @@ * * Example: * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (index, TYPE_INT, "integer value"), * (dpfloat, TYPE_DOUBLE, "fp64 value"), * (text, TYPE_CSTRING, "text", NULL, 24) @@ -80,7 +80,7 @@ * * Example: * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (int, index, TYPE_INT, "integer value"), * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h new file mode 100644 index 0000000..228a192 --- /dev/null +++ b/src/include/nvtx_payload_schemas.h @@ -0,0 +1,125 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +/// Definitions of NVTX payload types and schemas used for the NVTX +/// instrumentation in init.cc and collectives.cc. + +#ifndef NVTX_PAYLOAD_SCHEMAS_H_ +#define NVTX_PAYLOAD_SCHEMAS_H_ + + +#include "nccl.h" +#include "nvtx3/nvToolsExtPayload.h" +#include "nvtx3/nvToolsExtPayloadHelper.h" + +/** + * \brief Define a C struct together with the matching schema entries. + * + * Does the same as `NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but without creating the + * schema attributes. (Remove this helper when it is available in the NVTX headers.) + */ +#define NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(struct_id, prefix, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) + +// C strings used as NVTX payload entry names. +static constexpr char const* nccl_nvtxCommStr = "NCCL communicator ID"; +static constexpr char const* nccl_nvtxCudaDevStr = "CUDA device"; +static constexpr char const* nccl_nvtxRankStr = "Rank"; +static constexpr char const* nccl_nvtxNranksStr = "No. 
of ranks"; +static constexpr char const* nccl_nvtxMsgSizeStr = "Message size [bytes]"; +static constexpr char const* nccl_nvtxReductionOpStrpStr = "Reduction operation"; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, commhash, TYPE_UINT64, nccl_nvtxCommStr), + (int, ndev, TYPE_INT, "No. of devices") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr) + ) +) +// The typedef and payload schema for ncclCommInitRank is also used for, +// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort. +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (uint64_t, parentcomm, TYPE_UINT64, "Parent NCCL communicator ID"), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr), + (int, color, TYPE_INT, "Color"), + (int, key, TYPE_INT, "Key") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +// Used in NCCL APIs `ncclSend` and `ncclRecv`. 
+NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, peer, TYPE_INT, "Peer rank") + ) +) + +#endif // end include guard diff --git a/src/include/proxy.h b/src/include/proxy.h index b6ef0fa..c97a4d7 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -363,6 +363,8 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); + +// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, diff --git a/src/init.cc b/src/init.cc index 5caaaae..3e218ab 100644 --- a/src/init.cc +++ b/src/init.cc @@ -18,6 +18,7 @@ #include "argcheck.h" #include "tuner.h" #include "ras.h" +#include "mnnvl.h" #include #include #include @@ -27,6 +28,7 @@ #include #include #include "param.h" +#include "nvtx_payload_schemas.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -213,6 +215,7 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); + free(comm->clique.ranks); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -530,6 +533,7 @@ static void showVersion() { } } +NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1); NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { @@ -564,12 +568,16 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + if (ncclParamMNNVLUUID() != -1) { + ((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID(); + ((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID(); + } + if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } - if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); } return ncclSuccess; @@ -638,71 +646,6 @@ NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); -#if CUDART_VERSION >= 11030 - -#include -#include "cudawrap.h" - -// Determine if MNNVL support is available -static int checkMNNVL(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - - // MNNVL requires cuMem to be enabled - if (!ncclCuMemEnable()) return 0; - - // MNNVL also requires FABRIC handle support - int cudaDev; - int flag = 0; - CUdevice currentDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported - (void) CUPFN(cuDeviceGetAttribute(&flag, 
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; - if (!flag) return 0; - // Check that all ranks have initialized the fabric fully - for (int i = 0; i < comm->nRanks; i++) { - if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0; - } - - // Determine our MNNVL domain/clique - NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail); - comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; - for (int i = 0; i < comm->nRanks; i++) { - nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; - nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; - // Check if the cluster UUID and cliqueId match - // A zero UUID means we don't have MNNVL fabric info - disable MNNVL - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail; - if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && - (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { - if (i == comm->rank) { - comm->cliqueRank = comm->clique.size; - } - comm->clique.ranks[comm->clique.size++] = i; - } - } - // Determine whether to enable MNNVL or not - comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable(); - INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); - - if (comm->MNNVL) { - // Force the CUMEM handle type to be FABRIC for MNNVL - ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; - } - - return comm->MNNVL; - -fail: - if (comm->clique.ranks) free(comm->clique.ranks); - return 0; -} - -#else -static int checkMNNVL(struct ncclComm* comm) { - return 0; -} -#endif - #define TIMER_INIT_TOTAL 0 #define TIMER_INIT_KERNELS 1 #define TIMER_INIT_BOOTSTRAP 2 @@ -782,12 +725,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // AllGather1 - end timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; - // MNNVL support - if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { - // Return an error if the user specifically requested MNNVL support - WARN("MNNVL is not supported on this system"); - ret = ncclSystemError; - goto fail; + // Check for MNNVL support + if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) { + NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail); } do { @@ -1079,7 +1019,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->collNetSupport = 0; } } - comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo); + NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink)); comm->isOneRPN = (comm->maxLocalRanks == 1); NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); @@ -1406,18 +1346,20 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; + int maxSharedMem = 0; double sum_timers = 0; uint64_t timers[TIMERS_INIT_COUNT] = {0}; unsigned long long commIdHash; timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); + CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 
10*archMinor; timers[TIMER_INIT_KERNELS] = clockNano(); - NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); + NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { @@ -1533,18 +1475,24 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { comm->config.cgaClusterSize = cgaClusterSizeEnv; } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { - WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); + INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; } minCTAsEnv = ncclParamMinCTAs(); if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.minCTAs = minCTAsEnv; + if (minCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs); + else + comm->config.minCTAs = minCTAsEnv; } maxCTAsEnv = ncclParamMaxCTAs(); if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.maxCTAs = maxCTAsEnv; + if (maxCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs); + else + comm->config.maxCTAs = maxCTAsEnv; } envNetName = ncclGetEnv("NCCL_NET"); @@ -1565,22 +1513,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { - WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.minCTAs = MAXCHANNELS; } if (comm->config.maxCTAs > MAXCHANNELS) { - WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.maxCTAs = MAXCHANNELS; } if (comm->config.minCTAs > comm->config.maxCTAs) { - WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); + INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); comm->config.minCTAs = comm->config.maxCTAs; } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { - WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); + INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } @@ -1763,20 +1711,9 @@ fail: goto exit; } -struct NvtxParamsCommInitRank -{ - int rank; - int nranks; - int cudaDev; -}; -constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)}, -}; - NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + NVTX3_RANGE(NcclNvtxParamsCommInitRank) // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); @@ -1784,10 +1721,11 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm ncclConfig_t config = NCCL_CONFIG_INITIALIZER; CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__)); + + NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + return ncclSuccess; } @@ -1799,10 +1737,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclConfig_t config = NCCL_CONFIG_INITIALIZER; int oldDev = 0; - constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} - }; - NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev) + NVTX3_RANGE(NcclNvtxParamsCommInitAll); // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); @@ -1840,14 +1775,17 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclUniqueId uniqueId; NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); - NCCLCHECKGOTO(ncclGroupStart(), ret, fail); + NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail); for (int i=0; icommHash, ndev)); exit: (void)cudaSetDevice(oldDev); @@ -1873,14 +1811,14 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr = NULL; + + NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig); + NCCLCHECK(ncclGroupStartInternal()); (void)ncclCudaLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -1890,7 +1828,13 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -1899,6 +1843,8 @@ fail: NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config); ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) { + NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable); + int 
cudaDev; ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; @@ -1908,9 +1854,6 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran (void)ncclCudaLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -1920,7 +1863,13 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -1980,7 +1929,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NVTX3_RANGE(NcclNvtxParamsCommFinalize); + ncclResult_t ret = ncclSuccess; struct ncclCommFinalizeAsyncJob *job = NULL; @@ -2005,7 +1955,13 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); } + if (comm) { + if (!comm->config.blocking) { + NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); + } + NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema, + NVTX3_PAYLOAD(comm->commHash)); + } return ret; fail: free(job); @@ -2077,8 +2033,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) + NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); NCCLCHECK(ncclGroupStartInternal()); @@ -2105,8 +2061,9 @@ fail: NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { + NVTX3_RANGE(NcclNvtxParamsCommAbort); + if (comm == NULL) { - NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } NCCLCHECK(ncclGroupStartInternal()); @@ -2127,8 +2084,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) + NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); @@ -2144,29 +2101,13 @@ fail: goto exit; } -struct NvtxParamsCommSplit { - int rank; - int nranks; - int cudaDev; - int color; - int key; -}; -constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)}, -}; - NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key}; - NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload) + NVTX3_RANGE(NcclNvtxParamsCommSplit) int oldDev; CUDACHECK(cudaGetDevice(&oldDev)); @@ -2224,6 +2165,12 @@ exit: (void)cudaSetDevice(oldDev); (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); + + if (res == ncclSuccess && *newcomm) { + NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); + } + return res; fail: if (childComm) { diff --git a/src/mnnvl.cc b/src/mnnvl.cc new file mode 100644 index 0000000..07e8b21 --- /dev/null +++ b/src/mnnvl.cc @@ -0,0 +1,82 @@ +/************************************************************************* + * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "mnnvl.h" +#include "transport.h" +#include +#include "cudawrap.h" + +// Determine if MNNVL support is available +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { + // MNNVL requires cuMem to be enabled + if (!ncclCuMemEnable()) return ncclSuccess; + + // MNNVL also requires FABRIC handle support + int cudaDev; + int flag = 0; + CUdevice currentDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); + if (!flag) return ncclSuccess; + // Check that all ranks have initialized the fabric fully + for (int i = 0; i < comm->nRanks; i++) { + if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess; + } + + // Determine our MNNVL domain/clique + NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks)); + comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; + for (int i = 0; i < comm->nRanks; i++) { + nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; + nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; + // Check if the cluster UUID and cliqueId match + // A zero UUID means we don't have MNNVL fabric info - disable MNNVL + if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && + (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { + if (i == comm->rank) { + comm->cliqueRank = comm->clique.size; + } + comm->clique.ranks[comm->clique.size++] = i; + } + } + + // No MNNVL clique found + if (comm->clique.size <= 1) return ncclSuccess; 
+
+  // Check that FABRIC handles can be exported & imported by IMEX
+  {
+    void *ptr = NULL;
+    CUmemGenericAllocationHandle handle;
+    ncclCuDesc cuDesc;
+    CUresult err;
+
+    // Allocate FABRIC handle compatible memory
+    ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN);
+    if (ret != ncclSuccess) return ncclSuccess;
+    err = CUPFN(cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
+    if (err != CUDA_SUCCESS ||
+        (err = CUPFN(cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC))) != CUDA_SUCCESS) {
+      const char *errStr;
+      (void) pfn_cuGetErrorString(err, &errStr);
+      NCCLCHECK(ncclCuMemFree(ptr));
+      // Return an error if this is a MNNVL capable system but it's not working
+      WARN("MNNVL (cliqueSize %d) is available but not supported on this system. Check the IMEX configuration.",
+           comm->clique.size);
+      return ncclSystemError;
+    }
+    NCCLCHECK(ncclCuMemFree(ptr));
+
+    // Force the CUMEM handle type to be FABRIC for MNNVL
+    ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
+    comm->MNNVL = 1;
+    INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d",
+         comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
+  }
+  return ncclSuccess;
+}
diff --git a/src/proxy.cc b/src/proxy.cc
index bd8188a..5a83ef3 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -21,6 +21,8 @@
 #include
 #include
+#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1)
+
 enum { proxyRecv=0, proxySend=1 };
 void* ncclProxyServiceUDS(void* _args);
@@ -770,8 +772,8 @@ process_nextops:
     ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
     ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
     TIME_START(2);
-    int freeOp[NCCL_MAX_LOCAL_RANKS];
-    int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
+    int freeOp[NCCL_MAX_PROXY_CONNECTIONS];
+    int freeOpEnd[NCCL_MAX_PROXY_CONNECTIONS];
     for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1;
     uint64_t lastOpCount = 0;
@@ -1060,7 +1062,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
   struct ncclProxyState* sharedProxyState = comm->proxyState;
   int tpProxyRank = comm->topParentRanks[proxyRank];
-  proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+  proxyConn->sameProcess = ((comm->peerInfo[proxyRank].hostHash == comm->peerInfo[comm->rank].hostHash) &&
+    (comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash)) ?
1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; @@ -1193,7 +1196,7 @@ fail: goto exit; } -const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd", "QueryFd", "Register", "Deregister" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; @@ -1552,18 +1555,18 @@ void* ncclProxyService(void* _args) { connectionPool.banks = 0; connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; - struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; - struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; - memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); - for (int s=0; slistenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { + if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_PROXY_CONNECTIONS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; - pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; + pollfds[NCCL_MAX_PROXY_CONNECTIONS].events = POLLIN; int maxnpeers = 0; int npeers = 0; @@ -1577,17 +1580,19 @@ void* ncclProxyService(void* _args) { /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { - ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500); + // poll all fds including the listenSock + ret = poll(pollfds, NCCL_MAX_PROXY_CONNECTIONS+1, asyncOpCount ? 0 : 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service] Poll failed: %s", strerror(errno)); return NULL; } - if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { + if (pollfds[NCCL_MAX_PROXY_CONNECTIONS].revents) { + // We got an event on the listenSock int s = 0; - while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++; - if (s == NCCL_MAX_LOCAL_RANKS) { - WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); + while (s < NCCL_MAX_PROXY_CONNECTIONS && pollfds[s].fd >= 0) s++; + if (s == NCCL_MAX_PROXY_CONNECTIONS) { + WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_PROXY_CONNECTIONS); return NULL; } if (maxnpeers < s+1) maxnpeers = s+1; @@ -1819,6 +1824,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { + // We need to send a ncclProxyMsgStop message to our own proxy struct ncclSocket sock; int type = ncclProxyMsgStop; NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 414a1ed..3e4e9a5 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -80,7 +80,7 @@ static int rasOutBufferSize = 0; // We use them all over the place; no point in wasting the stack... 
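
The proxy-service change above grows the pollfd array from NCCL_MAX_LOCAL_RANKS to NCCL_MAX_PROXY_CONNECTIONS while keeping the listening socket in the last slot, so an event on that slot always means a new incoming connection. The sketch below is not part of the patch; MAX_CONNECTIONS and serviceLoop are hypothetical stand-ins for the NCCL constants and service thread, and error handling is reduced to early returns.

// Minimal sketch of the polling pattern used by the proxy service thread.
#include <poll.h>
#include <sys/socket.h>

#define MAX_CONNECTIONS 65  // stands in for NCCL_MAX_PROXY_CONNECTIONS

static void serviceLoop(int listenFd, volatile int* abortFlag) {
  struct pollfd pollfds[MAX_CONNECTIONS + 1];
  for (int s = 0; s < MAX_CONNECTIONS; s++) { pollfds[s].fd = -1; pollfds[s].events = POLLIN; }
  pollfds[MAX_CONNECTIONS].fd = listenFd;       // listening socket lives in the last slot
  pollfds[MAX_CONNECTIONS].events = POLLIN;

  while (!*abortFlag) {
    // Bounded timeout so the loop can notice abortFlag even when idle.
    int ret = poll(pollfds, MAX_CONNECTIONS + 1, 500);
    if (ret < 0) return;
    if (pollfds[MAX_CONNECTIONS].revents) {
      // New connection: find a free peer slot, or give up if all slots are taken.
      int s = 0;
      while (s < MAX_CONNECTIONS && pollfds[s].fd >= 0) s++;
      if (s == MAX_CONNECTIONS) return;          // too many connections
      pollfds[s].fd = accept(listenFd, nullptr, nullptr);
    }
    // ... service pollfds[0..MAX_CONNECTIONS-1].revents for established peers ...
  }
}
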
static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers - // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS) + // or for printing the local GPU devices, which can't be more than 64 // small numbers (times two if the NVML mask is different than the CUDA mask). // Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). @@ -1687,7 +1687,7 @@ static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) { bool first = true; buf[0] = '\0'; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(cudaDevs)*8; i++) if (cudaDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; @@ -1695,7 +1695,7 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, if (cudaDevs != nvmlDevs) { snprintf(buf+strlen(buf), size-strlen(buf), " (NVML "); first = true; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(nvmlDevs)*8; i++) if (nvmlDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 68cac0b..715fff4 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -78,7 +78,7 @@ struct rasCollResponse { struct rasPeerInfo { union ncclSocketAddress addr; pid_t pid; - uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64. + uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. }; diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index 4282dc9..2ab7e94 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -73,15 +73,19 @@ ncclResult_t ncclRegisterCollNvlsBuffers( if (nvlsReged) { *regNeedConnect = 0; - /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to - * saturate bandwidth. */ + /* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */ if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); - else - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + if (info->func == ncclFuncReduceScatter) { + // RS: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5)); + } + else { + // AR/AG: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4)); + } } else { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + // Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 
7 : 6)); } info->regBufType |= NCCL_NVLS_REG_BUFFER; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 582c30a..3fe25a3 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -141,9 +141,11 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, #include "channel.h" #define NVLS_MEM_ALIGN_SIZE (1 << 21) +#define NVLS_NCHANNELS_SM90 16 +#define NVLS_NCHANNELS_SM100 32 NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); -NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); +NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { @@ -152,7 +154,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { int gpuCount; NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); - if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess; + if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess; CUdevice dev; int driverVersion; @@ -170,7 +172,11 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { } INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); - if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); + if (comm->nvlsSupport) { + int channels = (comm->compCap >= 100) ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90; + if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels(); + comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels)); + } return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 3ae514e..dac7621 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -213,7 +213,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe // cuMem API support CUmemGenericAllocationHandle handle; - NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); + NCCLCHECK(ncclCuMemAlloc(ptr, &handle, type, size)); if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle)); @@ -816,7 +816,7 @@ ncclResult_t ret = ncclSuccess; if (isLegacyIpc) *isLegacyIpc = false; if (regRecord) { // buffer was registered by by users, we need to start to register or reuse it - int peerLocalRank; + int peerLocalRank = -1; for (int p = 0; p < nPeers; p++) { int peerRank = peerRanks[p]; peerLocalRank = comm->rankToLocalRank[peerRank]; @@ -886,8 +886,10 @@ ncclResult_t ret = ncclSuccess; ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side // and get the remote register address back. 
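
The NVLS tuning above introduces a per-architecture default channel count (16 on sm_90, 32 on sm_100 and newer), honors NCCL_NVLS_NCHANNELS only when it is non-negative (the new default of -2 means "auto"), and clamps the result to the communicator's CTA limits. A minimal sketch of that selection, using a hypothetical pickNvlsChannels helper in place of the ncclParam machinery and comm fields:

// Minimal sketch of the NVLS channel-count selection described above.
#include <algorithm>
#include <cstdlib>

static int pickNvlsChannels(int compCap, int minCTAs, int maxCTAs) {
  int channels = (compCap >= 100) ? 32 : 16;      // NVLS_NCHANNELS_SM100 / NVLS_NCHANNELS_SM90
  if (const char* env = std::getenv("NCCL_NVLS_NCHANNELS")) {
    int requested = std::atoi(env);
    if (requested >= 0) channels = requested;     // negative values keep the per-arch default
  }
  return std::max(minCTAs, std::min(maxCTAs, channels));
}
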
-  if (proxyConn)
+  if (proxyConn) {
+    INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank);
     NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
+  }
   if (rmtRegAddr) {
     NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
     assert(regRecord->ipcInfos[peerLocalRank] == NULL);
@@ -905,7 +907,7 @@ ncclResult_t ret = ncclSuccess;
       regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
       needUpdate = true;
       *regBufFlag = 1;
-      INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
+      INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
     }
   }
 }
@@ -1039,6 +1041,8 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
   assert(sizeof(struct p2pIpcExpInfo) == reqSize);
   assert(sizeof(void*) == respSize);
+  INFO(NCCL_REG, "Proxy rank %d register reqBuff %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, reqBuff, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
+
   // request peer passes all necessary buffer info to import. The proxy thread would register
   // the buffer locally and return register addr back
   if (ipcExpInfo->legacyIpcCap) {
@@ -1070,7 +1074,7 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
     CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail);
     regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
   }
-  INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
+  INFO(NCCL_REG, "Proxy rank %d register success regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
 exit:
   memcpy(respBuff, (void*)&regAddr, sizeof(void*));
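
The FABRIC handle path touched throughout this patch (ncclCuMemAlloc now takes an explicit handle type, and src/mnnvl.cc probes whether a FABRIC handle survives an export/import round trip before enabling MNNVL) can be reproduced with the raw CUDA driver API. The sketch below is an assumption-laden illustration rather than the NCCL implementation: the fabricHandlesWork helper is hypothetical, it requires CUDA 12.3+ with fabric handle support, and it allocates only one granularity-sized chunk for the probe.

// Minimal sketch of an IMEX/FABRIC handle self-test using the CUDA driver API.
#include <cuda.h>

static bool fabricHandlesWork(int cudaDev) {
  if (cuInit(0) != CUDA_SUCCESS) return false;

  CUdevice dev;
  int supported = 0;
  if (cuDeviceGet(&dev, cudaDev) != CUDA_SUCCESS) return false;
  if (cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev) != CUDA_SUCCESS ||
      !supported) return false;

  // Request memory that is shareable via FABRIC handles.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = cudaDev;
  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

  size_t gran = 0;
  if (cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) return false;

  CUmemGenericAllocationHandle handle, imported;
  if (cuMemCreate(&handle, gran, &prop, 0) != CUDA_SUCCESS) return false;

  // Export the handle and try to import it back; failure here is what the patch
  // reports as "available but not supported ... check the IMEX configuration".
  CUmemFabricHandle fh;
  bool ok = cuMemExportToShareableHandle(&fh, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS &&
            cuMemImportFromShareableHandle(&imported, &fh, CU_MEM_HANDLE_TYPE_FABRIC) == CUDA_SUCCESS;
  if (ok) (void)cuMemRelease(imported);
  (void)cuMemRelease(handle);
  return ok;
}
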