diff --git a/ext-net/README.md b/ext-net/README.md index aa1a394..90fe89b 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,20 +60,20 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v9) +# API (v10) -Below is the main `ncclNet_v9` struct. Each function is explained in later sections. +Below is the main `ncclNet_v10` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,13 +83,13 @@ typedef struct { // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -98,10 +98,10 @@ typedef struct { ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -200,6 +200,9 @@ the plugin code adding the following definitions: #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) ``` +The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and +record its own events with the NCCL profiler plugin. + `devices` Once the plugin is initialized, NCCL will query the number of devices available. It should not @@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. +The `connect` API takes a `ncclNetCommConfig_t`, which contains a `trafficClass` field. +This field can be used by the network plugin to specify the QoS level of the connection. By default, +`trafficClass` is set to -1 but can be configured by the application during communicator initialization +to select a plugin-supported QoS level. + `closeListen`/`closeSend`/`closeRecv` Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call @@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later. +The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin +to support network-defined events. + `irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument @@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to completions on such irecvs (for example, complete the request immediately). The plugin is still expected to set a valid request pointer on return which NCCL can poll to check for completion. +The `pHandle` argument allows NCCL to pass an array of opaque handles that can be used by the +network plugin to support network-defined events. + Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 112967a..85ea79e 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -2,14 +2,15 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/ -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ +#ifndef NET_H_ +#define NET_H_ #include <stdint.h> #include <stdlib.h> #include "common.h" #include "err.h" +#include "net_device.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -22,6 +23,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + +#include "net_v10.h" #include "net_v9.h" #include "net_v8.h" #include "net_v7.h" @@ -31,4 +35,9 @@ #include "net_v3.h" #include "net_v2.h" +typedef ncclNet_v10_t ncclNet_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index 874fb59..d693101 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h new file mode 100644 index 0000000..809e7c0 --- /dev/null +++ b/ext-net/example/nccl/net_v10.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v2.h b/ext-net/example/nccl/net_v2.h index 0d9c906..dd9f39b 100644 --- a/ext-net/example/nccl/net_v2.h +++ b/ext-net/example/nccl/net_v2.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ -#ifndef NCCL_NET_V2_H_ -#define NCCL_NET_V2_H_ +#ifndef NET_V2_H_ +#define NET_V2_H_ typedef struct { // Name of the network (mainly for logs) diff --git a/ext-net/example/nccl/net_v3.h b/ext-net/example/nccl/net_v3.h index db1287b..9002165 100644 --- a/ext-net/example/nccl/net_v3.h +++ b/ext-net/example/nccl/net_v3.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V3_H_ -#define NCCL_NET_V3_H_ +#ifndef NET_V3_H_ +#define NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 diff --git a/ext-net/example/nccl/net_v4.h b/ext-net/example/nccl/net_v4.h index efe4824..41cef56 100644 --- a/ext-net/example/nccl/net_v4.h +++ b/ext-net/example/nccl/net_v4.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V4_H_ -#define NCCL_NET_V4_H_ +#ifndef NET_V4_H_ +#define NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 diff --git a/ext-net/example/nccl/net_v5.h b/ext-net/example/nccl/net_v5.h index b96b6fc..47f446c 100644 --- a/ext-net/example/nccl/net_v5.h +++ b/ext-net/example/nccl/net_v5.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V5_H_ -#define NCCL_NET_V5_H_ +#ifndef NET_V5_H_ +#define NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h index fffaf8c..de90f29 100644 --- a/ext-net/example/nccl/net_v6.h +++ b/ext-net/example/nccl/net_v6.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V6_H_ -#define NCCL_NET_V6_H_ - -#define NCCL_NET_MAX_REQUESTS_V6 8 +#ifndef NET_V6_H_ +#define NET_V6_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h index d607095..3802a3d 100644 --- a/ext-net/example/nccl/net_v7.h +++ b/ext-net/example/nccl/net_v7.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V7_H_ -#define NCCL_NET_V7_H_ - -#include "net_device.h" +#ifndef NET_V7_H_ +#define NET_V7_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 54a61f6..74eb72d 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V8_H_ -#define NCCL_NET_V8_H_ - -#include "net_device.h" +#ifndef NET_V8_H_ +#define NET_V8_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index 61035ec..ca60ad6 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -2,18 +2,14 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V9_H_ -#define NCCL_NET_V9_H_ - -#include "net_device.h" +#ifndef NET_V9_H_ +#define NET_V9_H_ #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 typedef struct { int ndevs; int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; } ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; typedef struct { char* name; // Used mostly for logging. 
@@ -35,8 +31,6 @@ typedef struct { size_t maxCollBytes; // Max transfer size for collective operations } ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; @@ -93,7 +87,7 @@ typedef struct { // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller // what index this new vNIC exists at - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); } ncclNet_v9_t; #endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 2852242..97a2987 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { #define PLUGIN_NAME "Plugin" -ncclNet_v9_t ncclNetPlugin_v9 = { +const ncclNet_v10_t 
ncclNetPlugin_v10 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = { .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { + return pluginInit(logFunction, NULL); +} + +__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { + return pluginGetProperties(dev, (ncclNetProperties_t*)props); +} + +__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ + return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); +} + +__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request); +} + +__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); +} + +__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; } + +const ncclNet_v9_t ncclNetPlugin_v9 = { + .name = PLUGIN_NAME, + .init = pluginInit_v9, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v9, + .listen = pluginListen, + .connect = pluginConnect_v9, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v9, + .irecv = pluginIrecv_v9, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v9, +}; + __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); @@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr } __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { - return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); + return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request); } __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; for (int i=0; i<n; i++) sizesOut[i] = sizes[i]; return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, NULL, request); } static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { diff --git a/makefiles/common.mk b/makefiles/common.mk index 1b1bb86..545203a 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -16,6 +16,7 @@ WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 +NET_PROFILER ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -137,3 +138,7 @@ endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif + +ifneq ($(NET_PROFILER), 0) +CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index b02cf90..df3ee5c 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 25 -NCCL_PATCH := 1 +NCCL_MINOR := 26 +NCCL_PATCH := 2 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b66ebef..65da630 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,11 +10,15 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ $(wildcard register/*.cc) \ + $(wildcard plugin/*.cc) \ + $(wildcard plugin/net/*.cc) \ + $(wildcard plugin/tuner/*.cc) \ + $(wildcard plugin/profiler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -49,6 +53,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl +INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest @@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 675bcfc..9e24faa 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -153,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -167,8 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - size_t size64 = size; - NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -484,7 +484,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { if (devOOB < 0) { pthread_mutex_lock(&bootstrapNetLock); if (devOOB < 0) { - char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME"); + const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv); bool searchNot = userIfEnv && userIfEnv[0] == '^'; @@ -540,7 +540,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -736,6 +736,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { rasRanks[rank].pid = getpid(); rasRanks[rank].cudaDev = comm->cudaDev; rasRanks[rank].nvmlDev = comm->nvmlDev; + rasRanks[rank].hostHash = getHostHash(); + 
rasRanks[rank].pidHash = getPidHash(); if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); // We should still participate in the ringAllInfo below as the peers will be waiting for us. @@ -967,7 +969,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s NCCLCHECK(socketAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size); NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail); - NCCLCHECK(ncclSocketClose(&sock)); + NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail); return ret; fail: (void)ncclSocketClose(&sock); @@ -1062,7 +1064,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manber, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ - int data[1]; + int data[1] = {0}; for (int mask = 1; mask < nranks; mask <<= 1) { int src = (rank - mask + nranks) % nranks; int dst = (rank + mask) % nranks; diff --git a/src/channel.cc b/src/channel.cc index b3a8f29..bc48986 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; - - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + cudaStream_t deviceStream; + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e.
network) @@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { - NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ - NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream)); ncclCommPushCudaFree(comm, channel->devPeers); NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers)); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr; } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); /* guarantee addr has been copied into channel->devPeers */ + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); - return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; + cudaStream_t deviceStream; if (channel->nvlsPeers != NULL) return ncclSuccess; @@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); int nvlsRanks = comm->localRanks; @@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks)); - NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream)); for (int r 
= 0; r < nvlsRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } @@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; + cudaStream_t deviceStream; if (channel->collnetPeers != NULL) return ncclSuccess; @@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); - NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } diff --git a/src/debug.cc b/src/debug.cc index 2ea6eab..2eb8d77 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -6,6 +6,7 @@ #include "core.h" #include "nccl_net.h" +#include #include #include #include @@ -16,6 +17,11 @@ #include "param.h" int ncclDebugLevel = -1; +static uint32_t ncclDebugTimestampLevels = 
0; // bitmaps of levels that have timestamps turned on +static char ncclDebugTimestampFormat[256]; // with space for subseconds +static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts +static uint64_t ncclDebugTimestampMaxSubseconds; // Max number of subseconds plus 1, used in duration ratio +static int ncclDebugTimestampSubsecondDigits; // Number of digits to display static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; @@ -112,6 +118,84 @@ static void ncclDebugInit() { ncclWarnSetDebugInfo = value; } + // Determine which debug levels will have timestamps. + const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + if (timestamps == nullptr) { + ncclDebugTimestampLevels = (1< sizeof(ncclDebugTimestampFormat) - 1) { + // Won't fit; fall back on the default. + break; + } + ncclDebugTimestampSubsecondsStart = i; + ncclDebugTimestampMaxSubseconds = 1; + + memcpy(ncclDebugTimestampFormat, tsFormat, i); + for (int j=0; j>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", - hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); } + len = std::min(len, sizeof(buffer)-1); // prevent overflows + // Add the message as given by the call site. va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len >= sizeof(buffer)) len = sizeof(buffer)-1; - if (len) { - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + // Rewind len so that we can replace the final \0 by "\n" + len = std::min(len, sizeof(buffer)-1); // prevent overflows + + // Add a newline and write it to the debug file. No terminating null is + // necessary since we write bytes instead of the string. + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } NCCL_API(void, ncclResetDebugInit); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 5d79d73..854ebbf 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -67,7 +67,7 @@ namespace { offset = dataOffset + rankDest * count; // Final wait/copy. 
- prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else if (inputBuf != outputBuf + ringRanks[0] * count) { inputBuf = inputBuf + partOffset; @@ -111,25 +111,63 @@ struct RunWorkColl struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; ipatSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); - PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend); + if (tid == nworkers) { // Algo computation thread + PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatAg); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patCopy(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step 
+= nGroups; + } } +#endif } }; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 2161597..81da554 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -78,7 +78,7 @@ namespace { offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } @@ -132,7 +132,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -215,7 +215,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -710,7 +710,7 @@ struct RunWorkCollchannels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } diff --git a/src/device/primitives.h b/src/device/primitives.h index 73c10c2..3b9f169 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -12,7 +12,7 @@ #include "common_kernel.h" #include "common.h" -#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 +#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primitives class instead of integral @@ -115,7 +115,7 @@ struct PrimitivesWithoutDirect { __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } - __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) { + __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { @@ -139,6 +139,18 @@ struct PrimitivesWithoutDirect { } }; +__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) { + if (abortCache & abortValue) return 1; + if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0; + spins = 0; + int abort = *ncclShmem.comm.abortFlag; + if (abort) { + ncclShmem.aborted = abort; + abortCache |= abortValue; + } + return abort; +} + #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 3e00f3b..2a0f556 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -51,23 +51,14 @@ class Primitives: } } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ?
stepLines*sizeof(union ncclLLFifoLine) : nbytes; @@ -102,7 +93,7 @@ class Primitives: int spins = 0; do { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; @@ -126,7 +117,7 @@ class Primitives: int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 617b7ac..6985e67 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -53,23 +53,14 @@ class Primitives: barrier_sync(15-group, nthreads); } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int i, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, wid, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes; @@ -201,7 +192,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, 0, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll @@ -248,7 +239,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, i, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 0051019..cf3ba9b 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -52,7 +52,7 @@ class Primitives< uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; - uint64_t accSize; // Accumulated size. 
Used by PAT operations + uint64_t accSize; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -70,6 +70,11 @@ class Primitives< } } + // PAT uses a single barrier across all groups + __device__ void patBarrier() { + barrier_sync(15, NCCL_PAT_NWORKERS); + } + __device__ bool barrierAny(int vote) { if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); @@ -87,18 +92,6 @@ class Primitives< } } - inline __device__ bool checkAbort(int &spins) { - spins++; - if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - flags |= Aborted; - ncclShmem.aborted = 1; - } - spins = 0; - } - return flags & Aborted; - } - inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { @@ -121,7 +114,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } @@ -338,13 +331,8 @@ public: peerPtr->recv[connIndex].step += steps; st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step); while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) { - if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - ncclShmem.aborted = 1; - break; - } - spins = 0; - } + int abort = 0; + if (checkAbort(abort, 1, spins)) break; } } @@ -359,7 +347,7 @@ public: int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; } void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts : ncclShmem.groups[group].srcs; @@ -601,13 +589,13 @@ private: tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { - // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); - int peer = -1; flags = 0; index = -1; if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers + // For send operations, we need an extra warp to overlap the threadfence and the copy + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); + int nrecv=0, nsend=0; // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] @@ -637,68 +625,84 @@ private: if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + + // Coverity thinks that index could be -1 here but that's not actually the case. + // coverity[negative_returns:FALSE] + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? 
p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); + + if (barrierAny(flags & NetDeviceUnpack)) { + flags |= AnyNetDeviceUnpack; + // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers + // have NetDeviceUnpack. + uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); + if (tid == 0) { + ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + } + } + + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n flags |= PatMode; - accSize = 0; + const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput }; + if (tid < 5) flags |= roles[tid]; + int nranks = ncclShmem.comm.nRanks; - int rank = ncclShmem.comm.rank; - // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer. - index = tid % 32; - uint32_t delta = 1 << index; - const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv}; - int block = tid / 32; - if (block < 4 && delta < nranks) { - int role = roles[block]; - if (mode == primsModePatRs) { - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks; - } else if (mode == primsModePatAg) { - if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks; - } - flags |= role; - } else if (tid == 128) { - flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation + if (tid < 32 && ((1UL<conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv; + peer->step = conn->step; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->tailPtr = conn->tail); + peer->headPtr = conn->head; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); + // Load send peer + int sendPeer = mode == primsModePatAg ? 
(rank - delta + nranks) % nranks : (rank + delta) % nranks; + peer = ((struct ncclPatPeer*)sendPeers)+tid; + conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend; + peer->step = conn->step; + peer->connFifo = conn->connFifo; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->headPtr = conn->head); + peer->tailPtr = conn->tail; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); } - } - - // Coverity thinks that index could be -1 here but that's not actually the case. - // coverity[negative_returns:FALSE] - int sendIpcReg; - int recvIpcReg; - int sendNetReg; - int recvNetReg; - if (P2p) { - sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; - recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; - sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; - recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; - } else { - recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; - recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; - } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); - - if (barrierAny(flags & NetDeviceUnpack)) { - flags |= AnyNetDeviceUnpack; - // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers - // have NetDeviceUnpack. - uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); - if (tid == 0) { - ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + if (tid==0) { + ncclShmem.groups[group].userInput = (void*)inputBuf; + ncclShmem.groups[group].userOutput = (void*)outputBuf; + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } + patBarrier(); } - - // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case - // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); - // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { + if (flags&PatMode) return; // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) conn->step = step; if ((flags & NetRegMode) && (flags & RoleWaitSend)) { @@ -708,7 +712,7 @@ private: uint64_t prevStep = step - StepPerSlice; volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size); int spins = 0; - while (*ptr != -1) if (checkAbort(spins)) break; + while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break; } if (flags & NetDeviceUnpack) { @@ -726,7 +730,7 @@ private: int spins = 0; volatile uint64_t* tail = conn->tail; volatile uint64_t* head = conn->head; - while (*tail > *head) if (checkAbort(spins)) break; + while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break; } } @@ -749,7 +753,7 @@ private: if (slot) { T* exchgPtr; directBuff = (T*)outputBuf; - while (*slot != nullptr && !checkAbort(spins)); + while (*slot != nullptr && !checkAbort(flags, Aborted, spins)); if (P2p) { exchgPtr = (T*)outputBuf; } else { @@ -766,7 +770,7 @@ private: void* ptr; while 
(slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot) { @@ -785,7 +789,7 @@ private: // Wait for consumer to consume previous value before trampling it. if (slot && argSlot0 && argSlot1) { T* exchgPtr; - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins)); + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; @@ -815,7 +819,7 @@ private: void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot && argSlot0 && argSlot1) { @@ -826,7 +830,7 @@ private: while (true) { arg0 = *argSlot0; arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break; } ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } @@ -866,8 +870,8 @@ private: __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp); + __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); @@ -945,54 +949,65 @@ private: ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } - __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset; - int spins = 0; - while (connStepCache < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - if (postRecv) step += StepPerSlice; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { - int spins = 0; - while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset; - if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { - // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize; - if (flags & ConnFifoEnabled) - connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - } else { - // There is already data in there, accumulate instead of writing to it. - ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; - } - if (postSend) step += StepPerSlice; + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; } - if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer - ncclShmem.groups[group].dsts[0] = userOutput + outIx; - if (accSize < outIx + nelem) { + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; + int spins = 0; + while (peer->stepCache < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + } + if (send && (flags & RoleWaitSend)) { + int spins = 0; + while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = outIx + nelem; + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; } else { // There is already data in there, accumulate instead of writing to it. ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } } - barrier(); + long long int localAccSize = shmem->localAccSize; + if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer + ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx; + if (localAccSize < ps->outIx + nelem) { + // New data, add our own data to it. 
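// (Sketch of the accSize bookkeeping used here, with an illustrative name
// "firstTouch" that does not appear in this file: accSize records the highest
// offset already produced into this destination during the current operation,
// so
//   bool firstTouch = accSize < offset + nelem;
//   srcs[1] = firstTouch ? userInput + inpIx  // seed the chunk with our data
//                        : dsts[0];           // otherwise reduce into it
// the first writer of a chunk contributes fresh input; later writers
// accumulate into what is already there.)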
+ ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; + localAccSize = ps->outIx + nelem; + } else { + // There is already data in there, accumulate instead of writing to it. + ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; + } + } + patBarrier(); int nSrcs = 2; void** srcs = ncclShmem.groups[group].srcs; - if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source + if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1000,59 +1015,92 @@ private: (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } - __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset; - int spins = 0; - while (connStepCache < step + recvStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { - // New data, copy to our output buffer. 
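// (Flow-control sketch for the wait loops in patCopy/patReduce above: the
// connection FIFO has NCCL_STEPS slots, so a sender may run at most
// NCCL_STEPS steps ahead of the receiver's head pointer, while a receiver
// waits for the producer's tail to pass the step it wants to read:
//   send: while (headCache + NCCL_STEPS < step + StepPerSlice) headCache = *headPtr;
//   recv: while (tailCache < step + StepPerSlice) tailCache = *tailPtr;
// "headCache"/"tailCache" are illustrative names for peer->stepCache; the
// cache avoids re-reading the volatile location once the condition holds.)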
- ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize; - } else { - ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done - } - if (postRecv) step += StepPerSlice; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { - int spins = 0; - while (connStepCache + NCCL_STEPS < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset; - if (postSend) { - if (flags & ConnFifoEnabled) - connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - step += StepPerSlice; - } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; } - if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer - ncclShmem.groups[group].srcs[0] = userInput + inpIx; - if (accSize < inpIx + nelem) { + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; + int spins = 0; + while (peer->stepCache < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, copy to our output buffer. - ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = inpIx + nelem; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; } else { ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done } } - barrier(); + if (send && (flags & RoleWaitSend)) { + int spins = 0; + while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + } + long long int localAccSize = shmem->localAccSize; + if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer + ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx; + if (localAccSize < ps->inpIx + nelem) { + // New data, copy to our output buffer. + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; + localAccSize = ps->inpIx + nelem; + } else { + // Already done + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; + } + } + patBarrier(); int nDsts = 2; void** dsts = ncclShmem.groups[group].dsts; - if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest + if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done. int workSize = ncclShmem.aborted ? 
0 : nelem;
@@ -1061,9 +1109,32 @@ private:
       (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, 1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
-    barrier();
-    if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
-    if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
+    // Store conn step here inside the two barriers to make sure next reload will see the update.
+    if (postSend && (flags & RolePostSend)) {
+      if (peer->connFifo) {
+        peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T);
+      }
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op
+    }
+
+    // Update accSize
+    if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize);
+    if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize);
+
+    patBarrier();
+
+    if (postSend && (flags & RolePostSend)) {
+      if (nelem > 0 || peer->connFifo) fence_acq_rel_sys();
+      st_relaxed_sys_global(peer->tailPtr, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      st_relaxed_sys_global(peer->headPtr, step);
+    }
   }
 };
diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h
index 70538b1..5d8de28 100644
--- a/src/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -80,29 +80,66 @@ struct RunWorkColl
 struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+#if __CUDA_ARCH__ >= 600
     using Proto = ProtoSimple<1, 1>;
     const int nranks = ncclShmem.comm.nRanks;
     const int rank = ncclShmem.comm.rank;
     size_t count, channelOffset, channelCount, chunkCount;
     ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
-    T *inputBuf = (T*)work->sendbuff;
-    T *outputBuf = (T*)work->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs);
+    static constexpr int nworkers = NCCL_PAT_NWORKERS;
+    struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
+    uint64_t pollCount = 0;
+    __syncthreads(); // Don't start using shared mem until everyone arrives
+    for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
+    if (tid == 0) shmem->localAccSize = 0;
+    if (tid == nworkers) shmem->parallelFactor = 0;
+    __syncthreads();
-    PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
-    int last = 0;
-    while (!last) {
-      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
-      size_t inpIx, outIx;
-      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
-      prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
+    if (tid == nworkers) { // Algo computation thread
+      PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
+      int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      int step = 0;
+      while (1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        patAlgo.getNextOp(ps);
+        int last = ps->last;
+        step++;
+        if (last == 2) break;
+      }
+    } else if (tid < nworkers) { // Worker threads
+      T *inputBuf = (T*)work->sendbuff;
+      T *outputBuf = (T*)work->recvbuff;
+      int parallelFactor = 0;
+      volatile int* pfPtr = &shmem->parallelFactor;
+      while (parallelFactor == 0) parallelFactor = *pfPtr;
+
+      int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE;
+      int group = tid / groupSize;
+      int nGroups = nworkers / groupSize;
+      int tidInGroup = tid - group*groupSize;
+      // We don't use recvPeers/sendPeers so let's pass shmem structs instead
+      Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+        (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs);
+
+      int step = group;
+      while(1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread
+        int last = ps->last;
+        prims.patReduce(ps, shmem);
+        if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread
+        if (last) break;
+        step += nGroups;
+      }
+    }
+#endif
   }
 };
-
 template
 struct RunWorkColl {
   __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h
index fe3b9ca..f36a511 100644
--- a/src/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -41,7 +41,7 @@ struct RunWorkBatch (maxSharedMem-attr.sharedSizeBytes)) {
-    if (print++ == 0)
-      INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
-        sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
-    // Reduce requested MaxDynamicSharedMemorySize attribute
-    sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
+    WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
+      cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
+    return ncclSystemError;
   }
   CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
@@ -388,6 +385,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     struct ncclTaskColl* next = aggBeg->next;
     aggBeg->algorithm = agg.algorithm;
     aggBeg->protocol = agg.protocol;
+    if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4;
     aggBeg->nMaxChannels = agg.nMaxChannels;
     aggBeg->nWarps = agg.nWarps;
     aggBeg->devFuncId = agg.devFuncId;
@@ -478,6 +476,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
   return ncclSuccess;
 }

+static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) {
+  int tmp = op->pattern;
+  op->pattern = ncclPatternProfiler;
+  ncclResult_t ret = addProxyOpIfNeeded(comm, plan, op);
+  op->pattern = tmp;
+  return ret;
+}
+
 static ncclResult_t scheduleCollTasksToPlan(
   struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget
 ) {
@@ -550,11 +556,16 @@ static ncclResult_t scheduleCollTasksToPlan(
       proxyOp.opCount = proxyOpId;
       proxyOp.task.coll = task;
       proxyOp.rank = comm->rank;
+      proxyOp.eActivationMask = task->eActivationMask;
+      proxyOp.workCounter = ++comm->profiler.workCounter[c];
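// (What the two profiler fields above enable, assuming a profiler plugin is
// attached: eActivationMask carries the event types the profiler enabled, and
// workCounter is a per-channel, monotonically increasing id shared between a
// kernel work item and its proxy op, so events from both sides can be joined
// on the (channel, workCounter) pair. Sketch of the invariant:
//   proxyOp.workCounter == comm->profiler.workCounter[c]  // right after ++
// how these values are consumed lives in the profiler code, not in this patch.)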
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4; size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); @@ -669,11 +680,14 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->ringAlgo->incRefCount(); } + proxyOp->eActivationMask = task->eActivationMask; + proxyOp->workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. // coverity[uninit_use_in_call:FALSE] NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp)); } } @@ -797,7 +811,8 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); for (int part = 0; part < nChannelsMax; part++) { @@ -888,6 +903,7 @@ static ncclResult_t addP2pToPlan( op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; + op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } @@ -898,7 +914,6 @@ static ncclResult_t addP2pToPlan( plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; @@ -935,9 +950,12 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } + comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1157,22 +1175,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; + cudaStream_t deviceStream; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); - // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the - // user's graph will be launched later, and it also acquires the deviceStream, - // it will observe this upload. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); + // Acquire deviceStream. 
Since the user's graph will be launched later and it also + // acquires the deviceStream, it will observe this upload. + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail); cudaEvent_t memcpyDone; CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail); NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; @@ -1180,7 +1199,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla cleanup->hostBuf = fifoBufHost; ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: @@ -1254,14 +1273,15 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { - comm->persistentRefs -= 1; + comm->sharedRes->persistentRefs -= 1; + comm->localPersistentRefs -= 1; if (plan->workStorageType == ncclDevWorkStorageTypePersistent) { cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -1317,6 +1337,28 @@ static void persistentDestructor(void* plans_) { } } +NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0); + +namespace { + enum ncclImplicitOrder { + ncclImplicitOrderNone, + ncclImplicitOrderSerial, + ncclImplicitOrderLaunch + }; +} + +static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { + if (ncclParamLaunchOrderImplicit()) { + // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs + if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } + if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + *mode = 12030 <= std::min(CUDART_VERSION, driver) ? 
ncclImplicitOrderLaunch : ncclImplicitOrderSerial; + return ncclSuccess; + } + *mode = ncclImplicitOrderNone; + return ncclSuccess; +} + ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1364,58 +1406,60 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (nPlans == 0) return ncclSuccess; - // Semantically we want these dependencies for the kernels launched: - // 1. Launch host task on hostStream. - // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} - // 3. {deviceStream, userStream[i]...} depend on kernel. - // We achieve this by: - // 1. userStream[0] waits on deviceStream - // 2. deviceStream waits on each of userStream[1...] - // 3. host task launch on hostStream - // 4. userStream[0] waits on hostStream - // 5. kernel launch on userStream[0] - // 6. deviceStream waits on userStream[0] - // 7. userStream[1...] each waits on deviceStream - // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires - // at least one of the two streams to be strong-stream. cudaStream_t launchStream = planner->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t deviceStream, launchOrder; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure); - // Create dependency for device stream on user streams. First from extra user - // streams to deviceStream. Then deviceStream to first user stream. + // userStream[0] waits on each userStream[i]... for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure); + CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); + // userStream[0] waits on deviceStream + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); + + if (implicitOrder != ncclImplicitOrderNone) { + // userStream[0] waits on per-device (context) launchOrder. Concurrent strong stream access is + // required if this is a graph capture, non-captured cannot be concurrent because that would violate + // deterministic program order of launches. + bool concurrent = capturing; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); + } + + if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. 
We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; + cudaStream_t hostStream; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure); } } if (persistent) { - comm->persistentRefs += nPlans; + comm->sharedRes->persistentRefs += nPlans; + comm->localPersistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - failure: return result; } @@ -1434,6 +1478,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; int nChannels = countOneBits(plan->channelMask); void* sym = plan->kernelFn; @@ -1447,18 +1492,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan CU_LAUNCH_PARAM_END }; - CUfunction fn; - CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); - - #if CUDART_VERSION >= 11080 int driverVersion; - NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); - if (driverVersion >= 11080) { + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return); + + CUfunction fn; + CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return); + + if (CUDART_VERSION >= 11080 && driverVersion >= 11080) { + #if CUDART_VERSION >= 11080 int compCap = comm->compCap; unsigned int clusterSize = (compCap >= 90) ? 
comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[3]; + CUlaunchAttribute launchAttrs[4] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1485,6 +1531,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif + #if CUDART_VERSION >= 12030 + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + if (implicitOrder == ncclImplicitOrderLaunch) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; + launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; + launchAttrs[attrs].value.launchCompletionEvent.flags = 0; + attrs++; + } + #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; launchConfig.gridDimZ = grid.z; @@ -1496,15 +1553,15 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); - CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); - return ncclSuccess; - } + CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif - // Standard kernel launch - CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); - //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); - return ncclSuccess; + } else { + // Standard kernel launch + CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return); + } + +do_return: + return ret; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { @@ -1524,34 +1581,39 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { - ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch - // Create dependency for deviceStream on launchStream. We know that deviceStream - // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), - // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); - resume1: - // Create dependency for other user streams (skip launch stream) on deviceStream. - // Again, the user streams haven't been touched since deviceStream waited on them - // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = planner->streams->next; - planner->streams = nullptr; // Reset comm->planner.streams to empty. 
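// (The event-based pattern used below replaces per-edge strong-stream waits:
//   CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream));
//   CUDACHECK(cudaStreamWaitEvent(otherStream, comm->sharedRes->scratchEvent, 0));
// one event record plus one wait per stream rebuilds the same dependency
// fan-out; sketch only, with the surrounding error handling elided.)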
- while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); - resume2: - sl = sl->next; + cudaStream_t deviceStream, launchOrder; + CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + // deviceStream waits on userStream[0] + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); + CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // Each userStream[i] waits on userStream[0] + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); } - // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); - resume3:; + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECK(getImplicitOrder(&implicitOrder, capturing)); + if (implicitOrder != ncclImplicitOrderNone) { + // As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured. + bool concurrent = capturing; + // Incorporate launch event into per-device (context) launch order. + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); + // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + // Release launchOrder as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); + } + // Release deviceStream as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false)); } - return result; + return ncclSuccess; } /*****************************************************************************/ @@ -1655,11 +1717,11 @@ static ncclResult_t topoGetAlgoInfo( if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { char ncclAlgoEnvStr[1024] = ""; char ncclProtoEnvStr[1024] = ""; - char* algoEnv = getenv("NCCL_ALGO"); + const char* algoEnv = ncclGetEnv("NCCL_ALGO"); if (algoEnv) { snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - char* protoEnv = getenv("NCCL_PROTO"); + const char* protoEnv = ncclGetEnv("NCCL_PROTO"); if (protoEnv) { snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); } @@ -2007,7 +2069,7 @@ static ncclResult_t hostToDevRedOp( uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); bool datatype_signed = false; - + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2097,6 +2159,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { p2p->datatype = info->datatype; p2p->root = info->root; p2p->bytes = nBytes; + p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); ncclIntruQueueEnqueue( isSendNotRecv ? 
&planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); @@ -2105,6 +2168,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Mark channels that need pre-connect if (comm->rank != peer) { if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; int round = 0; while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank @@ -2115,12 +2179,17 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. + comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; comm->connectRecv[peer] |= (1UL<opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 64fc1c5..76b508c 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -390,7 +390,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { + if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 587a8b2..ace4476 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -376,9 +376,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; +const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { - *useGdr = 0; +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { + *gdrMode = ncclTopoGdrModeDisable; // Get GPU and NET int n, g; @@ -418,25 +421,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead - int proxyRank, g; + int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, 
gpu->gpu.rank, netId, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); - struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; - distance = proxyGpu->paths[NET][n].type; + gpu = system->nodes[GPU].nodes+g; + distance = gpu->paths[NET][n].type; } + + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + distance = PATH_C2C; + } + if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } - *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + // Force PCIe mapping if path goes through PCI on a C2C system + if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; + else *gdrMode = ncclTopoGdrModeDefault; + + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { int netNum = system->nodes[NET].count; - int useGdr = 0; + enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable; *avail = false; for (int n = 0; n < netNum; n++) { int64_t netId = system->nodes[NET].nodes[n].id; @@ -469,6 +484,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; + // On C2C platforms, data could go through a PCI switch while completions and + // flags would go through C2C. In that case, force a flush. + int c, n; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { + *flush = 1; + } return ncclSuccess; } @@ -538,7 +561,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (comm && ncclNetVersion(comm) == 4) { + if (comm && comm->ncclNetVer == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -561,9 +584,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; - int useGdr; + enum ncclTopoGdrMode useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); - if (useGdr == 0) continue; + if (useGdr == ncclTopoGdrModeDisable) continue; int found = 0; for (int r=0; rpaths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. - int gdr; + enum ncclTopoGdrMode gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU @@ -862,3 +885,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink *allNvLink = maxPath >= PATH_PIX ? 
0 : 1;
   return ncclSuccess;
 }
+
+// Check whether we are in a split NVLink situation, with two NVLink domains, not
+// connected through NVLink (e.g. QPI).
+ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) {
+  ncclResult_t res = ncclSuccess;
+  int nvlDomains = 0;
+  int *nvlDomain = NULL, *nvlDomainCount = NULL;
+  // Compute NVLink domains
+  NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit);
+  for (int g=0; g<system->nodes[GPU].count; g++) nvlDomain[g] = g;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    int domain = nvlDomain[g];
+    for (int p=g+1; p<system->nodes[GPU].count; p++) {
+      if (gpu->paths[GPU][p].type == PATH_NVL) {
+        nvlDomain[p] = domain;
+      }
+    }
+  }
+  // Compute number of GPUs per NVLink domain.
+  NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit);
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    nvlDomainCount[nvlDomain[g]]++;
+  }
+  // Count the number of NVLink domains
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (nvlDomainCount[g] > 1) nvlDomains++;
+  }
+  *splitNvLink = nvlDomains == 2 ? 1 : 0;
+
+exit:
+  if(nvlDomain) free(nvlDomain);
+  if(nvlDomainCount) free(nvlDomainCount);
+  return res;
+}
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 0185b3f..15a0124 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -446,12 +446,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 // 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
 // 2. add other NETs satisfying typeInter but not already in the list.
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
   ncclResult_t ret = ncclSuccess;
   int netCount = 0;
   int localNetCount;
-  int* localNets;
-  NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));
+  int localNets[MAXCHANNELS];

   // First add the preferred NICs
   for (int g=0; g<system->nodes[GPU].count; g++) {
@@ -460,8 +459,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
     for (int c = 0; c<MAXCHANNELS; c++) {
       int64_t netId;
-      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
-      NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
+      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
       if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
       localNetCount++;
     }
@@ -469,7 +468,7 @@
 for (int i=0; iintra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
   int g = gpu - system->nodes[GPU].nodes;
-  int* nets = NULL;
+  int nets[NCCL_TOPO_MAX_NODES];
   if (step == backToNet) {
     // first get back to NIC
     if (system->nodes[NET].count) {
       int startNetIndex;
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
       int netCount;
-      NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-      NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
       for (int i=0;
inodes[NET].nodes+n; @@ -555,14 +548,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->bwInter /= 2; } - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } @@ -601,21 +594,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } -exit: - if (nets) free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - ncclResult_t ret = ncclSuccess; const int bw = graph->bwInter; - int* nets; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + int nets[NCCL_TOPO_MAX_NODES]; int netCount; int graphFound = 0; - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; @@ -639,7 +626,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo // NVLS search only tries to find NIC:GPU combinations to compute the heads. 
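// (Context for the head selection below: an NVLS "head" is the GPU fronting
// one NIC; when a single GPU connects to several NICs, the duplicate check
// keeps that GPU from being recorded as a head more than once.)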
if (graph->nChannels < netCount) { int gpu; - NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail); + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -650,7 +637,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } if (!duplicate) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); graphFound = 1; } } @@ -659,14 +646,14 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels > 0) { // Try to replay the last channel int g; - NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail); - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } @@ -686,7 +673,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } @@ -700,11 +687,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } -exit: - free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } /* Search Patterns @@ -999,6 +982,15 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->minChannels = graph->maxChannels; } + int splitNvLink; + NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink)); + if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) { + // We have two sockets with NVLink and a slower link in between (typically QPI). + // Tree is likely going to work better but it needs at least 2 channels. + // Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels. 
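// (Example: two CPU sockets, each an NVLink island, joined only by QPI/UPI.
// ncclTopoSplitNvLink then reports exactly two NVLink domains, and the line
// below raises Ring's minChannels from 1 to 2 so that Tree, which must use
// the same channel count as Ring, still gets the two channels it requires.)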
+ if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2; + } + struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index ba82caf..9499f39 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -22,8 +22,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -45,7 +45,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) return ncclSuccess; } -static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { +static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) { *cpu = NULL; if (node->type == CPU) { *cpu = node; @@ -54,9 +54,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* for (int l=0; lnlinks; l++) { // Go up the PCI tree to find the CPU. Follow only PCI switches. if (node->links[l].type == LINK_PCI + && node->links[l].remNode != from && (node->links[l].remNode->type == PCI || node->links[l].remNode->type == CPU)) { - NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node)); } if (*cpu != NULL) return ncclSuccess; } @@ -77,13 +78,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; + *bw = + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW : + BDW_QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { *bw = AMD_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_MODEL_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + cpu->cpu.model = + (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP : + (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP : + (familyId == 6 && modelId >= 0x55) ? 
@@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
     int familyId, modelId;
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
-    cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+    cpu->cpu.model =
+      (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP :
+      (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP :
+      (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_MODEL_INTEL_SKL :
+      NCCL_TOPO_CPU_MODEL_INTEL_BDW;
   } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
     int familyId, modelId;
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
-    if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
+    if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG;
   }
 }
 for (int s=0; s<xmlCpu->nSubs; s++) {
@@ -565,7 +574,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
       NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
     } else if (targetType == CPU) {
       // NVL connection to the local CPU
-      NCCLCHECK(findLocalCpu(gpu, &remote));
+      NCCLCHECK(findLocalCpu(gpu, &remote, NULL));
     } else {
       if (system->nodes[NVS].count == 0) {
         NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
@@ -642,10 +651,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
     NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
     double c2cBw = (bw*count)/1000.0;
     struct ncclTopoNode* cpu = NULL;
-    NCCLCHECK(findLocalCpu(gpu, &cpu));
+    NCCLCHECK(findLocalCpu(gpu, &cpu, NULL));
     if (cpu == NULL) return ncclSuccess;
-    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
-    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw));
   } else {
     if (strcmp(node->name, "cpu") == 0) {
       NCCLCHECK(ncclGetSystemId(system, node, &systemId));
@@ -961,26 +970,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*
   // Trigger the merge, then get the new device's properties
   int vDevIndex = 0;
   ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
-  if (ret == ncclInvalidUsage) {
-    WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC");
-    NCCLCHECK(ret);
+  if (ret != ncclSuccess) {
+    INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
+      vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
+    return ret;
   }

   INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
   return ncclSuccess;
 }
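The `ncclTopoForceMerge` rework that follows also fixes a subtle bug: the old signature took a mutable `char* str` and ran `strtok_r` directly on it, but that string comes from the `NCCL_NET_FORCE_MERGE` environment variable, and `strtok_r` writes NUL terminators into whatever buffer it scans. The new code therefore tokenizes a heap copy (`ncStr`, built with `ncclCalloc`/`strcpy`) and frees it on exit. A minimal illustration of the same pattern, with a hypothetical helper name:

```
// Tokenizing a private copy leaves the original string (e.g. the process
// environment) untouched; strtok_r mutates the buffer it scans.
#include <stdlib.h>
#include <string.h>

static void parseGroups(const char* env) {
  char* copy = strdup(env);            // writable private copy
  if (copy == NULL) return;
  char* state;
  for (char* tok = strtok_r(copy, ";", &state); tok != NULL;
       tok = strtok_r(NULL, ";", &state)) {
    // handle one comma-delimited NIC group here, e.g. "eth0,eth1"
  }
  free(copy);
}
```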
-ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  ncclResult_t ret = ncclSuccess;
   INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
+  char* ncStr;
+  NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1));
+  strcpy(ncStr, str);
   char* semi_token;
-  char* semi = strtok_r(str, ";", &semi_token);
+  char* semi = strtok_r(ncStr, ";", &semi_token);
   while (semi) {
     TRACE(NCCL_NET, "Fusing %s", semi);
     struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
     int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
     if (nUserIfs == 0) {
       INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.",
-        str, semi);
+        ncStr, semi);
       continue;
     }

@@ -994,26 +1008,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str,
     if (vProps.ndevs != nUserIfs) {
       WARN("TOPO/NET : Only matched %d devices, %d requested from %s",
         vProps.ndevs, nUserIfs, semi);
-      return ncclInvalidUsage;
+      ret = ncclInvalidUsage;
+      goto fail;
     }

     if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
       WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
-      return ncclInvalidUsage;
+      ret = ncclInvalidUsage;
+      goto fail;
     }

     struct ncclXmlNode* netNode;
-    NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice));
-
-    // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
-    for (int i = 0; i < vProps.ndevs; i++) {
-      placedDevs[vProps.devs[i]] = 1;
+    ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
+    if (ret == ncclSuccess) {
+      // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
+      for (int i = 0; i < vProps.ndevs; i++) {
+        placedDevs[vProps.devs[i]] = 1;
+      }
+    } else {
+      WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi);
+      ret = ncclInvalidUsage;
+      goto fail;
     }

     semi = strtok_r(NULL, ";", &semi_token);;
   }

-  return ncclSuccess;
+exit:
+  free(ncStr);
+  return ret;
+fail:
+  goto exit;
 }

 ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
@@ -1061,7 +1086,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe
     }

     struct ncclXmlNode* netNode;
-    NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out);
+    ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
+
+    // Merging failed.
+ // Mark all as unplaced and increase their distance to disconnected (PATH_DIS) + // Set i to 0 to restart the automatic merging process and ensure all are placed + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search."); + placedDevs[i] = 0; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i); + for (int k = 1; k < vProps.ndevs; k++) { + int dev = vProps.devs[k]; + placedDevs[dev] = 0; + paths[i*nPhysDevs + dev] = PATH_DIS; + paths[dev*nPhysDevs + i] = PATH_DIS; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i); + } + i = 0; + } } } @@ -1125,16 +1167,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ // By default, don't merge any devices int mergeLevel; mergeLevel = PATH_PORT; - char* mergeLevelEnv; - mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge; - forceMerge = getenv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); + { // Avoids warnings related to jumping to "out" + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } } NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); diff --git a/src/graph/topo.h b/src/graph/topo.h index 2be029b..921a7f5 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,9 +18,11 @@ #define SM86_NVLINK_BW 12.0 #define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 -#define QPI_BW 6.0 #define AMD_BW 16.0 +#define BDW_QPI_BW 6.0 #define SKL_QPI_BW 10.0 +#define SRP_QPI_BW 22.0 +#define ERP_QPI_BW 40.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 @@ -44,12 +46,13 @@ extern const char* topoNodeTypeStr[]; #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB -#define LINK_PCI 3 -// Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PXN -// Skipping 6 for PATH_PHB -#define LINK_SYS 7 -#define LINK_NET 8 +#define LINK_C2C 3 +#define LINK_PCI 4 +// Skipping 5 for PATH_PXB +// Skipping 6 for PATH_PXN +// Skipping 7 for PATH_PHB +#define LINK_SYS 8 +#define LINK_NET 9 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -61,29 +64,32 @@ extern const char* topoLinkTypeStr[]; // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 +// Connection through C2C +#define PATH_C2C 3 + // Connection traversing at most a single PCIe bridge -#define PATH_PIX 3 +#define PATH_PIX 4 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) -#define PATH_PXB 4 +#define PATH_PXB 5 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 
-#define PATH_PXN 5 +#define PATH_PXN 6 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 6 +#define PATH_PHB 7 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 7 +#define PATH_SYS 8 // Connection through the network -#define PATH_NET 8 +#define PATH_NET 9 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 9 +#define PATH_DIS 10 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -103,9 +109,6 @@ struct ncclTopoLinkList { int type; }; -#define NCCL_TOPO_CPU_INTEL_BDW 1 -#define NCCL_TOPO_CPU_INTEL_SKL 2 - #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff @@ -176,6 +179,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8da4aeb..68085b8 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -177,6 +177,7 @@ static const double perChMaxTreeBws[][3] = { NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); static int ncclPatEnable(struct ncclComm* comm) { int patEnable = ncclParamPatEnable(); + if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics if (patEnable != 2) return patEnable; if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload @@ -257,7 +258,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
-        if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
+        if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
         if (a == NCCL_ALGO_PAT) busBw *= .75;
         if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
diff --git a/src/group.cc b/src/group.cc
index e387db7..c48c0de 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -193,7 +193,6 @@ fail:

 static ncclResult_t doLaunches(struct ncclComm* head) {
   ncclResult_t result = ncclSuccess;
-  struct ncclComm* cliqueComm0 = head->intraComm0;
   struct ncclComm* cliqueHead = head;
   struct ncclComm* cliqueNextHead;
   bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
@@ -209,7 +208,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
       NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
       if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
       comm = comm->groupNext;
-    } while (comm != nullptr && comm->intraComm0 == cliqueComm0);
+    } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
     cliqueNextHead = comm;

     if (capturingYes && capturingNo) {
@@ -424,38 +423,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf

     /* Connect channels at runtime if cumem is supported */
     if (groupCommHeadMain != nullptr) {
-      struct ncclComm* comm = groupCommHeadMain;
+      struct ncclComm* cliqueHead = groupCommHeadMain;
+      struct ncclComm* comm = NULL;
       struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
       ncclIntruQueueConstruct(&asyncCollJobs);
       do {
-        bool needConnect = false;
-        bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
-        memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+        // We need to preconnect connections for collectives clique by clique to avoid
+        // race condition for split shared comms which can connect the same connections
+        // at the same time.
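The comment above is the heart of this group.cc change: preconnect jobs used to be queued for every comm in the group at once, so two comms from different cliques that share transport resources (split shared comms) could race while connecting the same connections. The new code walks the group one clique at a time, launching and draining one clique's jobs before starting the next. A stripped-down sketch of that traversal, assuming comms of a clique (same `intraComm0`) sit adjacent in the list; the names mirror the diff but the helper is hypothetical:

```
// Hypothetical, simplified clique-by-clique walk over the group list.
struct Comm { Comm* groupNext; Comm* intraComm0; };

void forEachClique(Comm* head, void (*process)(Comm*)) {
  Comm* cliqueHead = head;
  while (cliqueHead != nullptr) {
    Comm* comm = cliqueHead;
    do {                         // queue work for one clique
      process(comm);
      comm = comm->groupNext;
    } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
    // Launch and drain this clique's jobs here, before touching the next
    // clique, so shared connections are never set up concurrently.
    cliqueHead = comm;           // first comm of the next clique (or null)
  }
}
```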
+        comm = cliqueHead;
+        do {
+          bool needConnect = false;
+          bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
+          memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);

-        CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-        NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
+          CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
+          NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);

-        if (comm->cuMemSupport && needConnect) {
-          struct ncclPreconnectJob* job;
-          NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
-          job->base.func = ncclCollPreconnectFunc;
-          job->base.undo = nullptr;
-          job->base.destructor = free;
-          job->base.state = ncclGroupJobRunning;
-          job->base.abortFlag = comm->abortFlag;
-          job->base.abortFlagDev = comm->abortFlagDev;
-          job->comm = comm;
-          NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
-          memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
-          ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
+          if (comm->cuMemSupport && needConnect) {
+            struct ncclPreconnectJob* job;
+            NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
+            job->base.func = ncclCollPreconnectFunc;
+            job->base.undo = nullptr;
+            job->base.destructor = free;
+            job->base.state = ncclGroupJobRunning;
+            job->base.abortFlag = comm->abortFlag;
+            job->base.abortFlagDev = comm->abortFlagDev;
+            job->comm = comm;
+            NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
+            memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+            ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
+          }
+          comm = comm->groupNext;
+        } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
+        // connect
+        NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
+        while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
+          struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
+          if (job->destructor) job->destructor((void*)job);
         }
-        comm = comm->groupNext;
-      } while (comm);
-      NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
-      while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
-        struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
-        if (job->destructor) job->destructor((void*)job);
-      }
+        cliqueHead = comm;
+      } while (cliqueHead != nullptr);

       // done with all buffer allocation, start registration and enqueue
       comm = groupCommHeadMain;
diff --git a/src/include/bitops.h b/src/include/bitops.h
index a650aa7..dcf0e2e 100644
--- a/src/include/bitops.h
+++ b/src/include/bitops.h
@@ -8,6 +8,7 @@
 #define NCCL_BITOPS_H_

 #include <stdint.h>
+#include <string.h>

 #if !__NVCC__
 #ifndef __host__
@@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
   return u32fpDecode(x, 3);
 }

-inline __host__ __device__ uint64_t getHash(const char* string, int n) {
-  // Based on DJB2a, result = result * 33 ^ char
-  uint64_t result = 5381;
-  for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) ^ string[c];
+// The hash isn't just a function of the bytes but also where the bytes are split
+// into different calls to eatHash().
+inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
+  char const* ptr = (char const*)bytes;
+  acc[0] ^= size;
+  while (size != 0) {
+    // Mix the accumulator bits.
+    acc[0] += acc[1];
+    acc[1] ^= acc[0];
+    acc[0] ^= acc[0] >> 31;
+    acc[0] *= 0x9de62bbc8cef3ce3;
+    acc[1] ^= acc[1] >> 32;
+    acc[1] *= 0x485cd6311b599e79;
+    // Read in a chunk of input.
+    size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t);
+    uint64_t x = 0;
+    memcpy(&x, ptr, chunkSize);
+    ptr += chunkSize;
+    size -= chunkSize;
+    // Add to accumulator.
+    acc[0] += x;
   }
-  return result;
+}
+
+template<typename T>
+inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
+  eatHash(acc, (const void*)bytes, sizeof(T));
+}
+
+inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
+  uint64_t h = acc[0];
+  h ^= h >> 31;
+  h *= 0xbac3bd562846de6b;
+  h += acc[1];
+  h ^= h >> 32;
+  h *= 0x995a187a14e7b445;
+  return h;
+}
+
+inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
+  uint64_t acc[2] = {1, 1};
+  eatHash(acc, bytes, size);
+  return digestHash(acc);
+}
+template<typename T>
+inline __host__ __device__ uint64_t getHash(const T* bytes) {
+  return getHash((const void*)bytes, sizeof(T));
 }

 #endif
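The new bitops.h hashing above replaces the old DJB2a string hash with a streaming accumulate/digest pair. Because each `eatHash` call folds the chunk size into the accumulator (`acc[0] ^= size`), hashing "ab" and then "c" yields a different digest than hashing "abc" in one call, exactly as the comment warns. A hypothetical caller-side sketch (not part of the diff) that streams several fields and then digests; `PeerKey` and `hashPeerKey` are made up for illustration:

```
// Hypothetical usage sketch of the eatHash/digestHash API defined above.
#include <string.h>

struct PeerKey { uint64_t busId; int rank; };

uint64_t hashPeerKey(const struct PeerKey* key, const char* hostname) {
  uint64_t acc[2] = {1, 1};                  // same seed getHash() uses
  eatHash(acc, &key->busId);                 // templated overload, sizeof(T) bytes
  eatHash(acc, &key->rank);
  eatHash(acc, hostname, strlen(hostname));  // raw byte-range overload
  return digestHash(acc);                    // final 64-bit mix
}
```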
diff --git a/src/include/collectives.h b/src/include/collectives.h
index c82ebce..c68b041 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -10,6 +10,7 @@
 #include "nccl.h"
 #include "nccl_common.h"
 #include "device.h"
+
 #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 // CHUNKSIZE must be a multiple of SLICESIZE
@@ -382,6 +383,42 @@ public:
   ~RingBCAlgorithm() {}
 };

+#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#include <cuda/atomic>
+#endif
+
+// Need a power of two to ensure it divides by parallelFactor (which is also a power of two)
+#define NCCL_PAT_NWORKERS 512
+
+static constexpr int PatUsed = 0x1,
+                     PatSkipped = 0x2;
+
+struct ncclPatStep {
+  int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags;
+  size_t inpIx, outIx;
+};
+
+struct ncclPatPeer {
+  uint64_t step;
+  struct ncclConnInfo* conn;
+  struct ncclConnFifo* connFifo;
+  void* buff;
+  uint64_t *headPtr;
+  uint64_t *tailPtr;
+  uint64_t stepCache;
+  long long int accSize;
+  int connStepSize;
+};
+
+#define NCCL_SHMEM_PAT_STEPS 32
+struct ncclPatShmem {
+  struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS];
+  int parallelFactor;
+  long long int localAccSize;
+  struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks
+  struct ncclPatPeer recvDims[32];
+};
+
 template<typename T>
 class PatRSAlgorithm{
   size_t offset;
@@ -394,18 +431,17 @@ class PatRSAlgorithm{
   int nrPow2;
   int postFreq;
   int lastA;
-
+  int parallelFactor;
   int aggFactor;
   int as; // aggregated steps
   int a; // step inside aggregated step
   int sendSkipped; // number of skipped steps during aggregation
-  int recvSkipped; // number of skipped steps during aggregation
-  int phase2recv; // receive offset for phase 2
+  int stepOffset;
   int aggDelta;
   int scale;
   int phase;

-  __device__ __host__ int min(int a, int b) {
+  __device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
     return (a<b) ? a : b;
   }
@@ ... @@ class PatRSAlgorithm{
   __device__ __host__ void resetA() {
     a = 0;
-    sendSkipped = recvSkipped = 0;
+    sendSkipped = 0;
     lastA = aggFactor;
     if (phase >= 2) lastA /= 2*scale;
+    if (phase == 4) lastA = 1;
   }

   __device__ __host__ void reset() {
     nelem = getNelem();
     phase = 0;
     scale = 1;
-    phase2recv = 0;
     as = aggDelta - 1;
     resetA();
   }
@@ -465,8 +501,9 @@ class PatRSAlgorithm{
   }

 public:
-  __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
+  __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
     offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
+    parallelFactor = maxParallelFactor;
     aggDelta = nrPow2 = (1<<log2Up(nranks));
@@ ... @@
     while (d > 1 && aggFactor < nranks/2) {
       d /= 2;
@@ -486,160
+524,151 @@ public: reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - last = 0; - nelemOut = nelem; - outIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->outIx = offset; + ps->stepOffset = stepOffset; int skip = 0; - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; int sendDataRank = (rank + s) % nranks; - inpIx = sendDataRank * count + offset; - recvDim = -1; - sendDim = 0; - outIx = 0; - recvOffset = -1; - sendOffset = ((a - sendSkipped)%postFreq) * nelem; - sendStepOffset = 0; - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postSend = 1; + ps->inpIx = sendDataRank * count + offset; + ps->recvDim = -1; + ps->sendDim = 0; + ps->outIx = 0; + ps->recvOffset = -1; + ps->sendOffset = (a%postFreq) * nelem; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postSend = 1; } else { - postSend = 0; + ps->postSend = 0; } - postRecv = 0; - if (skip) sendSkipped++; - if (++a == lastA) { - phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2 - resetA(); - } - if (skip == 0) return; + ps->postRecv = 0; } else if (phase == 1) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - sendOffset = ((a - sendSkipped)%postFreq)*nelem; - recvOffset = ((a - recvSkipped)%postFreq)*nelem; - postSend = 0; - if (recvDim == 0) { - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1; - sendStepOffset = 0; + ps->recvDim = firstBitSet(s, nrPow2); + ps->sendOffset = (a%postFreq)*nelem; + ps->recvOffset = (a%postFreq)*nelem; + ps->postSend = 0; + if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postRecv = 1; } else { - sendStepOffset = (a - sendSkipped)/postFreq; + ps->postRecv = 0; } - if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postRecv = 1; - } else { - postRecv = 0; - } - s -= (1<recvDim); int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - if (sendDim == -1) { - sendOffset = -1; - sendStepOffset = 0; - } else if (as - (1<inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->sendDim == -1) { + ps->sendOffset = -1; + } else if (as - (1<recvDim) == 0) { + if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } int foffset = a - sendSkipped; - sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq; - sendOffset = (foffset%postFreq)*nelem; + ps->sendOffset = (foffset%postFreq)*nelem; } + int recvDim = ps->recvDim; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - as--; - phase = as % 2 == 1 ? 
0 : 1; - resetA(); - } - if (skip == 0) return; + if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++; } else if (phase == 2) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1; - postRecv = 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - recvDim = 0; - postSend = a == lastA-1 ? 1 : 0; + ps->recvDim = 0; + ps->postSend = a == lastA-1 ? 1 : 0; s -= 1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; + ps->recvDim = -1; + ps->recvOffset = -1; skip = 0; } else if (!skip) { - int foffset = phase2recv; - phase2recv++; - postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - recvOffset = (foffset%postFreq) * nelem; + int foffset = a + aggFactor - aggFactor/scale; + ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; } int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - int foffset = a - sendSkipped; - postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - sendStepOffset = 0; - sendOffset = (foffset%postFreq) * nelem; - if (skip || sendDim == -1) sendSkipped++; - if (++a == lastA) { - phase = 3; - resetA(); - } - if (skip == 0) return; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + int foffset = a; + ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->sendOffset = (foffset%postFreq) * nelem; } else if (phase == 3) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta; - postRecv = a == lastA-1 ? 1 : 0; + ps->postRecv = a == lastA-1 ? 1 : 0; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - postSend = 0; - s -= (1<recvDim = firstBitSet(s, nrPow2); + ps->postSend = 0; + s -= (1<recvDim); + int foffset = a; + ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a; + if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } foffset = a - sendSkipped; - sendStepOffset = foffset / postFreq; // Accumulate on next steps - sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1; - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - scale *= 2; - phase = scale < aggFactor ? 2 : 4; + if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++; + ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1; + } else if (phase == 4) { + ps->recvDim = 0; + ps->sendDim = -1; + ps->inpIx = rank * count + offset; + ps->recvOffset = ((aggFactor-1)%postFreq) * nelem; + ps->sendOffset = -1; + ps->postRecv = 1; + ps->postSend = 0; + offset += chunkCount; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 1) as--; + if (p == 3) scale *= 2; + phase = + p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + p == 2 ? 3 : + p == 3 ? scale < aggFactor ? 
2 : 4 : + 5; + if (p == 4) { + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; - } else if (phase == 4) { - recvDim = 0; - sendDim = -1; - inpIx = rank * count + offset; - recvOffset = (phase2recv%postFreq) * nelem; - sendStepOffset = 0; - sendOffset = -1; - postRecv = 1; - postSend = 0; - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; + } else if (phase == 4 && offset >= end) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; @@ -655,14 +684,12 @@ class PatAGAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int aggDelta; - int scale; - int phase; // AS computation @@ -671,7 +698,7 @@ class PatAGAlgorithm{ int bitCount[32]; int bitZeroStep[32]; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a 1 && aggFactor < nranks/2) { d /= 2; aggFactor *= 2; aggDelta /= 2; } - //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta); asDim = log2Up(aggDelta); reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - last = 0; - nelemOut = nelem; - inpIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->inpIx = offset; int skip = 0; - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0; int recvDataRank = (rank + s) % nranks; - outIx = recvDataRank * count + offset; - sendDim = -1; - recvDim = 0; - inpIx = 0; - sendOffset = -1; - recvOffset = (a % postFreq) * nelem; - recvStepOffset = 0; - postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postSend = 0; - a++; - if (nextSkip) { - as = nextAs(); - if (as == aggDelta/2) { - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; - } - phase = 1; - resetA(); - } - if (skip == 0) return; + ps->outIx = recvDataRank * count + offset; + ps->sendDim = -1; + ps->recvDim = 0; + ps->inpIx = 0; + ps->sendOffset = -1; + ps->recvOffset = (a % postFreq) * nelem; + ps->stepOffset = 0; + ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postSend = 0; } else if (phase == 1) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - sendOffset = recvOffset = (a % postFreq) * nelem; - postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 
0 : a/postFreq; - if (recvDim == -1) { - recvOffset = -1; - postRecv = 0; - } else if (as - (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<= nranks) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem; + ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq; + if (ps->recvDim == -1) { + ps->recvOffset = -1; + ps->postRecv = 0; + } else if (as - (1<sendDim) == 0) { + int foffset = (a*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<recvDim) >= nranks) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq; } - if (s < nranks && sendDim == 0 && skip) { + if (s < nranks && ps->sendDim == 0 && skip) { // Don't forget to receive at least once even if we don't send afterwards - sendDim = -1; - sendOffset = -1; - postSend = 0; + ps->sendDim = -1; + ps->sendOffset = -1; + ps->postSend = 0; skip = 0; } - if (++a == lastA) { - if (as % 2 == 1) { - phase = 0; - } else { - as = nextAs(); - } - resetA(); - } - if (skip == 0) return; } else if (phase == 2) { int s = (2*a+1)*scale*aggDelta; - postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; - postRecv = 0; + ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); + ps->sendOffset = (a%postFreq) * nelem; + ps->stepOffset = a / postFreq; int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - if (recvDim == -1) { - recvOffset = -1; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->recvDim == -1) { + ps->recvOffset = -1; } else { - s -= (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - recvStepOffset = foffset / postFreq; + s -= (1<recvDim); + int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->stepOffset = foffset / postFreq; } - if (++a == lastA) { - scale /= 2; - phase = scale ? 2 : 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 2) scale /= 2; + phase = + p == 2 ? scale ? 2 : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + 1; + if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs(); + if (p == 0 && as == aggDelta/2) { + offset += chunkCount; + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; + } else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? 
PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; #endif diff --git a/src/include/comm.h b/src/include/comm.h index c3f4eb4..4095187 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -131,6 +131,9 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int persistentRefs; + cudaEvent_t launchEvent, scratchEvent; /* proxy related shared res */ struct ncclProxyState* proxyState; @@ -407,6 +410,7 @@ struct ncclComm { // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; + struct ncclCudaContext* context; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; @@ -419,6 +423,7 @@ struct ncclComm { int netPluginLoaded; ncclNet_t* ncclNet; + int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; @@ -426,6 +431,7 @@ struct ncclComm { uint64_t* connectSend; uint64_t* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported bool directMode; @@ -565,8 +571,7 @@ struct ncclComm { struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int localPersistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -603,6 +608,7 @@ struct ncclComm { // Profiler plugin void* profilerContext; uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; + struct ncclProfilerProxy profiler; // buffer registration cache struct ncclRegCache regCache; diff --git a/src/include/device.h b/src/include/device.h index 3f918ab..0763a57 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -133,6 +133,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; + int hasSeen; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -374,6 +375,7 @@ struct alignas(16) ncclDevChannel { struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed + uint64_t workCounter; }; struct ncclDevComm { @@ -396,6 +398,10 @@ struct ncclDevComm { // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; int* rankToLocalRank; + + // Profiler counters + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -468,7 +474,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; + return cudaArch >= 800 ? (cudaArch == 1200 ? 
6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } diff --git a/src/include/graph.h b/src/include/graph.h index a22b62b..b779773 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -36,7 +36,13 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +enum ncclTopoGdrMode { + ncclTopoGdrModeDisable = 0, + ncclTopoGdrModeDefault = 1, + ncclTopoGdrModePci = 2, + ncclTopoGdrModeNum = 3 +}; +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); @@ -55,9 +61,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_VENDOR_MIXED 4 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2 +#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3 +#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4 +#define NCCL_TOPO_CPU_MODEL_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); diff --git a/src/include/group.h b/src/include/group.h index 91bc190..c06d1ef 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -112,6 +112,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; + + // didn't find its clique, we need to insert it with ascending order based on commHash + if (*pp == nullptr) { + pp = &ncclGroupCommHead; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + } comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h deleted file mode 100644 index f165aa1..0000000 --- a/src/include/nccl_net.h +++ /dev/null @@ -1,604 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 -//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties -#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) -#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -// Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 - -typedef struct { - int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; -} ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int forceFlush; // Force a flush on receives - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - ncclNetVDeviceProps_v9_t vProps; - size_t maxP2pBytes; // Max transfer size for point-to-point operations - size_t maxCollBytes; // Max transfer size for collective operations -} ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclNet_v9_t; - -typedef ncclNet_v9_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 - -typedef struct { - void* mhandle; - void* address; - size_t size; -} ncclNetSGE_v9_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclCollNet_v9_t; - -typedef ncclCollNet_v9_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v8_t; - -typedef struct { - void* mhandle; - void* address; - uint32_t size; -} ncclNetSGE_v8_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. 
handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v8_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. 
rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h deleted file mode 100644 index a8164d0..0000000 --- a/src/include/nccl_profiler.h +++ /dev/null @@ -1,235 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include - -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - const char* func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - const char* datatype; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - const char* algo; - const char* proto; - } coll; - - struct { - const char* name; - uint64_t commHash; - const char* func; - void* buff; - const char* datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v2_t; - -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - 
// - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; - -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - uint8_t func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - uint8_t datatype; - uint32_t op; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; - } coll; - - struct { - const char* name; - uint64_t commHash; - uint8_t func; - void* buff; - uint8_t datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v1_t; - -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - // - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v1_t; - -#endif diff --git a/src/include/nccl_tuner.h 
b/src/include/nccl_tuner.h deleted file mode 100644 index 6e61118..0000000 --- a/src/include/nccl_tuner.h +++ /dev/null @@ -1,149 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - regBuff: can register user buffer - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int regBuff, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; - -typedef ncclTuner_v4_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
- // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this time - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the give collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int* algorithm, int* protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. 
- // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; - -#endif diff --git a/src/include/net.h b/src/include/net.h index d1926cc..afc2d16 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index 5fae9b5..c3a79e3 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 5d00f07..2c18b36 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,9 +31,10 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h new file mode 100644 index 0000000..d57aad5 --- /dev/null +++ b/src/include/plugin/nccl_net.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 + +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
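+// Illustrative note: NCCL core falls back to caps such as MAX_NET_SIZE above
+// and MAX_COLLNET_SIZE below for plugins that predate dynamically queried
+// limits; a v10 plugin reports its own limits from getProperties(), e.g.
+// (hypothetical values):
+//   props->maxP2pBytes  = 1ULL << 30;  // 1 GiB
+//   props->maxCollBytes = 1ULL << 29;  // 512 MiB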
+#define MAX_COLLNET_SIZE (512*1024*1024L) // Set for initial collnet plugins when size was not dynamically queried
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 32
+
+// Max number of ncclNet objects which can live in the same process
+#define NCCL_NET_MAX_PLUGINS 3
+
+// NCCL core profiler callback for network defined events instrumentation
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
+
+#include "net/net_v10.h"
+#include "net/net_v9.h"
+#include "net/net_v8.h"
+#include "net/net_v7.h"
+#include "net/net_v6.h"
+
+typedef ncclNet_v10_t ncclNet_t;
+typedef ncclCollNet_v10_t ncclCollNet_t;
+typedef ncclNetSGE_v10_t ncclNetSGE_t;
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10
+
+#endif // end include guard
diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h
new file mode 100644
index 0000000..34cf9a9
--- /dev/null
+++ b/src/include/plugin/nccl_profiler.h
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+enum {
+  ncclProfileGroup = (1 << 0),     // group event type
+  ncclProfileColl = (1 << 1),      // host collective call event type
+  ncclProfileP2p = (1 << 2),       // host point-to-point call event type
+  ncclProfileProxyOp = (1 << 3),   // proxy operation event type
+  ncclProfileProxyStep = (1 << 4), // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5), // proxy control event type
+  ncclProfileKernelCh = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin = (1 << 7), // network plugin-defined events
+};
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_t;
+
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+
+#include 
+#include "profiler/profiler_v3.h"
+#include "profiler/profiler_v2.h"
+#include "profiler/profiler_v1.h"
+
+typedef ncclProfiler_v3_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
+
+#define NCCL_PROFILER_NET_VER_BITS (16)
+#define 
NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h new file mode 100644 index 0000000..f240189 --- /dev/null +++ b/src/include/plugin/nccl_tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +#include "tuner/tuner_v4.h" +#include "tuner/tuner_v3.h" +#include "tuner/tuner_v2.h" + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h new file mode 100644 index 0000000..ada6d48 --- /dev/null +++ b/src/include/plugin/net/net_v10.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
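+  // For reference, a minimal caller-side sketch (illustrative only; NCCL core
+  // interleaves these retries with other progress work rather than spinning):
+  //   void* sendComm = NULL;
+  //   ncclNetDeviceHandle_v10_t* sendDevComm = NULL;
+  //   do {
+  //     NCCLCHECK(net->connect(dev, &config, handle, &sendComm, &sendDevComm));
+  //   } while (sendComm == NULL);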
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v10_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclCollNet_v10_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v6.h b/src/include/plugin/net/net_v6.h new file mode 100644 index 0000000..99445ce --- /dev/null +++ b/src/include/plugin/net/net_v6.h @@ -0,0 +1,113 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V6_H_ +#define NET_V6_H_ + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. +} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +#endif diff --git a/src/include/plugin/net/net_v7.h b/src/include/plugin/net/net_v7.h new file mode 100644 index 0000000..e9b19de --- /dev/null +++ b/src/include/plugin/net/net_v7.h @@ -0,0 +1,120 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V7_H_ +#define NET_V7_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
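+  // Illustrative sketch (hypothetical values): a simple single-port plugin
+  // might fill the v7 properties as
+  //   props->name = "example0";
+  //   props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA;
+  //   props->speed = 100000;  // 100 Gbps port, expressed in Mbps
+  //   props->maxComms = 1024; props->maxRecvs = 1;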
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +#endif diff --git a/src/include/plugin/net/net_v8.h b/src/include/plugin/net/net_v8.h new file mode 100644 index 0000000..a178132 --- /dev/null +++ b/src/include/plugin/net/net_v8.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V8_H_ +#define NET_V8_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  uint32_t size;
+} ncclNetSGE_v8_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+ // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v8_t; + +#endif diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h new file mode 100644 index 0000000..ce9d917 --- /dev/null +++ b/src/include/plugin/net/net_v9.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V9_H_ +#define NET_V9_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // at which index the new vNIC exists
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
+} ncclNet_v9_t;
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  size_t size;
+} ncclNetSGE_v9_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
+      size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+      void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
+      size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+      ncclDataType_t dataType, ncclRedOp_t redOp,
+      void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete.
If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclCollNet_v9_t; + +#endif // end include guard diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h new file mode 100644 index 0000000..7336c34 --- /dev/null +++ b/src/include/plugin/plugin.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PLUGIN_H_ +#define NCCL_PLUGIN_H_ + +#include "nccl.h" + +void* ncclOpenNetPluginLib(const char* name); +void* ncclOpenTunerPluginLib(const char* name); +void* ncclOpenProfilerPluginLib(const char* name); +void* ncclGetNetPluginLib(void); +ncclResult_t ncclClosePluginLib(void* handle); + +#endif diff --git a/src/include/plugin/profiler/net_ib.h b/src/include/plugin/profiler/net_ib.h new file mode 100644 index 0000000..2ac6d5c --- /dev/null +++ b/src/include/plugin/profiler/net_ib.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_H_ +#define NET_IB_H_ + +#include "nccl_profiler.h" +#include "net_ib_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_ib_v1.h b/src/include/plugin/profiler/net_ib_v1.h new file mode 100644 index 0000000..f142de5 --- /dev/null +++ b/src/include/plugin/profiler/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/net_socket.h b/src/include/plugin/profiler/net_socket.h new file mode 100644 index 0000000..9f57496 --- /dev/null +++ b/src/include/plugin/profiler/net_socket.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_H_ +#define NET_SOCKET_H_ + +#include "nccl_profiler.h" +#include "net_socket_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_socket_v1.h b/src/include/plugin/profiler/net_socket_v1.h new file mode 100644 index 0000000..0cb664f --- /dev/null +++ b/src/include/plugin/profiler/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v1.h b/src/include/plugin/profiler/profiler_v1.h new file mode 100644 index 0000000..3b67102 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v1.h @@ -0,0 +1,107 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+  void* parentObj;  // pointer to the profiler parent object (for coll this is the group)
+  int rank;         // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;          // pid of the originating process
+      uint8_t channelId;  // channel id for this proxy operation
+      int peer;           // remote rank for send/recv
+      int nSteps;         // number of steps for this proxy operation
+      int chunkSize;      // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
diff --git a/src/include/plugin/profiler/profiler_v2.h b/src/include/plugin/profiler/profiler_v2.h
new file mode 100644
index 0000000..146152a
--- /dev/null
+++ b/src/include/plugin/profiler/profiler_v2.h
@@ -0,0 +1,104 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+typedef struct {
+  uint8_t type;  // event type descriptor: ncclProfileColl, ...
+  void* parentObj;  // pointer to the profiler parent object (for coll this is the group)
+  int rank;         // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;          // pid of the originating process
+      uint8_t channelId;  // channel id for this proxy operation
+      int peer;           // remote rank for send/recv
+      int nSteps;         // number of steps for this proxy operation
+      int chunkSize;      // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
diff --git a/src/include/plugin/profiler/profiler_v3.h b/src/include/plugin/profiler/profiler_v3.h
new file mode 100644
index 0000000..10c5059
--- /dev/null
+++ b/src/include/plugin/profiler/profiler_v3.h
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+typedef struct {
+  uint8_t type;  // event type descriptor: ncclProfileColl, ...
+  void* parentObj;  // pointer to the profiler parent object (for coll this is the group)
+  int rank;         // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;          // pid of the originating process
+      uint8_t channelId;  // channel id for this proxy operation
+      int peer;           // remote rank for send/recv
+      int nSteps;         // number of steps for this proxy operation
+      int chunkSize;      // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v2.h b/src/include/plugin/tuner/tuner_v2.h
new file mode 100644
index 0000000..ec96f60
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v2.h
@@ -0,0 +1,53 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V2_H_
+#define TUNER_V2_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator.
Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - collNetTypeSupport: whether collnet supports this type
+  //   - nvlsTypeSupport: whether nvlink sharp supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set the
+  // algorithm and protocol together, but not only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int* algorithm, int* protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v2_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v3.h b/src/include/plugin/tuner/tuner_v3.h
new file mode 100644
index 0000000..4fa10e8
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v3.h
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V3_H_
+#define TUNER_V3_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
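+  //
+  //                    As an illustrative sketch only (not part of the API contract): a tuner
+  //                    that wants to force ring/simple whenever NCCL core considers that entry
+  //                    valid could rewrite the table as below, using the NCCL_ALGO_RING,
+  //                    NCCL_PROTO_SIMPLE and NCCL_NUM_PROTOCOLS constants from NCCL core:
+  //
+  //                      float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  //                      if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE)
+  //                        table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; // lowest cost wins
+  //
+  //                    Entries left untouched keep the cost estimated by NCCL core.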
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set the
+  // algorithm and protocol together, but not only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v4.h b/src/include/plugin/tuner/tuner_v4.h
new file mode 100644
index 0000000..a4b38a0
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v4.h
@@ -0,0 +1,56 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V4_H_
+#define TUNER_V4_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: whether the user buffer can be registered
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set the
+  // algorithm and protocol together, but not only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2b7efe0..8d41079 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -17,6 +17,18 @@ struct ncclTaskP2p; struct ncclInfo; struct ncclComm; struct ncclProxyOp; +struct ncclProxyConnector; + +struct ncclProfilerProxy { + bool initialized; + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; + uint64_t workCounter[MAXCHANNELS]; // host work counter + struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; + struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; +}; + +extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); @@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); +// Kernel Channel Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); + // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); @@ -51,5 +67,9 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); + +// Profiler callback for network plugin +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c97a4d7..225acb2 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,7 +32,8 @@ typedef enum : uint8_t { ncclPatternPatUp, ncclPatternPatDown, ncclPatternSend, - ncclPatternRecv + ncclPatternRecv, + ncclPatternProfiler, } ncclPattern_t; enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; @@ -93,6 +94,7 @@ struct ncclProxyOp { int peer; pid_t pid; void* profilerContext; + uint64_t workCounter; struct ncclProxyOp *enqNext; }; @@ -129,12 +131,15 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; + uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; + void* kernelEventHandle; void* stepEventHandles[NCCL_STEPS]; size_t transSize; + uint64_t workCounter; void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; diff --git a/src/include/ras.h b/src/include/ras.h index 7909b3d..d27a543 100644 --- a/src/include/ras.h +++ b/src/include/ras.h @@ -15,6 +15,8 @@ struct rasRankInit { pid_t pid; int cudaDev; int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; }; ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); diff --git a/src/include/register.h b/src/include/register.h index 740a645..143f41b 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -42,7 +42,7 @@ struct ncclReg { uintptr_t baseAddr; size_t baseSize; CUdeviceptr regAddr; - size_t regSize; + size_t regUCSize, regMCSize; int dev; 
 CUmemGenericAllocationHandle mcHandle;
 uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
diff --git a/src/include/shm.h b/src/include/shm.h
index b519e5d..223d873 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -14,7 +14,6 @@ struct shmCuIpc {
     CUmemFabricHandle handle;
     CUmemGenericAllocationHandle data;
   };
-  int tpProxyRank;
   void *ptr;
   size_t size;
 };
@@ -30,8 +29,8 @@ struct shmIpcDesc {
 
 typedef struct shmIpcDesc ncclShmIpcDesc_t;
 
-ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
-ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
+ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
 
 ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
 
 #endif
diff --git a/src/include/socket.h b/src/include/socket.h
index f0a3237..ffa1480 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
 ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
-ncclResult_t ncclSocketClose(struct ncclSocket* sock);
+ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
 #endif
diff --git a/src/include/strongstream.h b/src/include/strongstream.h
index 0984dfe..c56d5ac 100644
--- a/src/include/strongstream.h
+++ b/src/include/strongstream.h
@@ -10,13 +10,24 @@
 
 #include "nccl.h"
 #include "checks.h"
+#include <cuda.h>
+#include <pthread.h>
 #include <cuda_runtime.h>
 
+// ncclCudaContext: wraps a CUDA context with per-context state.
+struct ncclCudaContext;
+
+// Get a ncclCudaContext to track the currently active CUDA context.
+ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out);
+// Drop reference.
+void ncclCudaContextDrop(struct ncclCudaContext* cxt);
+
 /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
  * easily. */
 struct ncclCudaGraph {
 #if CUDART_VERSION >= 11030
+  cudaStream_t origin;
   cudaGraph_t graph;
   unsigned long long graphId;
 #endif
@@ -25,6 +36,7 @@ struct ncclCudaGraph {
 inline struct ncclCudaGraph ncclCudaGraphNone() {
   struct ncclCudaGraph tmp;
 #if CUDART_VERSION >= 11030
+  tmp.origin = nullptr;
   tmp.graph = nullptr;
   tmp.graphId = ULLONG_MAX;
 #endif
@@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() {
 inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
 #if CUDART_VERSION >= 11030
-  return graph.graph != nullptr;
+  return graph.graphId != ULLONG_MAX;
 #else
   return false;
 #endif
@@ -57,60 +69,37 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
 * streams unfit for the use of serializing access to a persistent resource.
 * Strong streams have been introduced to address this need.
 *
- * - All updates to a strong stream must be enclosed by a Acquire/Release pair.
+ * All updates to a strong stream must be enclosed by an Acquire/Release pair.
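+ *
+ * A minimal usage sketch (illustrative only; it assumes a constructed strong
+ * stream `ss` with a single user, so `concurrent` is false, and relies on the
+ * Acquire/Release declarations below):
+ *
+ *   cudaStream_t work;
+ *   NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &ss, false, &work));
+ *   ... enqueue kernels/copies on `work` ...
+ *   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &ss, false));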
 *
- * - The Acquire, Release, and all updates take a ncclCudaGraph parameter
- *   indicating the currently capturing graph (or none). This parameter must be
- *   the same for the entire sequence of {Acquire; ...; Release}.
+ * Acquire retrieves a "work" stream (cudaStream_t) which may be used to add
+ * work.
 *
- * - An {Acquire; ...; Release} sequence must not be concurrent with any
- *   other operations against the strong stream including graph launches which
- *   reference this stream.
+ * Release publishes the work stream's work into the strong stream. The Release
+ * must be issued by the same thread that did the Acquire.
 */
 struct ncclStrongStream;
 
 ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
 ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
 
-// Acquire-fence the strong stream.
+// Acquire the strong stream. Upon return `*workStream` will be usable to add work.
+// `concurrent` indicates if other threads may be using the strong stream.
 ncclResult_t ncclStrongStreamAcquire(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );
 
-// Acquire-fence the strong stream assuming no graph is capturing. This permits
-// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA
-// calls. Strong stream still must be released via:
-//   ncclStrongStreamRelease(ncclCudaGraphNone(), ss);
-ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
-
-// Release-fence of the strong stream.
-ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
-
-// Add a host launch to the stream.
-ncclResult_t ncclStrongStreamLaunchHost(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  cudaHostFn_t fn, void* arg
-);
-// Add a kernel launch to the stream.
-ncclResult_t ncclStrongStreamLaunchKernel(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
+// Get the workStream for an already acquired strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamAcquiredWorkStream(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );
 
-// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
-// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
-// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
-// implementation to induce few graph dependencies.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
-);
-// `b` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
-);
-// `a` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
+// Release of the strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent);
+
+ncclResult_t ncclStreamWaitStream(
+  cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent
 );
 
 // Synchronization does not need the strong stream to be acquired.
@@ -118,23 +107,28 @@ ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);
 
 ////////////////////////////////////////////////////////////////////////////////
 
-struct ncclStrongStreamGraph; // internal to ncclStrongStream
+struct ncclStrongStreamCapture; // internal to ncclStrongStream
 
 struct ncclStrongStream {
-  // Used when not graph capturing.
-  cudaStream_t cudaStream;
+  // The stream to use for non-captured work.
+  cudaStream_t liveStream;
+  void* liveAcquiredBy;
 #if CUDART_VERSION >= 11030
+  // This stream ever appeared in a graph capture.
+  bool everCaptured;
+  pthread_mutex_t lock;
+  struct ncclStrongStreamCapture* captureHead;
   // The event used to establish order between graphs and streams. During acquire
   // this event is waited on, during release it is recorded to.
   cudaEvent_t serialEvent;
-  // This stream ever appeared in a graph capture.
-  bool everCaptured;
-  // Tracks whether serialEvent needs to be recorded to upon Release().
-  bool serialEventNeedsRecord;
-  struct ncclStrongStreamGraph* graphHead;
-#else
-  cudaEvent_t scratchEvent;
 #endif
 };
 
+struct ncclCudaContext {
+  struct ncclCudaContext* next;
+  CUcontext hcontext;
+  int refCount;
+  struct ncclStrongStream launchOrder;
+};
+
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 37187f6..c563fbb 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -18,6 +18,7 @@
 #define TRANSPORT_SHM 1
 #define TRANSPORT_NET 2
 #define TRANSPORT_COLLNET 3
+#define TRANSPORT_PROFILER 4
 
 #include "proxy.h"
 #include "comm.h"
@@ -26,6 +27,7 @@ extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
 extern struct ncclTransport netTransport;
 extern struct ncclTransport collNetTransport;
+extern struct ncclTransport profilerTransport;
 
 extern struct ncclTransport* ncclTransports[];
 
 // Forward declarations
@@ -65,8 +67,10 @@ struct ncclNvlsSharedRes {
   CUmulticastObjectProp signalProp;
   CUmemAccessDesc accessDesc;
   int dev;
-  size_t buffSize;
-  size_t creditSize;
+  size_t creditUCSize;
+  size_t creditMCSize;
+  size_t buffUCSize;
+  size_t buffMCSize;
   CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
   CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
   char* mcBuff; // Multicast NVLS buffer address
@@ -123,7 +127,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
 ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
 ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts);
 ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/init.cc b/src/init.cc index 3e218ab..46b02e6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -51,17 +51,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); -static uint64_t hashUniqueId(ncclUniqueId const &id) { - char const *bytes = (char const*)&id; - uint64_t h = 0xdeadbeef; - for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -111,7 +100,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { memset(out, 0, sizeof(*out)); // copy to avoid alignment mismatch memcpy(out, &handle, sizeof(handle)); - TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES)); return ncclSuccess; } @@ -232,6 +221,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } @@ -268,6 +259,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); + + ncclCudaContextDrop(comm->context); + free(comm); return ncclSuccess; @@ -309,17 +303,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); - if (ret != ncclSuccess) { - /* if ret is not ncclInProgress, we just keep it. */ + if (ret == ncclInProgress) { WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); - if (ret == ncclInProgress) ret = ncclInvalidArgument; + ret = ncclInvalidArgument; goto exit; } - /* if there is linked group job, we should complete it. */ - if (comm->groupJob) { - NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); - comm->groupJob = NULL; - } + /* if ret is not ncclInProgress, we just keep it. */ } exit: @@ -357,6 +346,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // the device we're on (failure cause #1) , better know it early. 
CUDACHECK(cudaGetDevice(&comm->cudaDev)); + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; @@ -396,6 +387,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { @@ -437,13 +430,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; + cudaStream_t deviceStream; - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; @@ -494,10 +488,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoConsumedLeast = 0; tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + // Alloc profiler counters for the kernel + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail); + tmpCommAndChans.comm.workStarted = comm->profiler.workStarted; + tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted; + ncclCommPushCudaHostFree(comm, comm->profiler.workStarted); + ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted); + if (comm->collNetDenseToUserRank != nullptr) { - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail); } for (int 
c=0; c < MAXCHANNELS; c++) { @@ -510,14 +512,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; if (comm->channels[c].ring.userRanks != nullptr) { - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail); } } - NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail); exit: + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; @@ -1000,6 +1002,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic); } + comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; @@ -1376,12 +1379,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), - // add unique split counter and the color - ncclUniqueId tmpId; - memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); - comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); + // child hash obtained from (parent hash, split count, color) + uint64_t hacc[2] = {1, 1}; + eatHash(hacc, &job->parent->commHash); + eatHash(hacc, &job->splitCount); + eatHash(hacc, &job->color); + comm->commHash = digestHash(hacc); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1394,8 +1397,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // obtain a unique hash using the first commId - comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); - commIdHash = hashUniqueId(job->commId[0]); + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, 
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1610,6 +1612,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1618,6 +1621,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; + comm->config.trafficClass = internalConfigPtr->trafficClass; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); @@ -1642,6 +1646,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId const char* commIdEnv = NULL; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob* job = NULL; + bool launchedJob = false; // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); @@ -1695,12 +1700,13 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // start the bootstrap root before bootstrapping, use only the first handle NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail); } + launchedJob = true; NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: - if (job) ncclCommInitJobFree(job); + if (job && !launchedJob) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1896,7 +1902,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. - while (comm->persistentRefs != 0) { + while (comm->localPersistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) { @@ -1964,7 +1970,6 @@ exit: } return ret; fail: - free(job); if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -2215,6 +2220,11 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); + /* if there is a linked group job, we should complete it.
*/ + if (*asyncError == ncclSuccess && comm->groupJob) { + NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); + comm->groupJob = NULL; + } return ncclSuccess; } @@ -2265,16 +2275,13 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { #if CUDART_VERSION >= 12010 size_t memGran = 0; - size_t mcGran = 0; CUdevice currentDev; CUmemAllocationProp memprop = {}; - CUmulticastObjectProp mcprop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag; int dcnt; - int mcSupport = 0; if (ptr == NULL || size == 0) goto fallback; @@ -2284,6 +2291,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGet(&currentDev, cudaDev)); if (ncclCuMemEnable()) { + size_t handleSize = size; int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; @@ -2299,40 +2307,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CUDACHECK(cudaGetDeviceCount(&dcnt)); - - if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { - /* mc property */ - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); - } else { - ALIGN_SIZE(size, memGran); - } + ALIGN_SIZE(handleSize, memGran); if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } } else { /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ for (int i = 0; i < dcnt; ++i) { int p2p = 0; @@ -2340,7 +2333,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = i; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); } if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } diff --git
a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 23746b3..3e9dfcd 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, } control_un; struct cmsghdr *cmptr; - char dummy_buffer[1]; + char dummy_buffer[1] = {'\0'}; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to @@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); if (sendFd != -1) { + memset(&control_un, '\0', sizeof(control_un)); msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); diff --git a/src/misc/param.cc b/src/misc/param.cc index eb50cfe..d7c324f 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) { size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { + if (line[0] == '#') continue; if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index dfb4e68..731dbce 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -171,6 +171,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memset(addrs+found, '\0', sizeof(*addrs)); memcpy(addrs+found, interface->ifa_addr, salen); found++; } @@ -905,9 +906,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { if (sock != NULL) { if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { + if (wait) { + char data; + int closed = 0; + do { + int offset = 0; + if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break; + } while (closed == 0); + } /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 61b0e4b..e6cce98 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,28 +9,61 @@ #include "checks.h" #include "param.h" -// Tracks the chain of graph nodes for a given graph captured identified by -// its graph id. This state has to live for as long as captured work is being -// submitted. CUDA doesn't have mechanism to inform us when the user ends capture -// so the best we can do is get notified when the graph is destroyed. -struct ncclStrongStreamGraph { - struct ncclStrongStreamGraph* next; - // Atomically exchanged to false by both the main thread or the graph destructor - // callback. The last to arrive deletes the node. - bool alive; +// Tracks the work captured into a given graph, identified by its graph id. +struct ncclStrongStreamCapture { + struct ncclStrongStreamCapture* next; + cudaGraph_t graph; unsigned long long graphId; - // For each graph we track the "tip" of the chain of graph nodes.
A linear - // chain would always have just one node at its tip, but since we have to merge - // in chains from other streams (via ncclStrongStreamWaitStream) some spots - // in the chain can be wider than a single node and thus need a list, so we - // maintain a dynamically sized array of tip nodes. - int tipCount, tipCapacity; - cudaGraphNode_t* tipNodes; + cudaStream_t captureStream; + cudaGraphNode_t lastRecord; + void* acquiredBy; }; -static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { - free(g->tipNodes); - free(g); +//////////////////////////////////////////////////////////////////////////////// + +static ncclCudaContext* cxtListHead = nullptr; +static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { + ncclResult_t result = ncclSuccess; + CUcontext hcontext; + CUCHECK(cuCtxGetCurrent(&hcontext)); + + pthread_mutex_lock(&cxtListLock); + struct ncclCudaContext* p = cxtListHead; + while (1) { + if (p == nullptr) { + p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext)); + p->refCount = 1; + p->hcontext = hcontext; + p->next = cxtListHead; + cxtListHead = p; + NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave); + break; + } + if (p->hcontext == hcontext) { + p->refCount += 1; + break; + } + p = p->next; + } +leave: + pthread_mutex_unlock(&cxtListLock); + *out = p; + return ncclSuccess; +} + +void ncclCudaContextDrop(struct ncclCudaContext* cxt) { + pthread_mutex_lock(&cxtListLock); + if (0 == --cxt->refCount) { + struct ncclCudaContext** pp = &cxtListHead; + while (*pp != cxt) pp = &(*pp)->next; + *pp = cxt->next; // remove from list + // Destroy resources held in cxt + ncclStrongStreamDestruct(&cxt->launchOrder); + free(cxt); + } + pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// @@ -43,9 +76,9 @@ ncclResult_t ncclCudaGetCapturingGraph( NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, nullptr)); #if CUDART_VERSION >= 11030 + graph->origin = nullptr; graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif @@ -56,13 +89,14 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { + graph->origin = nullptr; graph->graph = nullptr; - gid = ULLONG_MAX; + graph->graphId = ULLONG_MAX; + } else { + graph->origin = stream; } - graph->graphId = gid; #endif } #endif @@ -86,315 +120,218 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; - ss->serialEventNeedsRecord = false; - 
ss->graphHead = nullptr; - #else - CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); + ss->captureHead = nullptr; + pthread_mutex_init(&ss->lock, nullptr); + CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } -static void graphDestructor(void* arg) { - struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } -} - ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamDestroy(ss->cudaStream)); + CUDACHECK(cudaStreamDestroy(ss->liveStream)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventDestroy(ss->serialEvent)); - // Delete list of per-graph chains. - struct ncclStrongStreamGraph* g = ss->graphHead; - while (g != nullptr) { - struct ncclStrongStreamGraph* next = g->next; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } - g = next; + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap) { + struct ncclStrongStreamCapture* next = cap->next; + CUDACHECK(cudaStreamDestroy(cap->captureStream)); + free(cap); + cap = next; } - #else - CUDACHECK(cudaEventDestroy(ss->scratchEvent)); + CUDACHECK(cudaEventDestroy(ss->serialEvent)); + pthread_mutex_destroy(&ss->lock); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) +NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1); +constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device."; -static void ensureTips(struct ncclStrongStreamGraph* g, int n) { - if (g->tipCapacity < n) { - g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); - g->tipCapacity = n; - } -} +static __thread char threadIdMarker; +static void* localThreadId() { return &threadIdMarker; } ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (graph.graph == nullptr) { - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + ss->liveAcquiredBy = localThreadId(); + if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); } } else { - ss->everCaptured = true; - // Find the current graph in our list of graphs if it exists. - struct ncclStrongStreamGraph** pg = &ss->graphHead; - struct ncclStrongStreamGraph* g; - while (*pg != nullptr) { - g = *pg; - if (g->graphId == graph.graphId) { - // Move to front of list so that operations after acquire don't have to search the list. - *pg = g->next; - g->next = ss->graphHead; - ss->graphHead = g; + bool firstCapture = !ss->everCaptured; + __atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED); + + ncclResult_t ret = ncclSuccess; + if (concurrent) pthread_mutex_lock(&ss->lock); + + // Look for capture in our list of active captures. 
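The comment above introduces the list walk that follows. Condensed to its essentials, the lookup-or-recycle logic looks like the sketch below; Capture is a simplified stand-in for ncclStrongStreamCapture, and locking plus error handling are elided.

```
#include <cuda_runtime.h>
#include <cstdlib>

// Simplified stand-in for ncclStrongStreamCapture.
struct Capture {
  Capture* next;
  unsigned long long graphId;
  cudaStream_t captureStream;
};

// Walk the list: reuse a live entry for this graph id, unlink entries
// whose capture has ended (keeping one as a spare for reuse), and let
// the caller allocate or recycle *spare when nothing matches.
Capture* findOrRecycle(Capture** phead, unsigned long long graphId, Capture** spare) {
  Capture** pcap = phead;
  while (*pcap != nullptr) {
    Capture* cap = *pcap;
    if (cap->graphId == graphId) return cap;          // live match
    cudaStreamCaptureStatus st;
    cudaStreamIsCapturing(cap->captureStream, &st);
    if (st == cudaStreamCaptureStatusActive) {
      pcap = &cap->next;                              // unrelated live capture
    } else {
      *pcap = cap->next;                              // capture ended: unlink
      if (*spare == nullptr) { *spare = cap; }
      else { cudaStreamDestroy(cap->captureStream); free(cap); }
    }
  }
  return nullptr;
}
```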
+ struct ncclStrongStreamCapture** pcap = &ss->captureHead; + struct ncclStrongStreamCapture* cap; + struct ncclStrongStreamCapture* spare = nullptr; + while (*pcap != nullptr) { + cap = *pcap; + if (cap->graphId == graph.graphId) { // Capture node already exists. + *workStream = cap->captureStream; + cap->acquiredBy = localThreadId(); + if (concurrent) pthread_mutex_unlock(&ss->lock); return ncclSuccess; - } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { - // Unrelated graph that has been destroyed. Remove and delete. - *pg = g->next; - ncclStrongStreamGraphDelete(g); } else { - pg = &g->next; + cudaStreamCaptureStatus status; + CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock); + if (status == cudaStreamCaptureStatusActive) { + pcap = &cap->next; // Active capture doesn't match, on to next. + } else { // Capture no longer active + *pcap = cap->next; // Remove from current list + if (spare == nullptr) { // Keep one spare to reuse below. + spare = cap; + } else { + cudaStreamDestroy(cap->captureStream); + free(cap); + } + } } } - - // This is a new graph so add to the list. - g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); - g->graphId = graph.graphId; - g->tipNodes = nullptr; - g->tipCapacity = 0; - g->tipCount = 0; - g->next = ss->graphHead; - ss->graphHead = g; - g->alive = true; - NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); - - if (mixing && ss->serialEventNeedsRecord) { - // Can only be here if previous release was for uncaptured work that - // elided updating the event because no capture had yet occurred. - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); + // No matching capture, need a new entry. + cap = spare; + if (cap == nullptr) { + cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } - ss->serialEventNeedsRecord = false; + cap->graphId = graph.graphId; + cap->lastRecord = nullptr; + cap->acquiredBy = localThreadId(); + // Push to capturing list. + cap->next = ss->captureHead; + ss->captureHead = cap; - // First node in the chain must be a wait on the serialEvent. + do_unlock: + if (concurrent) pthread_mutex_unlock(&ss->lock); + if (ret != ncclSuccess) return ret; + + *workStream = cap->captureStream; + + // Bring captureStream into the graph but without any dependencies. 
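Before the capture-join code resumes below, the race detection threaded through this path is worth a note: localThreadId(), defined earlier in this diff, returns the address of a thread-local marker byte, giving a cheap, allocation-free thread identity that acquire can stash and release can compare. A minimal sketch of the idiom:

```
// The address of a thread-local is unique per live thread, so it can
// serve as an identity token without any allocation or syscall.
static __thread char threadIdMarker;
static void* localThreadId() { return &threadIdMarker; }

// Stash at acquire time...
void* recordOwner() { return localThreadId(); }

// ...and compare at release time to detect a second racing thread.
bool sameThread(void* owner) { return owner == localThreadId(); }
```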
+ cudaEvent_t scratch; + CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming)); + CUDACHECK(cudaEventRecord(scratch, graph.origin)); + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); + CUDACHECK(cudaEventDestroy(scratch)); + CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); + + if (mixing && firstCapture) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } if (mixing) { - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); - g->tipCount = 1; - } else { - g->tipCount = 0; + // First dependency is to wait on serialEvent + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, cudaEventWaitExternal)); } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream + ) { #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + } else { + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + *workStream = cap->captureStream; + if (concurrent) pthread_mutex_unlock(&ss->lock); } - ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. + #else + *workStream = ss->liveStream; #endif return ncclSuccess; } -static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { - if (g == nullptr || g->graphId != id) { - WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); - return ncclInternalError; - } - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { +ncclResult_t ncclStrongStreamRelease( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent + ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->serialEventNeedsRecord) { - if (graph.graph == nullptr) { - if (ss->everCaptured) { - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); - ss->serialEventNeedsRecord = false; + if (mixing) { + if (graph.graphId == ULLONG_MAX) { + if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; } } else { - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); - g->tipCount = 1; - ss->serialEventNeedsRecord = false; + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + if (concurrent) pthread_mutex_unlock(&ss->lock); + + // Add event record node with dependencies added further down.
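The Acquire hunk above joins the capture with a throwaway event and then clears the inherited dependency set. Isolated into a standalone helper, the idiom looks like the sketch below (joinCaptureUnordered is an illustrative name; it assumes origin is actively capturing):

```
#include <cuda_runtime.h>

// Recording an event on the capturing origin stream and waiting on it
// pulls worker into the same capture; clearing the dependency set then
// drops the inherited edge so later work on worker starts unordered.
void joinCaptureUnordered(cudaStream_t origin, cudaStream_t worker) {
  cudaEvent_t scratch;
  cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming);
  cudaEventRecord(scratch, origin);
  cudaStreamWaitEvent(worker, scratch, 0); // worker now captures into origin's graph
  cudaEventDestroy(scratch);
  cudaStreamUpdateCaptureDependencies(worker, nullptr, 0, cudaStreamSetCaptureDependencies);
}
```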
+ cudaGraphNode_t recordNode; + CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); + + // Make this record order after previous record on this stream. + if (cap->lastRecord != nullptr) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + } + cap->lastRecord = recordNode; + + // Get current nodes from work stream so we can add them as dependencies. + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. + cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count)); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1)); + } + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + } + } + + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); - } else { - cudaHostNodeParams p; - p.fn = fn; - p.userData = arg; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - } else { - cudaKernelNodeParams p; - p.func = fn; - p.gridDim = grid; - p.blockDim = block; - p.kernelParams = args; - p.sharedMemBytes = sharedMemBytes; - p.extra = nullptr; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - #endif - return ncclSuccess; -} - -// Merge node list `b` into list `a` but don't add duplicates. 
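The Release path above also copes with CUDA 12.3 edge annotations: cudaStreamGetCaptureInfo_v2 fails with cudaErrorLossyQuery when the capture's dependency edges carry annotation data the older query cannot report, so the code retries with the _v3 variant. A condensed sketch of that fallback, with error handling reduced to the essentials:

```
#include <cuda_runtime.h>

// Fetch the capture's current leaf nodes and make them dependencies of
// recordNode, preserving edge annotations when the runtime reports them.
cudaError_t addDepsOnCaptureLeaves(cudaStream_t stream, cudaGraph_t graph, cudaGraphNode_t recordNode) {
  cudaStreamCaptureStatus status;
  const cudaGraphNode_t* nodes;
  size_t count = 0;
  cudaError_t res = cudaStreamGetCaptureInfo_v2(stream, &status, nullptr, nullptr, &nodes, &count);
#if CUDART_VERSION >= 12030
  if (res == cudaErrorLossyQuery) { // edges are annotated: use the lossless query
    const cudaGraphEdgeData* edges;
    cudaStreamGetCaptureInfo_v3(stream, &status, nullptr, nullptr, &nodes, &edges, &count);
    for (size_t i = 0; i < count; i++)
      cudaGraphAddDependencies_v2(graph, &nodes[i], &recordNode, &edges[i], 1);
    return cudaSuccess;
  }
#endif
  if (res != cudaSuccess) return res;
  for (size_t i = 0; i < count; i++)
    cudaGraphAddDependencies(graph, &nodes[i], &recordNode, 1);
  return cudaSuccess;
}
```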
-static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { - int an = a->tipCount; - ensureTips(a, an + bn); - for (int bi=0; bi < bn; bi++) { - for (int ai=0; ai < an; ai++) { - if (a->tipNodes[ai] == bNodes[bi]) goto next_b; - } - a->tipNodes[a->tipCount++] = bNodes[bi]; - next_b:; - } -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bg->tipNodes, bg->tipCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - // It is ok to use a->serialEvent to record b since we'll be setting - // a->serialEventNeedsRecord so the event won't be considered accurate - // until re-recorded. - CUDACHECK(cudaEventRecord(a->serialEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); - } else { - cudaStreamCaptureStatus status; - unsigned long long bGraphId; - cudaGraphNode_t const* bNodes; - size_t bCount = 0; - CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); - if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { - WARN("Stream is not being captured by the expected graph."); - return ncclInvalidUsage; - } - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bNodes, bCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(a->scratchEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, - b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies - )); - } - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); - #endif +ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent) { + CUDACHECK(cudaEventRecord(scratchEvent, b)); + CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0)); return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); #endif - CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); + CUDACHECK(cudaStreamSynchronize(ss->liveStream)); return ncclSuccess; } diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc deleted file mode 100644 index 267e12a..0000000 --- a/src/misc/tuner.cc +++ /dev/null @@ -1,267 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include -#include - -#include "checks.h" -#include "debug.h" -#include "tuner.h" - -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int tunerPluginRefCount; -static void* tunerPluginLib = nullptr; -static ncclTuner_v4_t* tunerSymbol = nullptr; -static ncclTuner_v3_t* ncclTuner_v3 = nullptr; -static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v4_t ncclTuner_v2_as_v4; -static ncclTuner_v4_t ncclTuner_v3_as_v4; - -static int hasNvlsSupport(float** collCostTable) { - // Requirements for support of different algorithms: - // - // - NVLS intra-node: nvlsSupport - // - NVLS intra+inter-node: collNetSupport - // - NVLSTree intra-node: always disabled - // - NVLSTree inter-node: nvlsSupport - // - Collnet* inter-node: collNetSupport - // - // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; -} - -static int hasCollNetSupport(float** collCostTable) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; -} - -static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { - NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; - ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { - int algorithm = NCCL_ALGO_UNDEF; - int protocol = NCCL_PROTO_UNDEF; - int nvlsSupport = hasNvlsSupport(collCostTable); - int collNetSupport = hasCollNetSupport(collCostTable); - NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); - // set time to 0 below to make sure this algorithm/protocol is selected later on - if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; - } - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; - ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; - return ncclSuccess; -} - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(const char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
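The deleted loader above bridges older tuner ABIs by hiding a vN vtable behind vN+1-shaped shims; ncclTuner_v2_as_v4_getCollInfo, for instance, synthesizes arguments the old entry point never had. Stripped of NCCL specifics, the pattern reduces to this hypothetical sketch (ApiV1 and ApiV2 are illustrative types, not NCCL symbols):

```
// An old v1 vtable is wrapped so callers only ever see the v2 shape.
typedef struct { int (*query)(int dev, int* speed); } ApiV1;
typedef struct { int (*query)(int dev, int* speed, int* latency); } ApiV2;

static ApiV1* v1;      // resolved via dlsym() in a real loader
static ApiV2 v1_as_v2; // adapter instance handed to new callers

static int v1_as_v2_query(int dev, int* speed, int* latency) {
  *latency = 0;                 // default for a field v1 cannot report
  return v1->query(dev, speed); // forward the rest unchanged
}

static void wireAdapter(void) { v1_as_v2.query = v1_as_v2_query; }
```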
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openTunerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char tunerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); - if (envTunerPluginName && strlen(envTunerPluginName)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); - snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - // Users are allowed to pack tuner into the net plugin - snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - tunerPluginLibName[0] = '\0'; - return nullptr; -} - -enum { - tunerPluginLoadFailed = -1, - tunerPluginLoadReady = 0, - tunerPluginLoadSuccess = 1, -}; - -#define MAX_PLUGIN_LOAD 4 - -static int status = tunerPluginLoadReady; - -ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { - // Initialize to nullptr by default 
if plugin tuner cannot be loaded. - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - comm->tuner = nullptr; - if (tunerPluginLoadFailed == status) { - return ncclSuccess; - } - - pthread_mutex_lock(&tunerPluginLock); - if (tunerPluginLoadFailed == status) { - goto exit; - } - - if (tunerPluginLoadSuccess == status) { - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - goto exit; - } - - tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (nullptr == tunerPluginLib) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); - } - goto fail; - } - - tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); - if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); - ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); - if (ncclTuner_v3 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; - } else { - ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v4; - } - } else { - ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - tunerSymbol = &ncclTuner_v3_as_v4; - } - } - - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - status = tunerPluginLoadSuccess; - comm->tunerPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -fail: - tunerPluginLib = nullptr; - status = tunerPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); - if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - dlclose(tunerPluginLib); - tunerPluginLib = nullptr; - tunerSymbol = nullptr; - comm->tuner = nullptr; - status = tunerPluginLoadReady; - comm->tunerPluginLoaded = 0; - } - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 8a6f94e..f3ab534 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -66,6 +66,7 @@ typedef struct ncclConfig_v21700 { int maxCTAs; const char *netName; int splitShare; + int trafficClass; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -79,7 +80,8 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. 
*/ diff --git a/src/net.cc b/src/net.cc deleted file mode 100644 index 13e8c2b..0000000 --- a/src/net.cc +++ /dev/null @@ -1,1033 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "net.h" -#include "bootstrap.h" -#include "checks.h" - -#include -#include -#include -//#include -//#include -//#include - -static ncclNet_v9_t ncclNet_v5_as_v9; -static ncclNet_v9_t ncclNet_v6_as_v9; -static ncclNet_v9_t ncclNet_v7_as_v9; -static ncclNet_v9_t ncclNet_v8_as_v9; -static ncclNet_v5_t *ncclNet_v5; -static ncclNet_v6_t *ncclNet_v6; -static ncclNet_v7_t *ncclNet_v7; -static ncclNet_v8_t *ncclNet_v8; -static ncclCollNet_v9_t ncclCollNet_v5_as_v9; -static ncclCollNet_v9_t ncclCollNet_v6_as_v9; -static ncclCollNet_v9_t ncclCollNet_v7_as_v9; -static ncclCollNet_v9_t ncclCollNet_v8_as_v9; -static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclCollNet_v6_t *ncclCollNet_v6; -static ncclCollNet_v7_t *ncclCollNet_v7; -static ncclCollNet_v8_t *ncclCollNet_v8; - -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. -#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried - -static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = p8.netDeviceType; - props->netDeviceVersion = p8.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v8->init(logfn)); - ncclNet_v8_as_v9.name = ncclNet_v8->name; - ncclNet_v8_as_v9.devices = ncclNet_v8->devices; - ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; - ncclNet_v8_as_v9.listen = ncclNet_v8->listen; - ncclNet_v8_as_v9.connect = ncclNet_v8->connect; - ncclNet_v8_as_v9.accept = ncclNet_v8->accept; - ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; -
ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; - ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; - ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; - ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; - ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; - ncclNet_v8_as_v9.test = ncclNet_v8->test; - ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; - ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; - ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; - ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; - ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; - ncclNet_v8_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = p7.netDeviceType; - props->netDeviceVersion = p7.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v9.name = ncclNet_v7->name; - ncclNet_v7_as_v9.devices = ncclNet_v7->devices; - ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v9.listen = ncclNet_v7->listen; - ncclNet_v7_as_v9.connect = ncclNet_v7->connect; - ncclNet_v7_as_v9.accept = ncclNet_v7->accept; - ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; - ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; - ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; - ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v9.test = ncclNet_v7->test; - ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; -
ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; - ncclNet_v7_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v6->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v6->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v9.name = ncclNet_v6->name; - ncclNet_v6_as_v9.devices = ncclNet_v6->devices; - ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; - ncclNet_v6_as_v9.listen = ncclNet_v6->listen; - ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; - ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; - ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; - ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; - ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; - ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v9.test = ncclNet_v6->test; - ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v9.getDeviceMr = NULL; - ncclNet_v6_as_v9.irecvConsumed = NULL; - ncclNet_v6_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t*
props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v5->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v5->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand.
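The comment above names a subtle ordering constraint: an old plugin may only populate its exported struct, name included, once its own init() has run, so each adapter wires its vtable after calling through. A hypothetical, condensed form of the pattern (PluginV1 and initV1AsV2 are illustrative names):

```
// Copy the old plugin's members only after its init() has filled them in.
typedef int (*logFn)(const char* msg);
typedef struct { const char* name; int (*init)(logFn log); int (*devices)(int* n); } PluginV1;

static PluginV1* v1plugin; // resolved via dlsym() in the real loader
static struct { const char* name; int (*devices)(int* n); } v1_as_v2;

static int initV1AsV2(logFn log) {
  int rc = v1plugin->init(log); // the old init may set name/devices here
  if (rc != 0) return rc;
  v1_as_v2.name = v1plugin->name;       // safe to copy only now
  v1_as_v2.devices = v1plugin->devices;
  return 0;
}
```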
-static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v9.name = ncclNet_v5->name; - ncclNet_v5_as_v9.devices = ncclNet_v5->devices; - ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; - ncclNet_v5_as_v9.listen = ncclNet_v5->listen; - ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; - ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; - ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; - ncclNet_v5_as_v9.regMrDmaBuf = NULL; - ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; - ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; - ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v9.test = ncclNet_v5->test; - ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v9.getDeviceMr = NULL; - ncclNet_v5_as_v9.irecvConsumed = NULL; - ncclNet_v5_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; - ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; - ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; - ncclCollNet_v5_as_v9.iallgather = nullptr; - ncclCollNet_v5_as_v9.ireducescatter = nullptr; - ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v6 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; - ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; - ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; - ncclCollNet_v6_as_v9.iallgather = nullptr; - ncclCollNet_v6_as_v9.ireducescatter = nullptr; - ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v7 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; - ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; - ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; - ncclCollNet_v7_as_v9.iallgather = nullptr; - ncclCollNet_v7_as_v9.ireducescatter = nullptr; - ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request) { - ncclNetSGE_v8_t recvPartsInt; - if (nRecvParts > 1) return ncclInternalError; - if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - recvPartsInt.mhandle = recvParts->mhandle; - recvPartsInt.address = recvParts->address; - recvPartsInt.size = (int)recvParts->size; - ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, - bytesPerRank, windowOffset, windowBytes, - sendMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request) { - ncclNetSGE_v8_t sendPartsInt; - if (nSendParts > 1) return ncclInternalError; - if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - sendPartsInt.mhandle = 
sendParts->mhandle; - sendPartsInt.address = sendParts->address; - sendPartsInt.size = (int)sendParts->size; - ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, - recvData, bytesPerRank, windowOffset, windowBytes, - dataType, redOp, - recvMhandle, request); - return ans; -} - -// We use a wrapper around the v8 init to copy over the struct contents -// post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v8->init(logfn)); - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; - ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; - ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; - ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; - ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; - ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; - ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; - ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; - ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; - ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; - ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; - ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; - ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; - ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; - ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; - return ncclSuccess; -} - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); - return nameList; -} - -static void* openNetPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char netPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - - snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } else { - snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } - return nullptr; -} - -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int netPluginRefCount; -static void* netPluginLib; - -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; - -#define MAX_PLUGIN_LOAD 2 - -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; - } - - netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (netPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. 
Using internal network plugin.", couldNotFindNames); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); - } - goto fail; - } - - ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); - if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); - ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); - if (ncclNet_v8 == nullptr) { - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; - } else { - ncclNets[0] = &ncclNet_v5_as_v9; - ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v9.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v6_as_v9; - ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v9.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v7_as_v9; - ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v7_as_v9.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v8_as_v9; - ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v8_as_v9.name = ncclNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); - } - - // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); - if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); - ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); - if (ncclCollNet_v8 == nullptr) { - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); - } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v9; - ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v9; - ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v9; - ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v8_as_v9; - ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); - } - - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -fail: - if (netPluginLib) dlclose(netPluginLib); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - dlclose(netPluginLib); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; - } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -} - -ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { - ncclNetProperties_t props; - - NCCLCHECK(net->getProperties(dev, &props)); - ncclNetDeviceType type = props.netDeviceType; - if (type) switch (type) { - case NCCL_NET_DEVICE_UNPACK: - if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { - INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", - props.netDeviceVersion); - return ncclSuccess; - } else { - WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", - props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); - return ncclInternalError; - } - default: - WARN("Unknown device code index %d \n", type); - return ncclInternalError; - } - - return ncclSuccess; -} - -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; - } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -static ncclResult_t 
collNetGetState(int i, enum ncclNetState* state) {
-  pthread_mutex_lock(&netLock);
-  if (ncclCollNetStates[i] == ncclNetStateInit) {
-    int ndev;
-    if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
-    else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
-    else ncclCollNetStates[i] = ncclNetStateEnabled;
-  }
-  *state = ncclCollNetStates[i];
-  pthread_mutex_unlock(&netLock);
-  return ncclSuccess;
-}
-
-ncclResult_t ncclNetInit(struct ncclComm* comm) {
-  // Initialize main communication network
-  const char* netName;
-  bool ok = false;
-
-  netName = comm->config.netName;
-  for (int i=0; i<3; i++) {
-    if (ncclNets[i] == nullptr) continue;
-    enum ncclNetState state;
-    NCCLCHECK(netGetState(i, &state));
-    if (state != ncclNetStateEnabled) continue;
-    if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
-    if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
-      // Mismatched device plugin version
-      continue;
-    }
-
-    comm->ncclNet = ncclNets[i];
-    ok = true;
-
-    if (ncclCollNets[i]) {
-      NCCLCHECK(collNetGetState(i, &state));
-      if (state == ncclNetStateEnabled) {
-        comm->ncclCollNet = ncclCollNets[i];
-      }
-    }
-    break;
-  }
-
-  if (!ok) {
-    WARN("Error: network %s not found.", netName ? netName : "");
-    return ncclInvalidUsage;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
-  comm->ncclNet = nullptr;
-  comm->ncclCollNet = nullptr;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
-  constexpr int GPU_BUF_SIZE = 2*1024*1024;
-#if CUDART_VERSION >= 11030
-  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
-  int driverVersion;
-  CUDACHECK(cudaDriverGetVersion(&driverVersion));
-  if (driverVersion >= 11030) {
-    int cudaDev, attr = 0;
-    CUDACHECK(cudaGetDevice(&cudaDev));
-    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
-    *gdrSupport = attr;
-    return ncclSuccess;
-  }
-#endif
-  static int gdrSupportMatrix[32] = {
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
-  if (gdrSupportMatrix[comm->cudaDev] == -1) {
-    int netDevs;
-    NCCLCHECK(comm->ncclNet->devices(&netDevs));
-    gdrSupportMatrix[comm->cudaDev] = 0;
-    for (int dev=0; dev<netDevs; dev++) {
-      // Find a net device which is GDR-capable
-      ncclNetProperties_t props;
-      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
-      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
-
-      // Allocate memory on the GPU and try to register it on the NIC.
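-      // The probe builds a loopback connection on this NIC (listen, then
-      // connect/accept against our own handle) and tries to register a CUDA
-      // buffer on both comms; success marks GDR as supported for this GPU.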
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - char* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); - - bool connected; - connected = false; - while (!connected) { - - // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { - goto cleanup2; - } - - if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); - - if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); - - connected = (rComm != NULL) && (sComm != NULL); - } - - NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); - NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); - gdrSupportMatrix[comm->cudaDev] = 1; - } - ncclDebugNoWarn = 0; - NCCLCHECK(ncclCudaFree(gpuPtr)); -cleanup2: - if (rComm != NULL) - NCCLCHECK(comm->ncclNet->closeRecv(rComm)); - if (sComm != NULL) - NCCLCHECK(comm->ncclNet->closeSend(sComm)); - NCCLCHECK(comm->ncclNet->closeListen(lComm)); -cleanup1: - break; - } - } - *gdrSupport = gdrSupportMatrix[comm->cudaDev]; - return ncclSuccess; -} - -int ncclNetVersion(struct ncclComm* comm) { - return - (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : - (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 : - 9; -} diff --git a/src/plugin/net.cc b/src/plugin/net.cc new file mode 100644 index 0000000..9257d77 --- /dev/null +++ b/src/plugin/net.cc @@ -0,0 +1,319 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "net.h"
+#include "bootstrap.h"
+#include "checks.h"
+#include "plugin.h"
+
+#include <string.h>
+#include <errno.h>
+//#include <sys/types.h>
+//#include <sys/stat.h>
+//#include <unistd.h>
+
+extern ncclNet_t* getNcclNet_v6(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v7(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v8(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v9(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v10(void* netPluginLib);
+
+extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib);
+
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
+ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
+static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 };
+ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
+enum ncclNetState {
+  ncclNetStateInit = 0,
+  ncclNetStateEnabled = 1,
+  ncclNetStateDisabled = 2
+};
+enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+
+NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1);
+static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER;
+static void* netPluginLib;
+
+static int netPluginRefCount;
+static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();}
+
+enum {
+  netPluginLoadFailed  = -1,
+  netPluginLoadReady   = 0,
+  netPluginLoadSuccess = 1,
+};
+
+static int netPluginStatus = netPluginLoadReady;
+
+ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
+  static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT;
+  pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce);
+
+  pthread_mutex_lock(&netPluginLock);
+  if (netPluginLoadFailed == netPluginStatus) {
+    goto exit;
+  }
+  if (netPluginLoadSuccess == netPluginStatus) {
+    ++netPluginRefCount;
+    goto exit;
+  }
+
+  netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN"));
+  if (netPluginLib == nullptr) {
+    goto fail;
+  }
+
+  ncclNets[0] = getNcclNet_v10(netPluginLib);
+  if (ncclNets[0]) ncclNetsVer[0] = 10;
+  if (ncclNets[0] == nullptr) {
+    // Try v9 plugin
+    ncclNets[0] = getNcclNet_v9(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 9;
+  }
+  if (ncclNets[0] == nullptr) {
+    // Try v8 plugin
+    ncclNets[0] = getNcclNet_v8(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 8;
+  }
+  if (ncclNets[0] == nullptr) {
+    // Try v7 plugin
+    ncclNets[0] = getNcclNet_v7(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 7;
+  }
+  if (ncclNets[0] == nullptr) {
+    // Try v6 plugin
+    ncclNets[0] = getNcclNet_v6(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 6;
+  }
+  if (ncclNets[0] == nullptr) {
+    goto fail;
+  }
+
+  // Check for CollNet
+  ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v9(netPluginLib);
+  }
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v8(netPluginLib);
+  }
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v7(netPluginLib);
+  }
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = 
getNcclCollNet_v6(netPluginLib); + } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + } + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +} + +ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { + ncclNetProperties_t props; + + NCCLCHECK(net->getProperties(dev, &props)); + ncclNetDeviceType type = props.netDeviceType; + if (type) switch (type) { + case NCCL_NET_DEVICE_UNPACK: + if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { + INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; + } else { + WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", + props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); + return ncclInternalError; + } + default: + WARN("Unknown device code index %d \n", type); + return ncclInternalError; + } + + return ncclSuccess; +} + +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + // Initialize main communication network + const char* netName; + bool ok = false; + + netName = comm->config.netName; + for (int i=0; i<3; i++) { + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { + // Mismatched device plugin version + continue; + } + + comm->ncclNet = ncclNets[i]; + comm->ncclNetVer = 
ncclNetsVer[i];
+    ok = true;
+
+    if (ncclCollNets[i]) {
+      NCCLCHECK(collNetGetState(i, &state));
+      if (state == ncclNetStateEnabled) {
+        comm->ncclCollNet = ncclCollNets[i];
+      }
+    }
+    break;
+  }
+
+  if (!ok) {
+    WARN("Error: network %s not found.", netName ? netName : "");
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
+  comm->ncclNet = nullptr;
+  comm->ncclCollNet = nullptr;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
+  constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
+  int driverVersion;
+  CUDACHECK(cudaDriverGetVersion(&driverVersion));
+  if (driverVersion >= 11030) {
+    int cudaDev, attr = 0;
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+    *gdrSupport = attr;
+    return ncclSuccess;
+  }
+#endif
+  static int gdrSupportMatrix[32] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  if (gdrSupportMatrix[comm->cudaDev] == -1) {
+    int netDevs;
+    NCCLCHECK(comm->ncclNet->devices(&netDevs));
+    gdrSupportMatrix[comm->cudaDev] = 0;
+    for (int dev=0; dev<netDevs; dev++) {
+      // Find a net device which is GDR-capable
+      ncclNetProperties_t props;
+      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
+      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+      // Allocate memory on the GPU and try to register it on the NIC.
+      void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+      ncclNetHandle_t handle;
+      char* gpuPtr = NULL;
+      void* mHandle = NULL;
+      ncclResult_t ret;
+      ncclDebugNoWarn = NCCL_NET;
+      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
+
+      bool connected;
+      connected = false;
+      while (!connected) {
+
+        // If we're aborting now, skip to cleanup
+        if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) {
+          goto cleanup2;
+        }
+
+        if (sComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2);
+
+        if (rComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
+
+        connected = (rComm != NULL) && (sComm != NULL);
+      }
+
+      NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
+      if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+        NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
+        NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+        NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+        gdrSupportMatrix[comm->cudaDev] = 1;
+      }
+      ncclDebugNoWarn = 0;
+      NCCLCHECK(ncclCudaFree(gpuPtr));
+cleanup2:
+      if (rComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeRecv(rComm));
+      if (sComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeSend(sComm));
+      NCCLCHECK(comm->ncclNet->closeListen(lComm));
+cleanup1:
+      break;
+    }
+  }
+  *gdrSupport = gdrSupportMatrix[comm->cudaDev];
+  return ncclSuccess;
+}
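For reference, the `getNcclNet_vX` probes above succeed as soon as the library exports the matching symbol. A minimal sketch of a v10 export follows; the `example` name and functions are hypothetical, and a real plugin must fill in the complete `ncclNet_v10_t` function table rather than leave entries NULL:

```
/* Hypothetical plugin sketch, not part of this patch. */
#include "nccl_net.h"

static ncclResult_t exampleInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  return ncclSuccess;
}
static ncclResult_t exampleDevices(int* ndev) { *ndev = 1; return ncclSuccess; }

ncclNet_v10_t ncclNetPlugin_v10 = {
  .name = "example",
  .init = exampleInit,
  .devices = exampleDevices,
  /* remaining entries omitted in this sketch; a real plugin must set them */
};
```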
diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc
new file mode 100644
index 0000000..682f239
--- /dev/null
+++ b/src/plugin/net/net_v10.cc
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+
+static ncclNet_v10_t* ncclNet_v10;
+static ncclCollNet_v10_t* ncclCollNet_v10;
+
+ncclNet_t* getNcclNet_v10(void* lib) {
+  ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10");
+  if (ncclNet_v10) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name);
+    return ncclNet_v10;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol.");
+  return nullptr;
+}
+
+ncclCollNet_t* getNcclCollNet_v10(void* lib) {
+  ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10");
+  if (ncclCollNet_v10) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name);
+    return ncclCollNet_v10;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc
new file mode 100644
index 0000000..baff679
--- /dev/null
+++ b/src/plugin/net/net_v6.cc
@@ -0,0 +1,178 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v6_t* ncclNet_v6;
+static ncclCollNet_v6_t* ncclCollNet_v6;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+  return ncclNet_v6->connect(dev, handle, sendComm);
+}
+
+static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+  return ncclNet_v6->accept(listenComm, recvComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
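+// The isend/irecv wrappers accept the extra pHandle/pHandles arguments of the
+// current API but drop them when calling into the v6 plugin, which has no
+// profiler support (just as the proffn argument to ncclNet_init goes unused).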
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+      sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v6->init(logfn));
+  ncclNet.devices = ncclNet_v6->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v6->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_accept;
+  ncclNet.regMr = ncclNet_regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v6->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v6->iflush;
+  ncclNet.test = ncclNet_v6->test;
+  ncclNet.closeSend = ncclNet_v6->closeSend;
+  ncclNet.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet.closeListen = ncclNet_v6->closeListen;
+  ncclNet.getDeviceMr = NULL;
+  ncclNet.irecvConsumed = NULL;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v6(void* lib) {
+  ncclNet_v6 = (ncclNet_v6_t*)dlsym(lib, "ncclNetPlugin_v6");
+  if (ncclNet_v6) {
+    ncclNet.name = ncclNet_v6->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
+  return nullptr;
+}
+
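+// As with the net struct, the collnet function table is only filled in after
+// the v6 plugin's init has run, since its struct contents may not be valid
+// before then. iallgather/ireducescatter stay nullptr because they postdate v6.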
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v6->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v6->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v6->listen;
+  ncclCollNet.connect = ncclCollNet_v6->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet.test = ncclCollNet_v6->test;
+  ncclCollNet.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v6->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v6(void* lib) {
+  ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(lib, "ncclCollNetPlugin_v6");
+  if (ncclCollNet_v6) {
+    ncclCollNet.name = ncclCollNet_v6->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc
new file mode 100644
index 0000000..4bad5ec
--- /dev/null
+++ b/src/plugin/net/net_v7.cc
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v7_t* ncclNet_v7;
+static ncclCollNet_v7_t* ncclCollNet_v7;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v7_t p7;
+  ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
+  if (ans != ncclSuccess) return ans;
+  props->name = p7.name;
+  props->pciPath = p7.pciPath;
+  props->guid = p7.guid;
+  props->ptrSupport = p7.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p7.speed;
+  props->port = p7.port;
+  props->maxComms = p7.maxComms;
+  props->maxRecvs = p7.maxRecvs;
+  props->latency = p7.latency;
+  props->netDeviceType = p7.netDeviceType;
+  props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+ 
sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v7_t p7; + ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); + if (ans != ncclSuccess) return ans; + props->name = p7.name; + props->pciPath = p7.pciPath; + props->guid = p7.guid; + props->ptrSupport = p7.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p7.speed; + props->port = p7.port; + props->maxComms = p7.maxComms; + props->maxRecvs = p7.maxRecvs; + props->latency = p7.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v7->init(logfn)); + ncclNet.devices = ncclNet_v7->devices; + ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties; + ncclNet.listen = ncclNet_v7->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v7->accept; + ncclNet.regMr = ncclNet_regMr; + ncclNet.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v7->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v7->iflush; + ncclNet.test = ncclNet_v7->test; + ncclNet.closeSend = ncclNet_v7->closeSend; + ncclNet.closeRecv = ncclNet_v7->closeRecv; + ncclNet.closeListen = ncclNet_v7->closeListen; + ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v7(void* lib) { + ncclNet_v7 = (ncclNet_v7_t*)dlsym(lib, "ncclNetPlugin_v7"); + if (ncclNet_v7) { + ncclNet.name = ncclNet_v7->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v7->init(logfn)); + ncclCollNet.devices = ncclCollNet_v7->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v7->listen; + ncclCollNet.connect = ncclCollNet_v7->connect; + ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport; + ncclCollNet.regMr = ncclCollNet_regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v7->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + 
ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet.test = ncclCollNet_v7->test;
+  ncclCollNet.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v7(void* lib) {
+  ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(lib, "ncclCollNetPlugin_v7");
+  if (ncclCollNet_v7) {
+    ncclCollNet.name = ncclCollNet_v7->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc
new file mode 100644
index 0000000..b43bb89
--- /dev/null
+++ b/src/plugin/net/net_v8.cc
@@ -0,0 +1,196 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v8_t* ncclNet_v8;
+static ncclCollNet_v8_t* ncclCollNet_v8;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+ 
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+      sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    void* sendMhandle, void** request) {
+  ncclNetSGE_v8_t recvPartsInt;
+  if (nRecvParts > 1) return ncclInternalError;
+  if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError;
+  recvPartsInt.mhandle = recvParts->mhandle;
+  recvPartsInt.address = recvParts->address;
+  recvPartsInt.size = (int)recvParts->size;
+  ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt,
+      bytesPerRank, windowOffset, windowBytes,
+      sendMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    ncclDataType_t dataType, ncclRedOp_t redOp,
+    void* recvMhandle, void** request) {
+  ncclNetSGE_v8_t sendPartsInt;
+  if (nSendParts > 1) return ncclInternalError;
+  if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError;
+  sendPartsInt.mhandle = sendParts->mhandle;
+  sendPartsInt.address = sendParts->address;
+  sendPartsInt.size = (int)sendParts->size;
+  ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt,
+      recvData, bytesPerRank, windowOffset, windowBytes,
+      dataType, redOp,
+      recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v8->init(logfn));
+  ncclNet.devices = ncclNet_v8->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v8->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_v8->accept;
+  ncclNet.regMr = ncclNet_v8->regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v8->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v8->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v8->iflush;
+  ncclNet.test = ncclNet_v8->test;
+  ncclNet.closeSend = ncclNet_v8->closeSend;
+  ncclNet.closeRecv = ncclNet_v8->closeRecv;
+  ncclNet.closeListen = ncclNet_v8->closeListen;
+  ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr;
+  ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v8(void* lib) {
+  ncclNet_v8 = (ncclNet_v8_t*)dlsym(lib, "ncclNetPlugin_v8");
+  if (ncclNet_v8) {
+    ncclNet.name = ncclNet_v8->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v8->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v8->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v8->listen;
+  ncclCollNet.connect = ncclCollNet_v8->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_v8->regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v8->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = ncclCollNet_iallgather;
+  ncclCollNet.ireducescatter = ncclCollNet_ireducescatter;
+  ncclCollNet.iflush = ncclCollNet_v8->iflush;
+  ncclCollNet.test = ncclCollNet_v8->test;
+  ncclCollNet.closeColl = ncclCollNet_v8->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v8->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v8(void* lib) {
+  ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(lib, "ncclCollNetPlugin_v8");
+  if (ncclCollNet_v8) {
+    ncclCollNet.name = ncclCollNet_v8->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
+  return nullptr;
+}
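The v8 shims above must narrow NCCL's 64-bit sizes into the `int` fields of the v8 structures, which is why each call is guarded with `MAX_NET_SIZE`/`MAX_COLLNET_SIZE` before casting. The same guard can be expressed as a small reusable helper; this is an illustrative sketch only (the helper name is made up and does not exist in the NCCL sources):

```
#include <limits.h>
// Refuse to truncate: fail loudly instead of corrupting a transfer larger than INT_MAX.
static inline ncclResult_t narrowSizeToInt(size_t in, int* out) {
  if (in > (size_t)INT_MAX) return ncclInternalError;
  *out = (int)in;
  return ncclSuccess;
}
```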
diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc
new file mode 100644
index 0000000..34e0393
--- /dev/null
+++ b/src/plugin/net/net_v9.cc
@@ -0,0 +1,121 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v9_t* ncclNet_v9;
+static ncclCollNet_v9_t* ncclCollNet_v9;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request);
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request);
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) {
+  return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props);
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props);
+}
+
+static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    void* sendMhandle, void** request) {
+  return ncclCollNet_v9->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v9_t*)recvParts, bytesPerRank,
+      windowOffset, windowBytes, sendMhandle, request);
+}
+
+static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    ncclDataType_t dataType, ncclRedOp_t redOp,
+    void* recvMhandle, void** request) {
+  return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank,
+      windowOffset, windowBytes, dataType, redOp, recvMhandle, request);
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v9->init(logfn));
+  ncclNet.devices = ncclNet_v9->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v9->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_v9->accept;
+  ncclNet.regMr = ncclNet_v9->regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v9->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v9->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v9->iflush;
+  ncclNet.test = ncclNet_v9->test;
+  ncclNet.closeSend = ncclNet_v9->closeSend;
+  ncclNet.closeRecv = ncclNet_v9->closeRecv;
+  ncclNet.closeListen = ncclNet_v9->closeListen;
+  ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr;
+  ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed;
+  ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? ncclNet_makeVDevice : nullptr;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v9(void* lib) {
+  ncclNet_v9 = (ncclNet_v9_t*)dlsym(lib, "ncclNetPlugin_v9");
+  if (ncclNet_v9) {
+    ncclNet.name = ncclNet_v9->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v9->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v9->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v9->listen;
+  ncclCollNet.connect = ncclCollNet_v9->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_v9->regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v9->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v9->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_v9->iallreduce;
+  ncclCollNet.iallgather = ncclCollNet_iallgather;
+  ncclCollNet.ireducescatter = ncclCollNet_ireducescatter;
+  ncclCollNet.iflush = ncclCollNet_v9->iflush;
+  ncclCollNet.test = ncclCollNet_v9->test;
+  ncclCollNet.closeColl = ncclCollNet_v9->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v9->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v9(void* lib) {
+  ncclCollNet_v9 = (ncclCollNet_v9_t*)dlsym(lib, "ncclCollNetPlugin_v9");
+  if (ncclCollNet_v9) {
+    ncclCollNet.name = ncclCollNet_v9->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol.");
+  return nullptr;
+}
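NCCL probes for the newest entry point first and only falls back to older ones, so a plugin that exports several versioned symbols is always driven through the richest ABI it supports. A sketch of that probing order, using only the two getters defined in this patch (the real loader also tries newer revisions, v10 in this API, before these):

```
extern ncclNet_t* getNcclNet_v9(void* lib); // from net_v9.cc above
extern ncclNet_t* getNcclNet_v8(void* lib); // from net_v8.cc above

// Hypothetical glue, not part of the patch: newest version wins.
static ncclNet_t* pickNewestNet(void* lib) {
  ncclNet_t* net = getNcclNet_v9(lib);
  if (net == NULL) net = getNcclNet_v8(lib);
  return net;
}
```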
diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc
new file mode 100644
index 0000000..a43df28
--- /dev/null
+++ b/src/plugin/plugin_open.cc
@@ -0,0 +1,134 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+
+#include "debug.h"
+
+#define MAX_STR_LEN 255
+
+enum ncclPluginType {
+  ncclPluginTypeNet,
+  ncclPluginTypeTuner,
+  ncclPluginTypeProfiler,
+};
+
+#define NUM_LIBS 3
+static void *libHandles[NUM_LIBS];
+static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
+static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" };
+static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
+static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
+
+static void* tryOpenLib(char* name, int* err, char* errStr) {
+  *err = 0;
+  if (nullptr == name || strlen(name) == 0) {
+    return nullptr;
+  }
+
+  if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
+    name = nullptr;
+  }
+
+  void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
+  if (nullptr == handle) {
+    strncpy(errStr, dlerror(), MAX_STR_LEN);
+    errStr[MAX_STR_LEN] = '\0';
+    // "handle" and "name" won't be NULL at the same time.
+    // coverity[var_deref_model]
+    if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
+      *err = ENOENT;
+    }
+  }
+  return handle;
+}
+
+static void appendNameToList(char* nameList, int *nameListLen, char* name) {
+  snprintf(nameList, *nameListLen, " %s", name);
+  nameList += strlen(name) + 1;
+  *nameListLen -= strlen(name) + 1;
+}
+
+static void* openPluginLib(enum ncclPluginType type, const char* libName) {
+  int openErr, len = PATH_MAX;
+  char libName_[MAX_STR_LEN] = { 0 };
+  char openErrStr[MAX_STR_LEN + 1] = { 0 };
+  char eNoEntNameList[PATH_MAX] = { 0 };
+
+  if (libName && strlen(libName)) {
+    snprintf(libName_, MAX_STR_LEN, "%s", libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+
+    snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+  } else {
+    snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+  }
+
+  if (strlen(eNoEntNameList)) {
+    INFO(subsys[type], "%s/Plugin: Could not find:%s. %s", pluginNames[type], eNoEntNameList, pluginFallback[type]);
+  } else if (strlen(pluginFallback[type])) {
+    INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]);
+  }
+  return nullptr;
+}
+
+void* ncclOpenNetPluginLib(const char* name) {
+  return openPluginLib(ncclPluginTypeNet, name);
+}
+
+void* ncclOpenTunerPluginLib(const char* name) {
+  return openPluginLib(ncclPluginTypeTuner, name);
+}
+
+void* ncclOpenProfilerPluginLib(const char* name) {
+  return openPluginLib(ncclPluginTypeProfiler, name);
+}
+
+void* ncclGetNetPluginLib(void) {
+  return libHandles[ncclPluginTypeNet];
+}
+
+ncclResult_t ncclClosePluginLib(void* handle) {
+  for (int l=0; l<NUM_LIBS; l++) {
+    if (libHandles[l] == handle) {
+      libHandles[l] = nullptr;
+    }
+  }
+  dlclose(handle);
+  return ncclSuccess;
+}
diff --git a/src/plugin/profiler.cc b/src/plugin/profiler.cc
--- a/src/plugin/profiler.cc
+++ b/src/plugin/profiler.cc
@@ -72,120 +24,6 @@
-static ncclResult_t ncclProfiler_v1_as_v2_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) {
-  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
-  eDescr_v1.type = eDescr->type;
-  eDescr_v1.parentObj = eDescr->parentObj;
-  eDescr_v1.rank = eDescr->rank;
-  switch(eDescr->type) {
-    case ncclProfileGroup: break;
-    case ncclProfileColl: {
-      eDescr_v1.coll.name = eDescr->coll.name;
-      eDescr_v1.coll.commHash = eDescr->coll.commHash;
-      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
-      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
-      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
-      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
-      eDescr_v1.coll.count = eDescr->coll.count;
-      eDescr_v1.coll.root = eDescr->coll.root;
-      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
-      eDescr_v1.coll.op = 0; // removed in v2
-      eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes;
-      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
-      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
-      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
-      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
-    } break;
-    case ncclProfileP2p: {
-      eDescr_v1.p2p.name = eDescr->p2p.name;
-      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
-      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
-      eDescr_v1.p2p.buff = eDescr->p2p.buff;
-      eDescr_v1.p2p.count = eDescr->p2p.count;
-      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype);
-      eDescr_v1.p2p.peer = eDescr->p2p.peer;
-    } break;
-    case ncclProfileProxyOp: {
-      eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid;
-      eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId;
-      eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer;
-      eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps;
-      eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
-      eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend;
-    } break;
-    case ncclProfileProxyStep: {
-      eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
-    } break;
-    case ncclProfileProxyCtrl: break;
-    default:;
-  }
-  return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
-}
-
-static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) {
-  ncclProfiler_v1->init(context, eActivationMask);
-  ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent;
-  ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent;
-  ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState;
-  ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize;
-  return ncclSuccess;
-}
 
 #define MAX_STR_LEN 256
 
-static void* tryOpenLib(char* name, int *err, char* errStr) {
-  if (nullptr == name || strlen(name) == 0) {
-    return nullptr;
-  }
-
-  if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
-    name = nullptr;
-  }
-
-  void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
-  if (nullptr == handle) {
-    strncpy(errStr, dlerror(), MAX_STR_LEN);
-    errStr[MAX_STR_LEN] = 0;
-    if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
-      *err = ENOENT;
-    }
-  }
-
-  return handle;
-}
-
-static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
-  if (openErr == ENOENT) {
-    snprintf(nameList, *nameListLen, " %s", name);
-    nameList += strlen(name) + 1;
-    *nameListLen -= strlen(name) + 1;
-    return nameList;
-  }
-  INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr);
-  return nameList;
-}
-
-static void* openProfilerPluginLib(char* couldNotFindNames, int len) {
-  int openErr;
-  void *pluginLib;
-  char profilerPluginLibName[PATH_MAX];
-  char openErrStr[MAX_STR_LEN + 1] = { 0 };
-
-  const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN");
-  if (envProfilerPluginName && strlen(envProfilerPluginName)) {
-    snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName);
-    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
-      return pluginLib;
-    }
-
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
-    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
-  } else {
-    snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so");
-    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
-  }
-
-  return nullptr;
-}
-
 enum {
   profilerPluginLoadFailed = -1,
   profilerPluginLoadReady = 0,
@@ -195,43 +33,31 @@ enum {
 static int profilerPluginStatus = profilerPluginLoadReady;
 static pid_t pid;
 
-#define MAX_PLUGIN_LOAD 2
-
 static ncclResult_t ncclProfilerPluginLoad(void) {
   if (profilerPluginLoadFailed == profilerPluginStatus) {
     return ncclSuccess;
   }
 
-  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
   pthread_mutex_lock(&profilerLock);
   if (profilerPluginLoadSuccess == profilerPluginStatus) {
     ++profilerPluginRefCount;
     goto exit;
   }
 
-  profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
+  profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN"));
   if (profilerPluginLib == nullptr) {
-    if (strlen(couldNotFindNames)) {
-      INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames);
-    }
    goto fail;
   }
 
-  ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2");
+  ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
   if (ncclProfiler == nullptr) {
-    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2.");
-    ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1");
-    if (ncclProfiler_v1 == nullptr) {
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1.");
-      goto fail;
-    } else {
-      ncclProfiler = &ncclProfiler_v1_as_v2;
-      ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name;
-      ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init;
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1.");
-    }
-  } else {
-    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2.");
+    ncclProfiler = getNcclProfiler_v2(profilerPluginLib);
+  }
+  if (ncclProfiler == NULL) {
+    ncclProfiler = getNcclProfiler_v1(profilerPluginLib);
+  }
+  if (ncclProfiler == NULL) {
+    goto fail;
   }
 
   ++profilerPluginRefCount;
@@ -247,7 +73,7 @@ exit:
   pthread_mutex_unlock(&profilerLock);
   return ncclSuccess;
 fail:
-  if (profilerPluginLib) dlclose(profilerPluginLib);
+  if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib));
   profilerPluginStatus = profilerPluginLoadFailed;
   goto exit;
 }
@@ -256,7 +82,7 @@ static ncclResult_t ncclProfilerPluginUnload(void) {
   pthread_mutex_lock(&profilerLock);
   if (0 == (--profilerPluginRefCount)) {
     INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name);
-    dlclose(profilerPluginLib);
+    NCCLCHECK(ncclClosePluginLib(profilerPluginLib));
     profilerPluginLib = nullptr;
     ncclProfiler = nullptr;
     profilerPluginStatus = profilerPluginLoadReady;
@@ -269,6 +95,11 @@ static ncclResult_t ncclProfilerPluginUnload(void) {
 #include "timer.h"
 
 #if ENABLE_TIMER
+// These counters are used to measure profiler overheads for different parts of the code.
+// These counters are only useful/meaningful in controlled test environments where there
+// is only one thread updating each set of counters, i.e., every communicator has its
+// own proxy thread and the network uses only one thread to make progress (this is true
+// for net_ib plugin but might not be true for net_socket plugin).
 static int64_t elapsedCount;
 static int64_t initCount, finalizeCount;
 static int64_t groupStartCount, groupStopCount;
@@ -324,15 +155,14 @@ static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2];
 #endif
 
-static int eActivationMask;      // Set by profiler
-static int eActivationMaskGroup; // Cached for current group
+int ncclProfilerEventMask;       // Set by profiler
 
 ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
   TIME_START_EVENT(elapsed);
   TIME_START_EVENT(init);
   ncclProfilerPluginLoad();
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask);
+    int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask);
     if (err) {
       WARN("Profiler init failed with error (%d). Continue without profiler.", err);
       ncclProfiler = NULL;
@@ -356,9 +186,29 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) {
 
 ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
   TIME_START_EVENT(groupStart);
-  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) {
+    // Check if any collective in the plan has a set event activation mask
+    struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+    struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+    int eActivationMask_ = 0;
+    while (ct) {
+      if (ct->eActivationMask) {
+        eActivationMask_ = ct->eActivationMask;
+        goto startGroup;
+      }
+      ct = ct->next;
+    }
+    // Check if any point-to-point task in the plan has a set event activation mask
+    while (pt) {
+      if (pt->eActivationMask) {
+        eActivationMask_ = pt->eActivationMask;
+        goto startGroup;
+      }
+      pt = pt->next;
+    }
+
+startGroup:
+    if (eActivationMask_ & (ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin)) {
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileGroup;
       ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr);
@@ -379,52 +229,63 @@ ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) {
 
 ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
   TIME_START_EVENT(taskStart);
-  if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);
-    if (plan->groupEventHandle && enable) {
-      struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
-      while (ct) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileColl;
-        eDescr.parentObj = plan->groupEventHandle;
-        eDescr.rank = plan->comm->rank;
-        eDescr.coll.name = plan->comm->commName;
-        eDescr.coll.commHash = plan->comm->commHash;
-        eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++;
-        eDescr.coll.func = ncclFuncToString(ct->func);
-        eDescr.coll.sendBuff = ct->sendbuff;
-        eDescr.coll.recvBuff = ct->recvbuff;
-        eDescr.coll.count = ct->count;
-        eDescr.coll.root = ct->root;
-        eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
-        eDescr.coll.trafficBytes = ct->trafficBytes;
-        eDescr.coll.nMaxChannels = ct->nMaxChannels;
-        eDescr.coll.nWarps = ct->nWarps;
-        eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
-        eDescr.coll.proto = ncclProtoToString(ct->protocol);
-        ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);
-
-        // update collective task with group event activation mask
-        ct->eActivationMask = eActivationMaskGroup;
-        ct = ct->next;
+  struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+  while (ct) {
+    if (__builtin_expect(ncclProfiler != NULL, 0)) {
+      if (plan->groupEventHandle) {
+        int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin);
+        if (enable) {
+          ncclProfilerEventDescr_t eDescr = { 0 };
+          eDescr.type = ncclProfileColl;
+          eDescr.parentObj = plan->groupEventHandle;
+          eDescr.rank = plan->comm->rank;
+          eDescr.coll.name = plan->comm->commName;
+          eDescr.coll.commHash = plan->comm->commHash;
+          eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func];
+          eDescr.coll.func = ncclFuncToString(ct->func);
+          eDescr.coll.sendBuff = ct->sendbuff;
+          eDescr.coll.recvBuff = ct->recvbuff;
+          eDescr.coll.count = ct->count;
+          eDescr.coll.root = ct->root;
+          eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
+          eDescr.coll.nMaxChannels = ct->nMaxChannels;
+          eDescr.coll.nWarps = ct->nWarps;
+          eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
+          eDescr.coll.proto = ncclProtoToString(ct->protocol);
+          ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);
+        }
       }
+    }
+    // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well.
+    // The test for "persistent" is a workaround for graph-captured collectives. In their case this function may not be
+    // consistently invoked on all the ranks, which would lead to mismatched counter values and thus false-positive
+    // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is
+    // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which
+    // gives the consistency.
+    if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle &&
+                              (ct->eActivationMask & ncclProfileKernelCh)))
+      plan->comm->seqNumber[ct->func]++;
+    ct = ct->next;
+  }
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (plan->groupEventHandle) {
       struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
       while (pt) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileP2p;
-        eDescr.parentObj = plan->groupEventHandle;
-        eDescr.rank = plan->comm->rank;
-        eDescr.p2p.name = plan->comm->commName;
-        eDescr.p2p.commHash = plan->comm->commHash;
-        eDescr.p2p.func = ncclFuncToString(pt->func);
-        eDescr.p2p.buff = pt->buff;
-        eDescr.p2p.count = pt->count;
-        eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
-        eDescr.p2p.peer = pt->root;
-        ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
-
-        // update collective task with group event activation mask
-        pt->eActivationMask = eActivationMaskGroup;
+        int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh);
+        if (enable) {
+          ncclProfilerEventDescr_t eDescr = { 0 };
+          eDescr.type = ncclProfileP2p;
+          eDescr.parentObj = plan->groupEventHandle;
+          eDescr.rank = plan->comm->rank;
+          eDescr.p2p.name = plan->comm->commName;
+          eDescr.p2p.commHash = plan->comm->commHash;
+          eDescr.p2p.func = ncclFuncToString(pt->func);
+          eDescr.p2p.buff = pt->buff;
+          eDescr.p2p.count = pt->count;
+          eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
+          eDescr.p2p.peer = pt->root;
+          ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
+        }
         pt = pt->next;
       }
     }
@@ -436,16 +297,15 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
   TIME_START_EVENT(taskStop);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);
-    if (plan->groupEventHandle && enable) {
+    if (plan->groupEventHandle) {
       struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
       while (ct) {
-        ncclProfiler->stopEvent(ct->eventHandle);
+        if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle);
         ct = ct->next;
       }
       struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
       while (pt) {
-        ncclProfiler->stopEvent(pt->eventHandle);
+        if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle);
         pt = pt->next;
       }
     }
@@ -463,7 +323,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
   TIME_START_EVENT(proxyOpStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) {
+    if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyOp;
       eDescr.parentObj = sub->taskEventHandle;
@@ -485,7 +345,7 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args
   TIME_START_EVENT(proxyOpStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) {
+    if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
      ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyOp;
       eDescr.parentObj = sub->taskEventHandle;
@@ -518,7 +378,7 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar
   TIME_START_EVENT(proxyStepStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
+    if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) {
       int step_ = DIVUP(stepId, args->sliceSteps);
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyStep;
@@ -536,7 +396,7 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar
   TIME_START_EVENT(proxyStepStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
+    if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) {
       int step_ = DIVUP(stepId, args->sliceSteps);
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyStep;
@@ -568,7 +428,7 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand
   TIME_START_EVENT(proxyCtrlStart);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
     // for proxy control events we allow profiling mode to change on a per event basis
-    int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
+    int eActivationMaskProxy = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
     if (eActivationMaskProxy & ncclProfileProxyCtrl) {
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyCtrl;
@@ -591,6 +451,30 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    struct ncclProxySubArgs* sub = &args->subs[s];
+    if (sub->eActivationMask & ncclProfileKernelCh) {
+      ncclProfilerEventDescr_t eDescr = { };
+      eDescr.type = ncclProfileKernelCh;
+      eDescr.parentObj = sub->taskEventHandle;
+      eDescr.kernelCh.channelId = sub->channelId;
+      ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr);
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) {
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    struct ncclProxySubArgs* sub = &args->subs[s];
+    if (sub->kernelEventHandle) {
+      ncclProfiler->stopEvent(sub->kernelEventHandle);
+    }
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
   TIME_START_EVENT(proxyOpRecord);
   struct ncclProxySubArgs* sub = &args->subs[s];
@@ -619,7 +503,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs*
 
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
   TIME_START_EVENT(proxyCtrlRecord);
-  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
+  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
     ncclProfilerEventStateArgs_t args = { };
     args.proxyCtrl.appendedProxyOps = appended;
     ncclProfiler->recordEventState(eHandle, eState, &args);
@@ -632,3 +516,47 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) {
   op->pid = pid;
   return ncclSuccess;
 }
+
+static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER;
+
+static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) {
+  ncclResult_t ret = ncclSuccess;
+  pthread_mutex_lock(&proxyProfilerConnectLock);
+  if (comm->profiler.initialized) goto exit;
+  for (int c = 0; c < MAXCHANNELS; c++) {
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit);
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.sendProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit);
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.recvProxyConn[c]), ret, exit);
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.recvProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit);
+  }
+  comm->profiler.initialized = true;
+exit:
+  pthread_mutex_unlock(&proxyProfilerConnectLock);
+  return ret;
+}
+
+bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) {
+  bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh));
+  if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op);
+  return enabled;
+}
+
+ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) {
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle;
+    if (type == 0) { // start
+      if (sub->eActivationMask & ncclProfileNetPlugin) {
+        ncclProfilerEventDescr_t eDescr = { 0 };
+        eDescr.type = ncclProfileNetPlugin;
+        eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS];
+        eDescr.rank = sub->rank;
+        eDescr.netPlugin.id = pluginId;
+        eDescr.netPlugin.data = extData;
+        ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr);
+      }
+    } else { // stop
+      ncclProfiler->stopEvent(*eHandle);
+    }
+  }
+  return ncclSuccess;
+}
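The chain above (`getNcclProfiler_v3` → `getNcclProfiler_v2` → `getNcclProfiler_v1`) is the whole version-negotiation story: each getter either returns a ready-to-use `ncclProfiler_t` or `NULL`. A condensed sketch of the same fallback as a standalone helper (hypothetical, merely mirroring what `ncclProfilerPluginLoad` above does, and relying on the declarations this patch introduces):

```
#include <stdlib.h>
// Hypothetical driver: open the library named by NCCL_PROFILER_PLUGIN and
// take the newest profiler ABI the plugin exports.
static ncclProfiler_t* loadAnyProfiler(void) {
  void* lib = ncclOpenProfilerPluginLib(getenv("NCCL_PROFILER_PLUGIN"));
  if (lib == NULL) return NULL;
  ncclProfiler_t* p = getNcclProfiler_v3(lib);
  if (p == NULL) p = getNcclProfiler_v2(lib);
  if (p == NULL) p = getNcclProfiler_v1(lib);
  return p;
}
```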
diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc
new file mode 100644
index 0000000..1397429
--- /dev/null
+++ b/src/plugin/profiler/profiler_v1.cc
@@ -0,0 +1,133 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+#include "checks.h"
+
+static ncclProfiler_t ncclProfiler;
+static ncclProfiler_v1_t* ncclProfiler_v1;
+
+static uint8_t ncclStringToFunc(const char* func) {
+  if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather;
+  if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce;
+  if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast;
+  if (0 == strcmp(func, "Recv")) return ncclFuncRecv;
+  if (0 == strcmp(func, "Reduce")) return ncclFuncReduce;
+  if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter;
+  if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv;
+  return ncclFuncSend;
+}
+
+static uint8_t ncclStringToAlgo(const char* algo) {
+  if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE;
+  if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING;
+  if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT;
+  if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN;
+  if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS;
+  if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE;
+  return NCCL_ALGO_PAT;
+}
+
+static uint8_t ncclStringToProto(const char* proto) {
+  if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL;
+  if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128;
+  return NCCL_PROTO_SIMPLE;
+}
+
+static uint8_t ncclStringToDatatype(const char* dt) {
+  if (0 == strcmp(dt, "ncclInt8")) return ncclInt8;
+  if (0 == strcmp(dt, "ncclInt32")) return ncclInt32;
+  if (0 == strcmp(dt, "ncclUint32")) return ncclUint32;
+  if (0 == strcmp(dt, "ncclInt64")) return ncclInt64;
+  if (0 == strcmp(dt, "ncclUint64")) return ncclUint64;
+  if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16;
+  if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16;
+#endif
+  return ncclFloat64;
+}
+
+static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
+  eDescr_v1.type = eDescr->type;
+  eDescr_v1.parentObj = eDescr->parentObj;
+  eDescr_v1.rank = eDescr->rank;
+  switch(eDescr->type) {
+    case ncclProfileGroup: break;
+    case ncclProfileColl: {
+      eDescr_v1.coll.name = eDescr->coll.name;
+      eDescr_v1.coll.commHash = eDescr->coll.commHash;
+      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
+      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
+      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
+      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
+      eDescr_v1.coll.count = eDescr->coll.count;
+      eDescr_v1.coll.root = eDescr->coll.root;
+      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
+      eDescr_v1.coll.op = 0; // removed in v2
+      eDescr_v1.coll.trafficBytes = 0; // removed in v3
+      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
+      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
+      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
+      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
+    } break;
+    case ncclProfileP2p: {
+      eDescr_v1.p2p.name = eDescr->p2p.name;
+      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
+      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
+      eDescr_v1.p2p.buff = eDescr->p2p.buff;
+      eDescr_v1.p2p.count = eDescr->p2p.count;
+      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype);
+      eDescr_v1.p2p.peer = eDescr->p2p.peer;
+    } break;
+    case ncclProfileProxyOp: {
+      eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid;
+      eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId;
+      eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer;
+      eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps;
+      eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
+      eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend;
+    } break;
+    case ncclProfileProxyStep: {
+      eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
+    } break;
+    case ncclProfileProxyCtrl: break;
+    case ncclProfileKernelCh:
+    case ncclProfileNetPlugin: {
+      *eHandle = NULL;
+      return ncclSuccess;
+    }
+    default:;
+  }
+  return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
+}
+
+static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
+  return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs);
+}
+
+static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
+  NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask));
+  ncclProfiler.startEvent = ncclProfiler_startEvent;
+  ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent;
+  ncclProfiler.recordEventState = ncclProfiler_recordEventState;
+  ncclProfiler.finalize = ncclProfiler_v1->finalize;
+  return ncclSuccess;
+}
+
+ncclProfiler_t* getNcclProfiler_v1(void* lib) {
+  ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(lib, "ncclProfiler_v1");
+  if (ncclProfiler_v1) {
+    ncclProfiler.name = ncclProfiler_v1->name;
+    ncclProfiler.init = ncclProfiler_init;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name);
+    return &ncclProfiler;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1.");
+  return NULL;
+}
diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc
new file mode 100644
index 0000000..3d00008
--- /dev/null
+++ b/src/plugin/profiler/profiler_v2.cc
@@ -0,0 +1,45 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+#include "checks.h"
+
+static ncclProfiler_t ncclProfiler;
+static ncclProfiler_v2_t* ncclProfiler_v2;
+
+static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) {
+    *eHandle = NULL;
+    return ncclSuccess;
+  }
+  return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr);
+}
+
+static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
+  return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs);
+}
+
+static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
+  NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask));
+  ncclProfiler.startEvent = ncclProfiler_startEvent;
+  ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent;
+  ncclProfiler.recordEventState = ncclProfiler_recordEventState;
+  ncclProfiler.finalize = ncclProfiler_v2->finalize;
+  return ncclSuccess;
+}
+
+ncclProfiler_t* getNcclProfiler_v2(void* lib) {
+  ncclProfiler_v2 = (ncclProfiler_v2_t*)dlsym(lib, "ncclProfiler_v2");
+  if (ncclProfiler_v2) {
+    ncclProfiler.name = ncclProfiler_v2->name;
+    ncclProfiler.init = ncclProfiler_init;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name);
+    return &ncclProfiler;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2");
+  return NULL;
+}
diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc
new file mode 100644
index 0000000..322bea5
--- /dev/null
+++ b/src/plugin/profiler/profiler_v3.cc
@@ -0,0 +1,20 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+
+static ncclProfiler_v3_t* ncclProfiler_v3;
+
+ncclProfiler_t* getNcclProfiler_v3(void* lib) {
+  ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3");
+  if (ncclProfiler_v3) {
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name);
+    return ncclProfiler_v3;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3");
+  return NULL;
+}
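Because `ncclTunerPluginLoad` in the file below falls back to the handle returned by `ncclGetNetPluginLib()`, a single shared library can provide both the net and tuner plugins just by exporting both versioned symbols. A bare-bones linking sketch (zero-initialized structs only to keep it short; a real plugin must populate every field):

```
#include "nccl_net.h"
#include "nccl_tuner.h"
// Sketch of a combined net+tuner plugin: both loaders will find their
// symbol in the same .so. Not a working plugin as-is.
extern "C" {
  ncclNet_v9_t ncclNetPlugin_v9 = {};
  ncclTuner_v4_t ncclTunerPlugin_v4 = {};
}
```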
diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc
new file mode 100644
index 0000000..443bf78
--- /dev/null
+++ b/src/plugin/tuner.cc
@@ -0,0 +1,99 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <pthread.h>
+
+#include "checks.h"
+#include "debug.h"
+#include "tuner.h"
+#include "plugin.h"
+
+extern ncclTuner_t* getNcclTuner_v2(void* lib);
+extern ncclTuner_t* getNcclTuner_v3(void* lib);
+extern ncclTuner_t* getNcclTuner_v4(void* lib);
+
+pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
+static int tunerPluginRefCount;
+static void* tunerPluginLib = nullptr;
+static ncclTuner_t* tunerSymbol = nullptr;
+
+enum {
+  tunerPluginLoadFailed = -1,
+  tunerPluginLoadReady = 0,
+  tunerPluginLoadSuccess = 1,
+};
+
+#define MAX_PLUGIN_LOAD 4
+
+static int status = tunerPluginLoadReady;
+
+ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
+  // Initialize to nullptr by default if plugin tuner cannot be loaded.
+  comm->tuner = nullptr;
+  if (tunerPluginLoadFailed == status) {
+    return ncclSuccess;
+  }
+
+  pthread_mutex_lock(&tunerPluginLock);
+  if (tunerPluginLoadFailed == status) {
+    goto exit;
+  }
+
+  if (tunerPluginLoadSuccess == status) {
+    comm->tuner = tunerSymbol;
+    ++tunerPluginRefCount;
+    goto exit;
+  }
+
+  tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN"));
+  if (nullptr == tunerPluginLib) {
+    tunerPluginLib = ncclGetNetPluginLib();
+    if (nullptr == tunerPluginLib) {
+      goto fail;
+    }
+  }
+
+  tunerSymbol = getNcclTuner_v4(tunerPluginLib);
+  if (tunerSymbol == NULL) {
+    tunerSymbol = getNcclTuner_v3(tunerPluginLib);
+  }
+  if (tunerSymbol == NULL) {
+    tunerSymbol = getNcclTuner_v2(tunerPluginLib);
+  }
+  if (tunerSymbol == NULL) {
+    goto fail;
+  }
+
+  comm->tuner = tunerSymbol;
+  ++tunerPluginRefCount;
+  status = tunerPluginLoadSuccess;
+  comm->tunerPluginLoaded = 1;
+
+exit:
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+fail:
+  tunerPluginLib = nullptr;
+  status = tunerPluginLoadFailed;
+  goto exit;
+}
+
+ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) {
+  pthread_mutex_lock(&tunerPluginLock);
+  if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) {
+    INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
+    NCCLCHECK(ncclClosePluginLib(tunerPluginLib));
+    tunerPluginLib = nullptr;
+    tunerSymbol = nullptr;
+    comm->tuner = nullptr;
+    status = tunerPluginLoadReady;
+    comm->tunerPluginLoaded = 0;
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+}
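The v2/v3 tuner shims in the files below adapt older plugins to the cost-table interface: `collCostTable` is a `[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]` matrix of predicted times, with `NCCL_ALGO_PROTO_IGNORE` marking unsupported combinations, and a tuner marks its pick by zeroing an entry. A minimal sketch of that convention (hypothetical helper, same "set time to 0" trick the v2 shim uses):

```
// A zero predicted cost makes [algo][proto] win the later selection,
// unless the entry is marked unsupported with NCCL_ALGO_PROTO_IGNORE.
static void forceAlgoProto(float** collCostTable, int algo, int proto) {
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  if (table[algo][proto] != NCCL_ALGO_PROTO_IGNORE) table[algo][proto] = 0.0f;
}
```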
diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc
new file mode 100644
index 0000000..005638f
--- /dev/null
+++ b/src/plugin/tuner/tuner_v2.cc
@@ -0,0 +1,66 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+
+#include "debug.h"
+#include "checks.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v2_t* ncclTuner_v2;
+static ncclTuner_t ncclTuner;
+
+static int hasNvlsSupport(float** collCostTable) {
+  // Requirements for support of different algorithms:
+  //
+  // - NVLS intra-node: nvlsSupport
+  // - NVLS intra+inter-node: collNetSupport
+  // - NVLSTree intra-node: always disabled
+  // - NVLSTree inter-node: nvlsSupport
+  // - Collnet* inter-node: collNetSupport
+  //
+  // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0;
+}
+
+static int hasCollNetSupport(float** collCostTable) {
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
+}
+
+static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
+  int algorithm = NCCL_ALGO_UNDEF;
+  int protocol = NCCL_PROTO_UNDEF;
+  int nvlsSupport = hasNvlsSupport(collCostTable);
+  int collNetSupport = hasCollNetSupport(collCostTable);
+  NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels));
+  // set time to 0 below to make sure this algorithm/protocol is selected later on
+  if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) {
+    float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+    if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
+  NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context));
+  ncclTuner.getCollInfo = ncclTuner_getCollInfo;
+  ncclTuner.destroy = ncclTuner_v2->destroy;
+  return ncclSuccess;
+}
+
+ncclTuner_t* getNcclTuner_v2(void* lib) {
+  ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2");
+  if (ncclTuner_v2) {
+    ncclTuner.name = ncclTuner_v2->name;
+    ncclTuner.init = ncclTuner_init;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name);
+    return &ncclTuner;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
+  return NULL;
+}
diff --git a/src/plugin/tuner/tuner_v3.cc b/src/plugin/tuner/tuner_v3.cc
new file mode 100644
index 0000000..3898243
--- /dev/null
+++ b/src/plugin/tuner/tuner_v3.cc
@@ -0,0 +1,38 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+
+#include "debug.h"
+#include "checks.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v3_t* ncclTuner_v3;
+static ncclTuner_t ncclTuner;
+
+static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) {
+  NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels));
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
+  NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context));
+  ncclTuner.getCollInfo = ncclTuner_getCollInfo;
+  ncclTuner.destroy = ncclTuner_v3->destroy;
+  return ncclSuccess;
+}
+
+ncclTuner_t* getNcclTuner_v3(void* lib) {
+  ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3");
+  if (ncclTuner_v3) {
+    ncclTuner.name = ncclTuner_v3->name;
+    ncclTuner.init = ncclTuner_init;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name);
+    return &ncclTuner;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
+  return NULL;
+}
diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc
new file mode 100644
index 0000000..4bfd116
--- /dev/null
+++ b/src/plugin/tuner/tuner_v4.cc
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+
+#include "debug.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v4_t* ncclTuner_v4;
+
+ncclTuner_t* getNcclTuner_v4(void* lib) {
+  ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4");
+  if (ncclTuner_v4) {
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name);
+    return ncclTuner_v4;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
+  return NULL;
+}
diff --git a/src/proxy.cc b/src/proxy.cc
index 5a83ef3..7e8021e 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -383,6 +383,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
   sub->pid = op->pid;
   sub->profilerContext = op->profilerContext;
   sub->ringAlgo = op->ringAlgo;
+  sub->workCounter = op->workCounter;
   args->nsubs = subIndex+1;
   if (subIndex) {
     if ((args->sliceSteps != op->sliceSteps) ||
@@ -532,6 +533,19 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
   return ncclSuccess;
 }
 
+static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) {
+  struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ? &comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId];
+  if (justInquire) *justInquire = true;
+  else {
+    op->sendbuff = (uint8_t *)comm->profiler.workStarted;
+    op->recvbuff = (uint8_t *)comm->profiler.workCompleted;
+    NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op));
+    // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter
+    op->workCounter += comm->profiler.workCounter[op->channelId];
+  }
+  return ncclSuccess;
+}
+
 static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
   if (peer < 0) return ncclSuccess;
 
@@ -612,20 +626,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
       // Run full algorithm to count the number of steps for each peer.
       ncclResult_t result = ncclSuccess;
      const ssize_t size = op->nbytes/comm->nRanks;
-      int last = 0;
-      int *nstepsSend = NULL, *nstepsRecv = NULL;
       const int rank = comm->rank, nranks = comm->nRanks;
-      PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks);
       NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up);
       NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up);
-      while (last == 0) {
-        int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
-        size_t inpIx, outIx;
-        algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
-        if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
-        if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
-      }
+      struct ncclPatStep ps;
+      do {
+        algo.getNextOp(&ps);
+        if (ps.flags & PatSkipped) continue;
+        if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++;
+        if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++;
+      } while (ps.last != 2);
       for (int i=0; i<log2Up(nranks); i++) {
@@ -652,20 +665,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
       // Run full algorithm to count the number of steps for each peer.
       ncclResult_t result = ncclSuccess;
       const ssize_t size = op->nbytes/comm->nRanks;
-      int last = 0;
-      int *nstepsSend = NULL, *nstepsRecv = NULL;
       const int rank = comm->rank, nranks = comm->nRanks;
-      PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks);
       NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down);
       NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down);
-      while (last == 0) {
-        int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
-        size_t inpIx, outIx;
-        algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
-        if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
-        if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
-      }
+      struct ncclPatStep ps;
+      do {
+        algo.getNextOp(&ps);
+        if (ps.flags & PatSkipped) continue;
+        if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++;
+        if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++;
+      } while (ps.last != 2);
       for (int i=0; i<log2Up(nranks); i++) {
@@ -684,6 +696,11 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
     case ncclPatternSend:
     case ncclPatternRecv: {
       if (op->root == comm->rank) return ncclSuccess;
       NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, 1, justInquire));
     } break;
+    case ncclPatternProfiler: {
+      if (ncclProfilerNeedsProxy(comm, op)) {
+        NCCLCHECK(SaveProxyProfiler(comm, op, justInquire));
+      }
+    } break;
   }
   return ncclSuccess;
 }
@@ -725,10 +742,10 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
   while (op) {
     if (op->state == ncclProxyOpNone) return ncclInternalError;
     TIME_START(0); TIME_START(1);
-    NCCLCHECK(op->progress(proxyState, op));
+    ncclResult_t ret = op->progress(proxyState, op);
     if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
     *idle &= op->idle;
-    if (op->state == ncclProxyOpNone) {
+    if (op->state == ncclProxyOpNone || ret != ncclSuccess) {
       TIME_START(2);
       NCCLCHECK(removeOp(state, &op, &prevOp));
       TIME_STOP(2);
@@ -910,7 +927,7 @@ void* ncclProxyProgress(void *proxyState_) {
       if (ret != ncclSuccess) {
         __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
         INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
-        continue;
+        break;
       }
       void* eHandle;
       ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
@@ -932,7 +949,7 @@ void* ncclProxyProgress(void *proxyState_) {
       }
     }
     lastIdle = idle;
-  } while (state->stop == 0 || (state->stop == 1 && state->active));
+  } while ((state->stop == 0 || (state->stop == 1 && state->active)) && __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0);
   return NULL;
 }
 
@@ -1140,6 +1157,7 @@ ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyCon
   }
 
   ncclIpcHdr hdr;
+  memset(&hdr, '\0', sizeof(hdr));
   hdr.type = type;
   hdr.rank = rank;
   hdr.reqSize = reqSize;
@@ -1323,9 +1341,12 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) {
     pthread_mutexattr_init(&mutexAttr);
     pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
     pthread_mutex_init(&pool->mutex, &mutexAttr);
+    pthread_mutexattr_destroy(&mutexAttr);
     pthread_condattr_t condAttr;
+    pthread_condattr_init(&condAttr);
     pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
     pthread_cond_init(&pool->cond, &condAttr);
+    pthread_condattr_destroy(&condAttr);
     state->opsPool = pool;
 
     memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1);
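The RAS changes below replace the periodically reallocated `rasClients` array with an intrusive doubly-linked list (`rasClientsHead`/`rasClientsTail`); the diff shows the insertion path in `getNewClientEntry`. The matching unlink that a terminate path would perform looks roughly like this (a sketch, assuming the same `next`/`prev` fields and head/tail globals; the actual terminate code is not shown in these hunks):

```
#include <stdlib.h>
// Unlink a client from the doubly-linked list and release it.
static void rasClientUnlink(struct rasClient* client) {
  if (client->prev) client->prev->next = client->next;
  else rasClientsHead = client->next;
  if (client->next) client->next->prev = client->prev;
  else rasClientsTail = client->prev;
  free(client);
}
```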
-// Used in rasAuxComm below. The values are bitmasks so that they can be combined.
+// Communicator status, used in rasAuxComm below. The values are bitmasks so that they can be combined.
 typedef enum {
-  RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator.
+  RAS_ACS_NOCOMM = 1, // Set if the peer claims not to be a member of a given communicator.
   RAS_ACS_INIT = 2,
   RAS_ACS_RUNNING = 4,
   RAS_ACS_FINALIZE = 8,
   RAS_ACS_ABORT = 16
 } rasACStatus;
 
-// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK).
+// Communicator errors, used in rasAuxComm below. The values are bitmasks so that they can be combined (with the
+// exception of RAS_ACE_OK).
 typedef enum {
   RAS_ACE_OK = 0,
   RAS_ACE_MISMATCH = 1,
@@ -53,22 +51,45 @@ typedef enum {
   RAS_ACE_INCOMPLETE = 4
 } rasACError;
 
-// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics
-// on the number of peers and nodes for a communicator.
+// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query. For each communicator, caches
+// statistics extracted from the results, such as the number of peers and nodes or the communicator status. Includes
+// a pointer to the communicator data in the results, making it easy to sort the communicators by a different key
+// without altering the results buffer, or just to iterate over the communicators, given that the communicator data
+// in the results is of variable length.
 struct rasAuxComm {
-  struct rasCollComms::comm* comm;
+  struct rasCollComms::comm* comm; // Points to the results buffer.
   int nPeers;
   int nNodes;
   int ranksPerNodeMin;
   int ranksPerNodeMax;
   unsigned int status; // Bitmask of rasACStatus values.
   unsigned int errors; // Bitmask of rasACError values.
-  uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against.
+  uint64_t firstCollOpCounts[NCCL_NUM_FUNCTIONS]; // collOpCounts of the first rank, to compare against.
+  int nIncompleteRanks; // Number of ranks that we didn't get any response from.
 };
 
+// Auxiliary structure used when processing the rasPeerInfo data stored in the global rasPeers array. Makes it possible
+// to extract a subset of peers (e.g., the dead ones), to sort by a different key without altering the original array,
+// and also has room for extracted temporary data such as the number of peers per node or the number of GPUs per peer.
+struct rasAuxPeerInfo {
+  struct rasPeerInfo* peer; // Points to an element in rasPeers.
+  int value;
+};
+
+// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query, specifically when iterating over
+// each communicator's ranks. Makes it possible to sort by a different key without altering the original array, and
+// also has room for extracted temporary data such as the rank's status or a count of collective operations.
+struct rasAuxCommRank {
+  struct rasCollComms::comm::rank* rank; // Points to the results buffer.
+  uint64_t value;
+};
+
+// The RAS client listening socket of this RAS thread (normally port 28028).
+int rasClientListeningSocket = -1;
+
 // Connected RAS clients.
-struct rasClient* rasClients;
-int nRasClients;
+struct rasClient* rasClientsHead;
+struct rasClient* rasClientsTail;
 
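
The `rasACStatus`/`rasACError` values above are bitmasks precisely so that the states reported by individual ranks can be OR-ed together and summarized afterwards. A self-contained sketch of that pattern, including the `__builtin_clz` highest-bit-to-index conversion used later in this file (the enum values mirror the diff; the driver is illustrative and relies on GCC/Clang builtins, as the file itself does):

```
#include <cstdio>

enum Status { NOCOMM = 1, INIT = 2, RUNNING = 4, FINALIZE = 8, ABORT = 16 };
static const char* const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" };

int main() {
  unsigned int status = 0;
  status |= RUNNING;    // One rank reports RUNNING...
  status |= FINALIZE;   // ...another reports FINALIZE.
  // More than one bit set means the ranks disagree -- a MISMATCH in RAS terms.
  printf("mismatch: %s\n", __builtin_popcount(status) > 1 ? "yes" : "no");
  // Highest set bit -> array index: FINALIZE is 8, i.e. bit 3, so statusStr[3].
  int idx = (int)(sizeof(unsigned int)*8 - 1) - __builtin_clz(status);
  printf("most advanced state: %s\n", statusStr[idx]);
  return 0;
}
```
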
 // Minimum byte count to increment the output buffer size by if it's too small.
 #define RAS_OUT_INCREMENT 4096
@@ -85,6 +106,7 @@ static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS
 // Still, 1024 should normally be plenty (verbose output may make things more difficult,
 // but we do check for overflows, so it will just be trimmed).
+
 static ncclResult_t getNewClientEntry(struct rasClient** pClient);
 static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen);
 static void rasClientTerminate(struct rasClient* client);
@@ -101,15 +123,13 @@ static void rasOutExtract(char* buffer);
 static int rasOutLength();
 static void rasOutReset();
-static int rasPeersNGpuCompare(const void* e1, const void* e2);
-static int rasPeersNProcsCompare(const void* e1, const void* e2);
-static int rasPeersHostPidCompare(const void* e1, const void* e2);
+static int rasAuxPeersValueCompare(const void* e1, const void* e2);
 static int ncclSocketsHostCompare(const void* p1, const void* p2);
 static int rasValCountsCompareRev(const void* p1, const void* p2);
 static int rasAuxCommsCompareRev(const void* p1, const void* p2);
-static int rasCommRanksPeerCompare(const void* p1, const void* p2);
-static int rasCommRanksCollOpCompare(const void* p1, const void* p2);
+static int rasAuxCommRanksValueCompare(const void* p1, const void* p2);
+
+static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size);
 static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size);
 static const char* ncclErrorToString(ncclResult_t err);
 static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size);
@@ -181,21 +201,20 @@ fail:
-// Returns the index of the first available entry in the rasClients array, enlarging the array if necessary.
+// Allocates a new rasClient structure, appends it to the list of connected clients, and returns it via pClient.
 static ncclResult_t getNewClientEntry(struct rasClient** pClient) {
   struct rasClient* client;
-  int i;
-  for (i = 0; i < nRasClients; i++)
-    if (rasClients[i].status == RAS_CLIENT_CLOSED)
-      break;
-  if (i == nRasClients) {
-    NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT));
-    nRasClients += RAS_INCREMENT;
-  }
-  client = rasClients+i;
-  memset(client, '\0', sizeof(*client));
+  NCCLCHECK(ncclCalloc(&client, 1));
+  client->sock = client->pfd = -1;
   ncclIntruQueueConstruct(&client->sendQ);
   client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT;
-  client->collIdx = -1;
+
+  if (rasClientsHead) {
+    rasClientsTail->next = client;
+    client->prev = rasClientsTail;
+    rasClientsTail = client;
+  } else {
+    rasClientsHead = rasClientsTail = client;
+  }
   *pClient = client;
   return ncclSuccess;
@@ -219,22 +238,32 @@ static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgL
   struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg));
   meta->offset = 0;
   meta->length = msgLen;
-  ncclIntruQueueEnqueue(&client->sendQ, meta);
-  assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED);
-  rasPfds[client->pfd].events |= POLLOUT;
+  if (client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED) {
+    ncclIntruQueueEnqueue(&client->sendQ, meta);
+    rasPfds[client->pfd].events |= POLLOUT;
+  } else {
+    INFO(NCCL_RAS, "RAS invalid client status %d -- internal error?", client->status);
+  }
 }
 
 // Terminates a connection with a RAS client.
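
The hunks above and below replace the index-based `rasClients` array with a doubly-linked list anchored at `rasClientsHead`/`rasClientsTail`: `getNewClientEntry` appends at the tail, and `rasClientTerminate` unlinks and frees. A stripped-down sketch of the same append/unlink discipline (the `Client` type and helper names are hypothetical):

```
#include <cstdio>

struct Client {
  Client* prev = nullptr;
  Client* next = nullptr;
  int id = 0;
};

static Client* head = nullptr;
static Client* tail = nullptr;

// Append at the tail, as getNewClientEntry does.
static Client* appendClient(int id) {
  Client* c = new Client;
  c->id = id;
  if (head) {
    tail->next = c;
    c->prev = tail;
    tail = c;
  } else {
    head = tail = c;
  }
  return c;
}

// Unlink and free, as rasClientTerminate does: fix up head/tail first,
// then the neighbors' pointers; the order of the four checks is safe for
// head, tail, middle, and sole-element cases alike.
static void removeClient(Client* c) {
  if (c == head) head = head->next;
  if (c == tail) tail = tail->prev;
  if (c->prev) c->prev->next = c->next;
  if (c->next) c->next->prev = c->prev;
  delete c;
}

int main() {
  Client* a = appendClient(1);
  Client* b = appendClient(2);
  appendClient(3);
  removeClient(b);  // remove from the middle
  removeClient(a);  // remove the head
  for (Client* c = head; c; c = c->next) printf("client %d\n", c->id);  // prints: client 3
  while (head) removeClient(head);
  return 0;
}
```
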
static void rasClientTerminate(struct rasClient* client) { (void)close(client->sock); - client->sock = -1; - client->status = RAS_CLIENT_CLOSED; rasPfds[client->pfd].fd = -1; rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; - client->pfd = -1; while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { free(meta); } + + if (client == rasClientsHead) + rasClientsHead = rasClientsHead->next; + if (client == rasClientsTail) + rasClientsTail = rasClientsTail->prev; + if (client->prev) + client->prev->next = client->next; + if (client->next) + client->next->prev = client->prev; + free(client); } @@ -245,16 +274,12 @@ static void rasClientTerminate(struct rasClient* client) { // Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and // reinvokes rasClientRun. ncclResult_t rasClientResume(struct rasCollective* coll) { - int collIdx = coll-rasCollectives; - int i; - struct rasClient* client = nullptr; - for (i = 0; i < nRasClients; i++) { - client = rasClients+i; - if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + struct rasClient* client; + + for (client = rasClientsHead; client; client = client->next) + if (client->coll == coll) break; - } - } - if (i == nRasClients) { + if (client == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching client!"); rasCollFree(coll); goto exit; @@ -266,8 +291,7 @@ exit: } // Handles a ready client FD from the main event loop. -void rasClientEventLoop(int clientIdx, int pollIdx) { - struct rasClient* client = rasClients+clientIdx; +void rasClientEventLoop(struct rasClient* client, int pollIdx) { bool closed = false; if (client->status == RAS_CLIENT_CONNECTED) { @@ -431,7 +455,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_CONNS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); #endif client->status = RAS_CLIENT_COMMS; @@ -440,7 +463,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_COMMS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); client->status = RAS_CLIENT_FINISHED; break; @@ -459,7 +481,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasPeerInfo* peersReSorted = nullptr; + struct rasAuxPeerInfo* auxRasPeers = nullptr; int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; int firstIdx, nPeers; @@ -467,6 +489,8 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { int nValCounts; static int cudaDriver = -1, cudaRuntime = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting"); + rasOutReset(); rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); @@ -481,7 +505,6 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; - rasOutReset(); totalGpus = totalNodes = 0; firstNGpusNode = 0; // #GPUs on the first peer of a node. firstNGpusGlobal = 0; // #GPUs on peerIdx 0. @@ -489,7 +512,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. 
consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. nPeers = 0; // #peers on a node. - firstNPeersGlobal = 0; + firstNPeersGlobal = 0; // #peers on the first node. for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); totalGpus += nGpus; @@ -522,6 +545,11 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } } // for (peerIdx) + TRACE(NCCL_RAS, "RAS: totalNodes %d, nRasPeers %d, totalGpus %d", totalNodes, nRasPeers, totalGpus); + TRACE(NCCL_RAS, "RAS: consistentNPeersGlobal %d, consistentNGpusGlobal %d, consistentNGpusNode %d", + consistentNPeersGlobal, consistentNGpusGlobal, consistentNGpusNode); + TRACE(NCCL_RAS, "RAS: firstNPeersGlobal %d, firstNGpusGlobal %d", firstNPeersGlobal, firstNGpusGlobal); + rasOutAppend("Job summary\n" "===========\n\n"); @@ -532,22 +560,24 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); } else { // Gather the stats on the number of processes per node. However, that number is not a property of a peer, - // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively - // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // but of a group of peers, so calculating it is more involved. We store the value in a temporary auxRasPeers + // array. + NCCLCHECKGOTO(ncclCalloc(&auxRasPeers, nRasPeers), ret, fail); firstIdx = 0; nPeers = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].peer = rasPeers+peerIdx; if (peerIdx == 0) { nPeers = 1; firstIdx = 0; } else { // peerIdx > 0 - if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + if (!ncclSocketsSameNode(&auxRasPeers[peerIdx].peer->addr, &auxRasPeers[peerIdx-1].peer->addr)) { + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < peerIdx; i++) { // Go back and update the number of processes of all the elements of that node. - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } nPeers = 1; firstIdx = peerIdx; @@ -557,21 +587,23 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } // peerIdx > 0 if (peerIdx == nRasPeers-1) { // Last iteration of the loop. + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < nRasPeers; i++) { - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } } } // for (peerIdx) - // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // Re-sort it now using the number of processes on the node (value) as the primary key, host IP as the // secondary, and process id as the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of peers per node. 
nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers;) { - if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { - valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; @@ -579,14 +611,15 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { valCounts[nValCounts-1].count++; } // Advance peerIdx to the next node. - peerIdx += peersReSorted[peerIdx].cudaDevs; - } + peerIdx += auxRasPeers[peerIdx].value; + } // for (peerIdx) // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent // number of peers first). qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); // Print it out, the most frequent peer counts first. if (consistentNGpusNode && consistentNGpusGlobal) { + // consistentNPeersGlobal must be false rasOutAppend(" Nodes Processes GPUs\n" " per node per process\n"); for (int i = 0; i < nValCounts; i++) { @@ -594,7 +627,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend("%7d %9ld %11d\n", vc->count, vc->value, firstNGpusGlobal); } - } else { + } else { // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend(" Nodes Processes\n" " per node\n"); for (int i = 0; i < nValCounts; i++) { @@ -606,24 +639,29 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). - // Sort peers by the GPU count, to simplify data extraction. - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // Sort peers by the GPU count, to simplify data extraction. Not sure how fast __builtin_popcountll is so we + // may just as well cache it... + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].value = __builtin_popcountll(auxRasPeers[peerIdx].peer->cudaDevs); + TRACE(NCCL_RAS, "RAS: node %s pid %d: nGpus %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), + auxRasPeers[peerIdx].peer->pid, auxRasPeers[peerIdx].value); + } // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of GPUs per peer. nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { - if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != - __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { - valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; } else { valCounts[nValCounts-1].count++; } - } + } // for (peerIdx) // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent // GPU counts first). 
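
Both distributions above follow the same recipe: sort the observations, run-length count adjacent equal values into `rasValCount`-style records, then re-sort those records by frequency, most frequent first. A sketch of the recipe on plain integers (all names and data are illustrative):

```
#include <algorithm>
#include <cstdio>
#include <vector>

struct ValCount { long value; int count; int firstIdx; };  // mirrors rasValCount

int main() {
  std::vector<long> vals = {8, 8, 4, 8, 4, 8};  // e.g., GPUs per process
  std::sort(vals.begin(), vals.end());
  // Run-length count adjacent equal values.
  std::vector<ValCount> vc;
  for (int i = 0; i < (int)vals.size(); i++) {
    if (i == 0 || vals[i] != vals[i-1]) vc.push_back({vals[i], 1, i});
    else vc.back().count++;
  }
  // Most frequent value first, as rasValCountsCompareRev arranges it.
  std::sort(vc.begin(), vc.end(),
            [](const ValCount& a, const ValCount& b) { return a.count > b.count; });
  for (const ValCount& v : vc)
    printf("%d processes have %ld GPUs\n", v.count, v.value);
  return 0;
}
```
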
qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); @@ -637,7 +675,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend(" %9d %11ld\n", vc->count, vc->value); } - } + } // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend("\n" " Nodes Processes GPUs\n" "(total) (total) (total)\n" @@ -652,16 +690,16 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // provided that they meet our definition of an outlier. if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); - // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // auxRasPeers is sorted by the node IP address (not port!) as the secondary key and the pid as // the tertiary, which comes in handy when printing... for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { lineBuf[0] = '\0'; for (int j = 0; j < vc->value; j++) { snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (j > 0 ? "," : ""), peersReSorted[j].pid); + (j > 0 ? "," : ""), auxRasPeers[j].peer->pid); } rasOutAppend(" Node %s running process%s %s\n", - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), (vc->value > 1 ? "es" : ""), lineBuf); } // for (peerIdx) } // if (rasCountIsOutlier(vc->count)) @@ -678,13 +716,12 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_CONNS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; // We need to wait for async. responses. 
   }
@@ -696,18 +733,18 @@
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
   {
-    struct rasCollRequest collReq;
+    struct rasCollRequest collReq = {};
     bool allDone = false;
     rasCollReqInit(&collReq);
     collReq.timeout = client->timeout;
     collReq.type = RAS_COLL_COMMS;
-    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx),
-                  ret, fail);
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail);
     if (!allDone) ret = ncclInProgress;
   }
+  TRACE(NCCL_RAS, "RAS: rasClientRunInit: scheduling RAS_COLL_COMMS and finishing");
 exit:
-  free(peersReSorted);
+  free(auxRasPeers);
   return ret;
 fail:
   goto exit;
 }
@@ -721,13 +758,16 @@
   ncclResult_t ret = ncclSuccess;
   char* msg = nullptr;
   int msgLen;
-  struct rasCollective* coll = rasCollectives+client->collIdx;
-  struct rasCollConns* connsData = (struct rasCollConns*)coll->data;
+  struct rasCollective* coll = client->coll;
   int expected;
   struct rasPeerInfo* peersBuf = nullptr;
 
-  assert(coll->nFwdSent == coll->nFwdRecv);
-  client->collIdx = -1;
+  if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) {
+    INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status);
+    return ncclInternalError;
+  }
+  // Dereference coll->data only after the validity check above.
+  struct rasCollConns* connsData = (struct rasCollConns*)coll->data;
+  client->coll = nullptr;
 
   rasOutReset();
   rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9);
@@ -822,13 +862,12 @@
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
   {
-    struct rasCollRequest collReq;
+    struct rasCollRequest collReq = {};
     bool allDone = false;
     rasCollReqInit(&collReq);
     collReq.timeout = client->timeout;
     collReq.type = RAS_COLL_COMMS;
-    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx),
-                  ret, fail);
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail);
     if (!allDone) ret = ncclInProgress;
   }
@@ -847,10 +886,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) {
   ncclResult_t ret = ncclSuccess;
   char* msg = nullptr;
   int msgLen;
-  struct rasCollective* coll = rasCollectives+client->collIdx;
-  struct rasCollComms* commsData = (struct rasCollComms*)coll->data;
+  struct rasCollective* coll = client->coll;
   struct rasCollComms::comm* comm;
-  struct rasCollComms::comm::rank* ranksReSorted = nullptr;
+  struct rasAuxCommRank* auxCommRanks = nullptr;
   struct rasValCount* valCounts = nullptr;
   int nValCounts;
   struct rasValCount* collOpCounts = nullptr;
@@ -860,7 +899,7 @@
   int vcIdx;
   int nPeersMissing;
   uint64_t* peerNvmlDevs = nullptr;
-  const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" };
+  const char*const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" };
   const char*const errorStr[] = {
     // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer.
"OK", @@ -873,14 +912,22 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { "INCOMPLETE,ERROR,MISMATCH" }; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunComms: starting"); + TRACE(NCCL_RAS, "RAS: coll nLegTimeouts %d, nPeers %d, nData %d; commsData nComms %d", + coll->nLegTimeouts, coll->nPeers, coll->nData, commsData->nComms); + + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); // Calculate the number of missing peers early as we rely on it for other things. nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + TRACE(NCCL_RAS, "RAS: nRasPeers %d, nRasDeadPeers %d, nPeersMissing %d", nRasPeers, nRasDeadPeers, nPeersMissing); // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort @@ -896,12 +943,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComms[commIdx].comm = comm; comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); } - NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxCommRanks, maxCommSize), ret, fail); + TRACE(NCCL_RAS, "RAS: maxCommSize %d", maxCommSize); // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); - for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) { peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + TRACE(NCCL_RAS, "RAS: coll peers[%d] -> rasPeers[%d]", peerIdx, peerIdxConv[peerIdx]); + } // Sort coll->peers to match the ordering of rasPeers -- we may need it later... qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); @@ -910,42 +960,75 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { struct rasAuxComm* auxComm = auxComms+commIdx; int nRanks = 0; comm = auxComm->comm; + TRACE(NCCL_RAS, "RAS: coll comms[%d]: commId (0x%lx, 0x%lx, 0x%lx), commNRanks %d, nRanks %d, nMissingRanks %d", + commIdx, comm->commId.commHash, comm->commId.hostHash, comm->commId.pidHash, + comm->commNRanks, comm->nRanks, comm->nMissingRanks); - if (comm->commNRanks > comm->nRanks) { + if (comm->nMissingRanks > 0) { // There are two possibilities here. Either we are missing the data on some ranks because the processes are // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which - // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we - // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. - // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this - // as an INCOMPLETE error; otherwise as a MISMATCH warning. - if (nPeersMissing > 0 || nRasDeadPeers > 0) - auxComm->errors |= RAS_ACE_INCOMPLETE; - else { + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). 
+      if (nPeersMissing == 0 && nRasDeadPeers == 0) {
+        // We received data from _all_ processes. That's an easy case.
         auxComm->errors |= RAS_ACE_MISMATCH;
-        auxComm->status |= RAS_ACS_UNKNOWN;
-      }
-    }
+        auxComm->status |= RAS_ACS_NOCOMM;
+      } else {
+        // We failed to receive data from some processes but we don't know if that's why we don't have the info about
+        // some ranks of this communicator. We need to check all the missing ranks one-by-one as different ranks may
+        // have different reasons.
+        struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks);
 
-    memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted));
-    // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted
-    // by process _and_ node, which makes counting easy.
-    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++)
-      ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx];
-    qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare);
+        for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) {
+          struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx;
+          void* found;
+          if ((found = bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers),
+                               ncclSocketsCompare)) != nullptr) {
+            // We did receive the data from that process, but not about this communicator.
+            auxComm->errors |= RAS_ACE_MISMATCH;
+            auxComm->status |= RAS_ACS_NOCOMM;
+          } else {
+            // We failed to receive data from that process.
+            auxComm->errors |= RAS_ACE_INCOMPLETE;
+            auxComm->nIncompleteRanks++;
+          }
+          TRACE(NCCL_RAS, "RAS: comm missingRank[%d] commRank %d, addr %td (-> %d), cudaDev %d, nvmlDev %d",
+                rankIdx, missingRank->commRank, (found ? ((union ncclSocketAddress*)found) - coll->peers : -1),
+                rasPeerFind(&missingRank->addr), missingRank->cudaDev, missingRank->nvmlDev);
+        } // for (rankIdx)
+      } // nPeersMissing > 0 || nRasDeadPeers > 0
+    } // if (comm->nMissingRanks > 0)
+
+    // Initialize auxCommRanks from comm->ranks, converting peerIdx to rasPeers, then sort by it -- that way we will
+    // have the ranks sorted by node and process, which makes counting easy.
+    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+      struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx;
+      auxCommRanks[rankIdx].rank = rank;
+      auxCommRanks[rankIdx].value = peerIdxConv[rank->peerIdx];
+      TRACE(NCCL_RAS, "RAS: comm rank[%d] commRank %d, peerIdx %d (-> %d), cudaDev %d, nvmlDev %d",
+            rankIdx, rank->commRank, rank->peerIdx, peerIdxConv[rank->peerIdx], rank->cudaDev, rank->nvmlDev);
+      TRACE(NCCL_RAS, "RAS: comm rank[%d] collOpCounts (%ld, %ld, %ld, %ld, %ld)",
+            rankIdx, rank->collOpCounts[0], rank->collOpCounts[1], rank->collOpCounts[2], rank->collOpCounts[3],
+            rank->collOpCounts[4]);
+      TRACE(NCCL_RAS, "RAS: comm rank[%d] status initState %d, asyncError %d, finalizeCalled %d, destroyFlag %d, "
+            "abortFlag %d", rankIdx, rank->status.initState, rank->status.asyncError, rank->status.finalizeCalled,
+            rank->status.destroyFlag, rank->status.abortFlag);
+    }
+    // This also sorts by the commRank, which we don't care about here, but it won't hurt.
+    qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare);
 
     // Count the peers and nodes, get the status/error indicators.
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; if (rankIdx == 0) { auxComm->nPeers = auxComm->nNodes = 1; auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; auxComm->ranksPerNodeMax = 0; - auxComm->firstCollOpCount = rank->collOpCount; + memcpy(auxComm->firstCollOpCounts, auxRank->rank->collOpCounts, sizeof(auxComm->firstCollOpCounts)); nRanks = 1; } else { // rankIdx > 0 - if (rank->peerIdx != rank[-1].peerIdx) { + if (auxRank->value != auxRank[-1].value) { auxComm->nPeers++; - if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + if (!ncclSocketsSameNode(&rasPeers[auxRank->value].addr, &rasPeers[auxRank[-1].value].addr)) { auxComm->nNodes++; if (auxComm->ranksPerNodeMin > nRanks) auxComm->ranksPerNodeMin = nRanks; @@ -953,7 +1036,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; nRanks = 0; } - } // if (rank->peerIdx != rank[-1].peerIdx) + } // if (auxRank->value != auxRank[-1].value) nRanks++; } // rankIdx > 0 if (rankIdx == comm->nRanks-1) { @@ -964,25 +1047,27 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; } - if (rank->status.abortFlag) + if (auxRank->rank->status.abortFlag) auxComm->status |= RAS_ACS_ABORT; - else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + else if (auxRank->rank->status.finalizeCalled || auxRank->rank->status.destroyFlag) { // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly // as a finalize state indicator (and ignore it in case of ncclCommAbort). auxComm->status |= RAS_ACS_FINALIZE; } - else if (rank->status.initState == ncclSuccess) + else if (auxRank->rank->status.initState == ncclSuccess) auxComm->status |= RAS_ACS_RUNNING; - else // rank->initState != ncclSuccess + else // auxRank->rank->initState != ncclSuccess auxComm->status |= RAS_ACS_INIT; - if (rank->collOpCount != auxComm->firstCollOpCount) - auxComm->errors |= RAS_ACE_MISMATCH; - if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS && !(auxComm->errors & RAS_ACE_MISMATCH); collIdx++) { + if (auxRank->rank->collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) + auxComm->errors |= RAS_ACE_MISMATCH; + } + if (auxRank->rank->status.initState != ncclSuccess && auxRank->rank->status.initState != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; - if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + if (auxRank->rank->status.asyncError != ncclSuccess && auxRank->rank->status.asyncError != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; } // for (rankIdx) @@ -990,9 +1075,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // We've got a status mismatch between ranks. 
auxComm->errors |= RAS_ACE_MISMATCH; } + TRACE(NCCL_RAS, "RAS: auxComm nPeers %d, nNodes %d, nIncompleteRanks %d", + auxComm->nPeers, auxComm->nNodes, auxComm->nIncompleteRanks); + TRACE(NCCL_RAS, "RAS: auxComm ranksPerNodeMin %d, ranksPerNodeMax %d, status 0x%x, errors 0x%x", + auxComm->ranksPerNodeMin, auxComm->ranksPerNodeMax, auxComm->status, auxComm->errors); } // for (commIdx) // Sort it by size/nNodes/status/errors/missing ranks. - qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + if (auxComms) + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); // Calculate the distribution of different communicator sizes. NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); @@ -1014,10 +1104,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } } - rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" - " # in group per comm per node per comm in group\n"); - if (commsData->nComms == 0) + TRACE(NCCL_RAS, "RAS: rasClientRunComms: done with initial data processing"); + + if (commsData->nComms > 0) { + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + } else { rasOutAppend("No communicator data collected!\n"); + } // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); @@ -1058,6 +1152,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // status (which is a bitmask) into an array index. statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); } + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("\nErrors\n" "======\n\n"); @@ -1068,12 +1167,12 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (rasCountIsOutlier(nPeersMissing, client->verbose)) { // Extract a list of missing peers. We don't want to print it right away because it would be sorted // by address (including port, which isn't meaningful to end users). - struct rasPeerInfo* peersBuf = nullptr; + struct rasAuxPeerInfo* auxPeersBuf = nullptr; int nPeersBuf; // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing // them much easier. - NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nPeersMissing), ret, fail); nPeersBuf = 0; for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { int cmp; @@ -1088,30 +1187,42 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } else if (cmp < 0) { // Process missing from coll->peers. Don't report dead ones though, as they are not included // in nPeersMissing and are reported separately below. 
- if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { - assert(nPeersBuf < nPeersMissing); - memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + bool dead; + if (!(dead = rasPeerIsDead(&rasPeers[rasPeerIdx].addr))) { + if (nPeersBuf < nPeersMissing) { + auxPeersBuf[nPeersBuf++].peer = rasPeers+rasPeerIdx; + } else { + INFO(NCCL_RAS, "RAS overflow of auxPeersBuf: nPeersBuf %d, rasPeerIdx %d (%s), collPeerIdx %d -- " + "internal error?", + nPeersBuf, rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), collPeerIdx); + } } + TRACE(NCCL_RAS, "RAS rasPeerIdx %d (%s) is missing from coll->peers; dead %d", + rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), dead); rasPeerIdx++; } else { // cmp > 0 // Process not found in rasPeers -- shouldn't happen, unless during a race? + INFO(NCCL_RAS, "RAS failed to find coll->peer[%d] (%s) in rasPeers -- internal error?", + collPeerIdx, ncclSocketToString(coll->peers+collPeerIdx, rasLine)); collPeerIdx++; } // cmp > 0 } // for (rasPeerIdx, collPeerIdx) - // Sort the output by host and pid. - qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + // Sort the output by host and pid. rasAuxPeersValueCompare uses value as the primary key, which is 0 for + // all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, - ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } if (nPeersBuf != nPeersMissing) rasOutAppend(" [could not find information on %d process%s]\n", nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? "es" : "")); - free(peersBuf); + free(auxPeersBuf); } // if (rasCountIsOutlier(nPeersMissing)) rasOutAppend("\n"); } @@ -1121,31 +1232,35 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, (nRasDeadPeers > 1 ? "es are" : " is")); if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { - struct rasPeerInfo* peersReSorted = nullptr; - int nPeersReSorted = 0; - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + // rasDeadPeers contains only addresses, whereas we want a complete rasPeerInfo, and sorted differently. + struct rasAuxPeerInfo* auxPeersBuf = nullptr; + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nRasDeadPeers), ret, fail); for (int i = 0; i < nRasDeadPeers; i++) { int peerIdx = rasPeerFind(rasDeadPeers+i); if (peerIdx != -1) - memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + auxPeersBuf[nPeersBuf++].peer = rasPeers+peerIdx; } - // Sort the output by host and pid, not host and port. 
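
The missing-peer detection above is a linear merge over two arrays sorted by the same key, `rasPeers` and `coll->peers`. Reduced to two sorted integer vectors, the control flow looks roughly like this (a sketch, not NCCL code):

```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> all = {1, 2, 3, 5, 8};  // stands in for rasPeers (sorted)
  std::vector<int> got = {1, 3, 8};        // stands in for coll->peers (sorted)
  size_t i = 0, j = 0;
  while (i < all.size() || j < got.size()) {
    int cmp;
    if (i < all.size() && j < got.size())
      cmp = all[i] < got[j] ? -1 : (all[i] > got[j] ? 1 : 0);
    else
      cmp = i < all.size() ? -1 : 1;       // one side is exhausted
    if (cmp == 0) { i++; j++; }            // peer responded
    else if (cmp < 0) { printf("missing: %d\n", all[i]); i++; }
    else { j++; }                          // response from an unknown peer; shouldn't happen
  }
  return 0;
}
```
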
- qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); - for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + // Sort the output by host and pid, not host and port. rasAuxPeersValueCompare uses value as the primary key, + // which is 0 for all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } - if (nPeersReSorted != nRasDeadPeers) + if (nPeersBuf != nRasDeadPeers) rasOutAppend(" [could not find information on %d process%s]\n", - nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); - free(peersReSorted); + nRasDeadPeers-nPeersBuf, (nRasDeadPeers-nPeersBuf > 1 ? "es" : "")); + free(auxPeersBuf); } // if (rasCountIsOutlier(nRasDeadPeers) rasOutAppend("\n"); } + // Continue printing the largest communicators first, as in the summary table. for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc; vc = valCounts+vcIdx; @@ -1154,23 +1269,28 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { comm = auxComm->comm; if (auxComm->errors & RAS_ACE_INCOMPLETE) { - int nRanksMissing = comm->commNRanks - comm->nRanks; rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, - comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); - if (rasCountIsOutlier(nRanksMissing, client->verbose)) { - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + comm->commId.commHash, auxComm->nIncompleteRanks, (auxComm->nIncompleteRanks > 1 ? "s" : "")); + if (rasCountIsOutlier(auxComm->nIncompleteRanks, client->verbose)) { + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that provided a response but not for this communicator. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), ncclSocketsCompare) == + nullptr) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } + } // if rank did not respond + } // for (rankIdx) } // if (rasCountIsOutlier(nRanksMissing)) rasOutAppend("\n"); } // if (auxComm->errors & RAS_ACE_INCOMPLETE) @@ -1178,7 +1298,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (auxComm->errors & RAS_ACE_ERROR) { int ncclErrors[ncclNumResults]; int nErrors; - rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); memset(ncclErrors, '\0', sizeof(ncclErrors)); for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) @@ -1203,6 +1323,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } // if (auxComm->errors & RAS_ACE_ERROR) } // for (commIdx) } // for (vcIdx) + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("Warnings\n" "========\n\n"); @@ -1213,15 +1338,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); } + // Continue printing the largest communicators first, as in the summary table. for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc = valCounts+vcIdx; for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { - bool inconsistent; struct rasAuxComm* auxComm = auxComms+commIdx; comm = auxComm->comm; if (auxComm->errors & RAS_ACE_MISMATCH) { - rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); if (collOpCounts == nullptr) { // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts @@ -1234,28 +1359,31 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" Communicator ranks have different status\n"); // We need to sort the ranks by status. However, status is normally calculated from other fields. - // We will copy the ranks and reuse collOpCount to store it. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // We will store it in the auxCommRanks' value. 
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; if (rank->status.abortFlag) - rank->collOpCount = RAS_ACS_ABORT; + auxRank->value = RAS_ACS_ABORT; else if (rank->status.finalizeCalled || rank->status.destroyFlag) - rank->collOpCount = RAS_ACS_FINALIZE; + auxRank->value = RAS_ACS_FINALIZE; else if (rank->status.initState == ncclSuccess) - rank->collOpCount = RAS_ACS_RUNNING; + auxRank->value = RAS_ACS_RUNNING; else - rank->collOpCount = RAS_ACS_INIT; + auxRank->value = RAS_ACS_INIT; } - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Calculate the frequency of different status values. int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the - // status (which is a bitmask) into an array index. - collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + // status (which is a bitmask) into an array index. The argument is an unsigned int (there is no + // 64-bit version seemingly, but we don't actually need one here). + collOpCounts[nCollOpCounts].value = + (sizeof(unsigned int)*8-1) - __builtin_clz((unsigned int)auxCommRanks[rankIdx].value); collOpCounts[nCollOpCounts].count = 1; collOpCounts[nCollOpCounts].firstIdx = rankIdx; nCollOpCounts++; @@ -1263,11 +1391,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { collOpCounts[nCollOpCounts-1].count++; } } - if (comm->nRanks < comm->commNRanks) { - // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" - // string at index 0. - collOpCounts[nCollOpCounts].value = 0; - collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + if (comm->nMissingRanks - auxComm->nIncompleteRanks > 0) { + // Add a "fake" element corresponding to the NOCOMM entries, since they are not in the ranks array. + collOpCounts[nCollOpCounts].value = 0; // The index of "NOCOMM" in statusStr. + collOpCounts[nCollOpCounts].count = comm->nMissingRanks - auxComm->nIncompleteRanks; collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. nCollOpCounts++; } @@ -1280,114 +1407,159 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { if (vcc->firstIdx != -1) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
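
The `rasAuxCommRank` records used above let the code sort cheap `{pointer, key}` pairs instead of shuffling the variable-length results buffer itself. A compact sketch of the idiom with a hypothetical `Rank` type, using `commRank` as the tie-breaking secondary key just as `rasAuxCommRanksValueCompare` does:

```
#include <cstdio>
#include <cstdlib>

struct Rank { int commRank; unsigned long collOpCount; };
struct AuxRank { const Rank* rank; unsigned long value; };

static int auxCompare(const void* p1, const void* p2) {
  const AuxRank* a = (const AuxRank*)p1;
  const AuxRank* b = (const AuxRank*)p2;
  if (a->value != b->value) return a->value < b->value ? -1 : 1;
  // commRank as the secondary key keeps the output ordered for printing.
  return a->rank->commRank < b->rank->commRank ? -1 :
         (a->rank->commRank > b->rank->commRank ? 1 : 0);
}

int main() {
  Rank ranks[] = {{0, 42}, {1, 41}, {2, 42}, {3, 40}};
  AuxRank aux[4];
  // The underlying ranks array is never moved; only the aux records are sorted.
  for (int i = 0; i < 4; i++) aux[i] = {&ranks[i], ranks[i].collOpCount};
  qsort(aux, 4, sizeof(aux[0]), auxCompare);
  for (int i = 0; i < 4; i++)
    printf("rank %d: %lu ops\n", aux[i].rank->commRank, aux[i].value);
  return 0;
}
```
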
for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; if (peerIdx != -1) { if (vcc->count > 1) rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); else rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value], - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value], + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); } else { // peerIdx == -1 if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); else rasOutAppend(" Rank %d has status %s -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value]); } // peerIdx == -1 } // for (rankIdx) } else { - // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - if (vcc->count > 1) { - rasOutAppend(" The unknown ranks: %s\n", lineBuf); - } else { - rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); - } - } + // NOCOMM ranks are in a different array. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks + + comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that did not respond at all. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, statusStr[vcc->value], + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } else { + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + missingRank->commRank, statusStr[vcc->value]); + } + } // peerIdx == -1 + } // if rank responded + } // for (rankIdx) + } // vcc->firstIdx == -1 } // if (rasCountIsOutlier(vcc->count)) } // for (coc) } // if (__builtin_popcount(auxComm->status) > 1) - inconsistent = false; - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { - inconsistent = true; - break; - } - } - if (inconsistent) { - rasOutAppend(" Communicator ranks have different collective operation counts\n"); + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS; collIdx++) { + bool inconsistent = false; - // Sort the ranks by collOpCount and rank for easy counting. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); - // Calculate the frequency of different collOpCount values. - int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { - collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; - collOpCounts[nCollOpCounts].count = 1; - collOpCounts[nCollOpCounts].firstIdx = rankIdx; - nCollOpCounts++; - } else { - collOpCounts[nCollOpCounts-1].count++; + if (comm->ranks[rankIdx].collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) { + inconsistent = true; + break; } } - // Sort by that frequency (most frequent first). - qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); - for (int coc = 0; coc < nCollOpCounts; coc++) { - struct rasValCount* vcc = collOpCounts+coc; - if (vcc->count > 1) - rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); - if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... 
- for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - if (peerIdx != -1) { - if (vcc->count > 1) - rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, vcc->value, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - } else { // peerIdx == -1 - if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, vcc->value); - } // peerIdx == -1 - } // for (rankIdx) - } // if (rasCountIsOutlier(vcc->count)) - } // for (coc) - } // if (inconsistent) - rasOutAppend("\n"); + if (inconsistent) { + rasOutAppend(" Communicator ranks have different %s operation counts\n", ncclFuncStr[collIdx]); + + // Sort the ranks by collOpCounts[collIdx] and commRank for easy counting. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; + auxRank->value = rank->collOpCounts[collIdx]; + } + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); + // Calculate the frequency of different collOpCounts[collIdx] values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { + collOpCounts[nCollOpCounts].value = auxCommRanks[rankIdx].value; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) { + if (vcc->value > 0) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + else + rasOutAppend(" %d ranks have not launched any operations\n", vcc->count); + } + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
+          for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) {
+            int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx];
+            if (peerIdx != -1) {
+              if (vcc->count > 1) {
+                rasOutAppend("  Rank %d -- GPU %s managed by process %d on node %s\n",
+                             auxCommRanks[rankIdx].rank->commRank,
+                             rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                             rasPeers[peerIdx].pid,
+                             ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+              } else {
+                if (vcc->value > 0) {
+                  rasOutAppend("  Rank %d has launched up to operation %ld -- GPU %s managed by process %d "
+                               "on node %s\n", auxCommRanks[rankIdx].rank->commRank, vcc->value,
+                               rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                               rasPeers[peerIdx].pid,
+                               ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                } else {
+                  rasOutAppend("  Rank %d has not launched any operations -- GPU %s managed by process %d "
+                               "on node %s\n", auxCommRanks[rankIdx].rank->commRank,
+                               rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                               rasPeers[peerIdx].pid,
+                               ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                }
+              }
+            } else { // peerIdx == -1
+              if (vcc->count > 1) {
+                rasOutAppend("  Rank %d -- [process information not found]\n",
+                             auxCommRanks[rankIdx].rank->commRank);
+              } else {
+                if (vcc->value > 0)
+                  rasOutAppend("  Rank %d has launched up to operation %ld -- [process information not found]\n",
+                               auxCommRanks[rankIdx].rank->commRank, vcc->value);
+                else
+                  rasOutAppend("  Rank %d has not launched any operations -- [process information not found]\n",
+                               auxCommRanks[rankIdx].rank->commRank);
+              }
+            } // peerIdx == -1
+          } // for (rankIdx)
+        } // if (rasCountIsOutlier(vcc->count))
+      } // for (coc)
+      rasOutAppend("\n");
+    } // if (inconsistent)
+  } // for (collIdx)
     } // if (auxComm->errors & RAS_ACE_MISMATCH)
   } // for (commIdx)
 } // for (vcIdx)
@@ -1398,20 +1570,26 @@
   rasOutExtract(msg);
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
+
+  TRACE(NCCL_RAS, "RAS: rasClientRunComms: finishing");
 exit:
   free(peerNvmlDevs);
   free(collOpCounts);
   free(valCounts);
   free(peerIdxConv);
-  free(ranksReSorted);
+  free(auxCommRanks);
   free(auxComms);
   return ret;
 fail:
   goto exit;
 }
 
+// Generates detailed info about the errors encountered, be they initialization ones or asynchronous ones.
 static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm,
                                      const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) {
+  // Because the number of possible error kinds is finite and small, we don't bother in this case with allocating
+  // temporary data structures, counting the errors, sorting arrays, etc. Instead, in each iteration we pick the most
+  // numerous error kind, we iterate through the ranks in search of this error, and immediately add it to the output.
   for (;;) {
     int maxCount = 0;
     ncclResult_t maxCountIdx = ncclSuccess;
@@ -1489,17 +1667,20 @@ static void rasOutAppend(const char* format, ...) {
   }
   nRasOutBuffer += needed;
-  assert(nRasOutBuffer <= rasOutBufferSize);
+  if (nRasOutBuffer >= rasOutBufferSize)
+    nRasOutBuffer = rasOutBufferSize - 1; // Should never happen, but just to be extra sure...
 exit:
   ;
 }
 
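
`rasOutAppend` above relies on the usual grow-and-retry `vsnprintf` idiom; the new clamp in the hunk is only a last-resort guard after a successful append. A self-contained sketch under that assumption (the buffer globals and `OUT_INCREMENT` are illustrative stand-ins for the file's `rasOutBuffer` machinery and `RAS_OUT_INCREMENT`):

```
#include <cstdarg>
#include <cstdio>
#include <cstdlib>

static char* outBuf = nullptr;
static int outLen = 0, outCap = 0;
enum { OUT_INCREMENT = 4096 };  // Mirrors RAS_OUT_INCREMENT.

static void outAppend(const char* fmt, ...) {
  for (int attempt = 0; attempt < 2; attempt++) {
    va_list args;
    va_start(args, fmt);
    // With a zero-sized (or null) buffer, vsnprintf just reports the needed length.
    int needed = vsnprintf(outBuf ? outBuf + outLen : nullptr, outCap - outLen, fmt, args);
    va_end(args);
    if (needed < 0) return;          // Encoding error; nothing we can do.
    if (needed < outCap - outLen) {  // It fit, terminating '\0' included.
      outLen += needed;              // The clamp added in the hunk above would go here.
      return;
    }
    // Too small: grow by a multiple of OUT_INCREMENT and retry once.
    int shortfall = needed - (outCap - outLen);
    int newCap = outCap + ((shortfall + OUT_INCREMENT) / OUT_INCREMENT) * OUT_INCREMENT;
    char* p = (char*)realloc(outBuf, newCap);
    if (p == nullptr) return;        // Out of memory; drop the append.
    outBuf = p;
    outCap = newCap;
  }
}

int main() {
  outAppend("NCCL version %d.%d\n", 2, 24);
  outAppend("clients: %s\n", "none");
  fputs(outBuf, stdout);
  free(outBuf);
  return 0;
}
```
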
// Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'. // The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes -// the terminating '\0'). +// the terminating '\0'). Resets the output buffer when done. static void rasOutExtract(char* buffer) { - if (rasOutBuffer) + if (rasOutBuffer) { memcpy(buffer, rasOutBuffer, rasOutLength()); + rasOutReset(); + } } // Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'. @@ -1524,60 +1705,25 @@ exit: // Various sorting callbacks used when grouping/formatting data. // /////////////////////////////////////////////////////////////////// -// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the -// secondary key and the process id as the tertiary key. -static int rasPeersNGpuCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - int c1 = __builtin_popcountll(p1->cudaDevs); - int c2 = __builtin_popcountll(p2->cudaDevs); +// Sorting callback for rasAuxPeerInfo elements. Sorts by value, with the peer's host IP as the secondary key and +// the process id as the tertiary key. +static int rasAuxPeersValueCompare(const void* e1, const void* e2) { + const struct rasAuxPeerInfo* p1 = (const struct rasAuxPeerInfo*)e1; + const struct rasAuxPeerInfo* p2 = (const struct rasAuxPeerInfo*)e2; - if (c1 == c2) { + if (p1->value == p2->value) { // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + int cmp = ncclSocketsHostCompare(&p1->peer->addr, &p2->peer->addr); if (cmp == 0) { // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + cmp = (p1->peer->pid < p2->peer->pid ? -1 : (p1->peer->pid > p2->peer->pid ? 1 : 0)); } return cmp; } else { - return (c1 < c2 ? -1 : 1); + return (p1->value < p2->value ? -1 : 1); } } -// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs. -// Uses the host IP as the secondary key and the process id as the tertiary key. -static int rasPeersNProcsCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - - if (p1->cudaDevs == p2->cudaDevs) { - // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; - } else { - return (p1->cudaDevs < p2->cudaDevs ? -1 : 1); - } -} - -// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather -// than the port). -static int rasPeersHostPidCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the secondary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; -} -
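Two comparator flavors appear in this patch: callbacks over aux structs that embed a peer pointer (rasAuxPeersValueCompare above) and callbacks over plain arrays of pointers (peersHashesCompare later in collectives.cc). The pointer-array flavor needs a double indirection that is easy to get wrong; here is a self-contained sketch with an illustrative Peer type standing in for the RAS structures:

```
// Sketch of a multi-key qsort comparator over an array of *pointers*:
// qsort hands the comparator pointers to the elements, which are themselves
// pointers, hence the Peer** cast. Names and fields are illustrative.
#include <cstdio>
#include <cstdlib>

struct Peer { unsigned long hostHash; long pid; };

static int peerPtrCompare(const void* e1, const void* e2) {
  const Peer* p1 = *(const Peer* const*)e1;
  const Peer* p2 = *(const Peer* const*)e2;
  if (p1->hostHash != p2->hostHash)
    return (p1->hostHash < p2->hostHash ? -1 : 1);
  // Process id is the secondary key.
  return (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0));
}

int main() {
  Peer storage[] = {{2, 10}, {1, 30}, {2, 5}};
  Peer* lookup[3];
  for (int i = 0; i < 3; i++) lookup[i] = storage + i;
  // Sorting the lookup table leaves the backing storage untouched.
  qsort(lookup, 3, sizeof(*lookup), peerPtrCompare);
  for (int i = 0; i < 3; i++)
    printf("hostHash %lu pid %ld\n", lookup[i]->hostHash, lookup[i]->pid);
  return 0;
}
```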
// Sorting callback for ncclSocketAddress. Unlike ncclSocketsCompare, it ignores the port. static int ncclSocketsHostCompare(const void* p1, const void* p2) { const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; @@ -1599,7 +1745,8 @@ static int ncclSocketsHostCompare(const void* p1, const void* p2) { cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); } else { // The only remaining valid cases are empty addresses. - assert(family == 0); + if (family != 0) + INFO(NCCL_RAS, "RAS invalid address family %d -- internal error?", family); cmp = 0; // Two empty addresses are equal... } @@ -1657,24 +1804,16 @@ static int rasAuxCommsCompareRev(const void* p1, const void* p2) { } } -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx. -static int rasCommRanksPeerCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; +// Sorting callback for rasAuxCommRank elements. Sorts by value, with the rank's commRank as the secondary key. +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2) { + const struct rasAuxCommRank* r1 = (const struct rasAuxCommRank*)p1; + const struct rasAuxCommRank* r2 = (const struct rasAuxCommRank*)p2; - return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0)); -} - -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key. -static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - if (r1->collOpCount == r2->collOpCount) { - // Use the rank as the secondary key. - return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0)); + if (r1->value == r2->value) { + // Use the commRank as the secondary key. + return (r1->rank->commRank < r2->rank->commRank ? -1 : (r1->rank->commRank > r2->rank->commRank ? 1 : 0)); } else { - return (r1->collOpCount < r2->collOpCount ? -1 : 1); + return (r1->value < r2->value ? -1 : 1); } } @@ -1705,14 +1844,20 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, return buf; } +// Formats a GPU string based on the CUDA/NVML ids provided. If the CUDA id is different from the NVML id, both are +// printed. +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) { + snprintf(buf, size, "%d", cudaDev); + if (cudaDev != nvmlDev) { + snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev); + } + return buf; +} + // Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are // printed. static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { - snprintf(buf, size, "%d", rank->cudaDev); - if (rank->cudaDev != rank->nvmlDev) { - snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev); - } - return buf; + return rasGpuToString(rank->cudaDev, rank->nvmlDev, buf, size); } // Converts an NCCL error result to a string. @@ -1753,3 +1898,21 @@ static bool rasCountIsOutlier(int count, bool verbose, int totalCount) { (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION); } }
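rasGpuToString above grows its output by re-running snprintf at the current end of the buffer. The append idiom on its own, as a runnable sketch (the device ids are arbitrary examples):

```
// Minimal sketch of the snprintf append idiom used by rasGpuToString.
#include <cstdio>
#include <cstring>

static const char* gpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) {
  snprintf(buf, size, "%d", cudaDev);
  if (cudaDev != nvmlDev) {
    // Append at the current end of the string; the space left shrinks by
    // however much has already been written.
    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev);
  }
  return buf;
}

int main() {
  char buf[32];
  printf("%s\n", gpuToString(2, 5, buf, sizeof(buf))); // prints "2 (NVML 5)"
  return 0;
}
```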
+ +// Invoked during RAS termination to release all the allocated resources. +void rasClientSupportTerminate() { + (void)close(rasClientListeningSocket); + rasClientListeningSocket = -1; + + free(rasOutBuffer); + rasOutBuffer = nullptr; + nRasOutBuffer = rasOutBufferSize = 0; + + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + rasClientTerminate(client); + client = clientNext; + } + + // rasClientsHead and rasClientsTail are taken care of by rasClientTerminate(). +} diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc index 201144f..7283360 100644 --- a/src/ras/collectives.cc +++ b/src/ras/collectives.cc @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! +#define NDEBUG // Comment out during development only! #include #include @@ -12,6 +12,7 @@ #include "checks.h" #include "comm.h" #include "nccl.h" +#include "transport.h" #include "utils.h" #include "ras_internal.h" @@ -32,14 +33,14 @@ static int nRasCollHistory, rasCollHistNextIdx; // Monotonically increased to ensure that each collective originating locally has a unique Id. static uint64_t rasCollLastId; -// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require +// Keeping track of ongoing collective operations (apart from broadcasts, which have no response so require // no such tracking). -struct rasCollective* rasCollectives; -static int nRasCollectives; +struct rasCollective* rasCollectivesHead; +struct rasCollective* rasCollectivesTail; static ncclResult_t getNewCollEntry(struct rasCollective** pColl); static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, - const struct rasCollRequest* req, size_t reqLen, int fromConnIdx); + const struct rasCollRequest* req, size_t reqLen, struct rasConnection* fromConn); static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen); static ncclResult_t rasCollReadyResp(struct rasCollective* coll); static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, @@ -47,12 +48,17 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, const union ncclSocketAddress* peers, int nPeers, const char* data, int nData, int nLegTimeouts); -static ncclResult_t rasCollConnsInit(char** pData, int* pNData); +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg); -static ncclResult_t rasCollCommsInit(char** pData, int* pNData); +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg); +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm); static int ncclCommsCompare(const void* p1, const void* p2); +static int peersHashesCompare(const void* p1, const void* p2); +static int peersHashesSearch(const void* k, const void* e); +static int rasCommIdCompare(const void* p1, const void* p2); +static int rasCollCommsMissingRankSearch(const void* k, const void* e); ///////////////////////////////////////////////////////////////////////////////////////
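Several of the callbacks declared above (peersHashesSearch, rasCollCommsMissingRankSearch) are bsearch comparators in which the key and the array element have different types. A minimal sketch of that asymmetric key-vs-element convention, with illustrative types:

```
// Sketch of a bsearch callback whose key is a bare two-element hash array
// rather than a full element. PeerInfo and the hashes are illustrative.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct PeerInfo { uint64_t hostHash, pidHash; };

// bsearch always passes (key, element) in this order.
static int hashesSearch(const void* k, const void* e) {
  const uint64_t* key = (const uint64_t*)k;
  const PeerInfo* elem = (const PeerInfo*)e;
  if (key[0] != elem->hostHash) return (key[0] < elem->hostHash ? -1 : 1);
  return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0));
}

int main() {
  PeerInfo peers[] = {{1, 10}, {1, 20}, {2, 5}}; // sorted by (hostHash, pidHash)
  uint64_t key[2] = {1, 20};
  PeerInfo* hit = (PeerInfo*)bsearch(key, peers, 3, sizeof(*peers), hashesSearch);
  printf("found: %s\n", hit ? "yes" : "no");
  return 0;
}
```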
@@ -62,22 +68,26 @@ static int ncclCommsCompare(const void* p1, const void* p2); -// Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary. +// Allocates a new rasCollective entry, appends it to the list of ongoing collectives, and returns it. static ncclResult_t getNewCollEntry(struct rasCollective** pColl) { struct rasCollective* coll; - int i; - for (i = 0; i < nRasCollectives; i++) - if (rasCollectives[i].type == RAS_MSG_NONE) - break; - if (i == nRasCollectives) { - NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT)); - nRasCollectives += RAS_INCREMENT; - } + int nRasConns; + + NCCLCHECK(ncclCalloc(&coll, 1)); - coll = rasCollectives+i; - memset(coll, '\0', sizeof(*coll)); coll->startTime = clockNano(); - coll->fromConnIdx = -1; + coll->fromConn = nullptr; // We are unlikely to use the whole array, but at least we won't need to realloc. + nRasConns = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) + nRasConns++; NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns)); + if (rasCollectivesHead) { + rasCollectivesTail->next = coll; + coll->prev = rasCollectivesTail; + rasCollectivesTail = coll; + } else { + rasCollectivesHead = rasCollectivesTail = coll; + } + *pColl = coll; return ncclSuccess; } @@ -95,21 +105,23 @@ void rasCollReqInit(struct rasCollRequest* req) { // in preparation for collective response messages. // pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible // in scenarios such as a total of two peers. -// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless +// pColl provides on return a pointer to the allocated rasCollective structure to track this collective (unless // it's a broadcast, which requires no such tracking). -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx, - int fromConnIdx) { +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone, + struct rasCollective** pColl, struct rasConnection* fromConn) { struct rasCollective* coll = nullptr; + struct rasCollRequest* reqMod = (struct rasCollRequest*)req; + size_t reqLen = 0; if (req->type >= RAS_COLL_CONNS) { // Keep track of this collective operation so that we can handle the responses appropriately. NCCLCHECK(getNewCollEntry(&coll)); - if (pCollIdx) - *pCollIdx = coll-rasCollectives; + if (pColl) + *pColl = coll; memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr)); coll->rootId = req->rootId; coll->type = req->type; coll->timeout = req->timeout; - coll->fromConnIdx = fromConnIdx; + coll->fromConn = fromConn; if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers)); coll->nPeers = 1; @@ -117,9 +129,9 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, // Collective-specific initialization of accumulated data (using local data for now). if (req->type == RAS_COLL_CONNS) - (void)rasCollConnsInit(&coll->data, &coll->nData); + (void)rasCollConnsInit(&reqMod, &reqLen, &coll->data, &coll->nData); else if (req->type == RAS_COLL_COMMS) - (void)rasCollCommsInit(&coll->data, &coll->nData); + (void)rasCollCommsInit(&reqMod, &reqLen, &coll->data, &coll->nData); } else { // req->type < RAS_COLL_CONNS // Add the info to the collective message history. nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
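getNewCollEntry above replaces the old reallocated array with a doubly-linked list anchored by head/tail pointers; rasCollFree later unlinks entries, and the purge/terminate loops save the successor before freeing. The whole lifecycle in a standalone sketch (Node and the ids are hypothetical):

```
// Standalone sketch of the head/tail doubly-linked list pattern this patch
// adopts: append at the tail, unlink anywhere, iterate safely while freeing.
#include <cstdio>
#include <cstdlib>

struct Node { int id; Node* prev; Node* next; };
static Node* head = nullptr;
static Node* tail = nullptr;

static Node* appendNode(int id) {
  Node* n = (Node*)calloc(1, sizeof(Node)); // calloc zeroes prev/next
  n->id = id;
  if (head) {
    tail->next = n;
    n->prev = tail;
    tail = n;
  } else {
    head = tail = n;
  }
  return n;
}

static void freeNode(Node* n) {
  if (n == nullptr) return;
  // Fix up head/tail first, then splice the node out and free it.
  if (n == head) head = head->next;
  if (n == tail) tail = tail->prev;
  if (n->prev) n->prev->next = n->next;
  if (n->next) n->next->prev = n->prev;
  free(n);
}

int main() {
  for (int i = 0; i < 4; i++) appendNode(i);
  // Iterate-and-free: grab the successor before the node is released.
  for (Node* n = head; n;) {
    Node* next = n->next;
    if (n->id % 2 == 0) freeNode(n);
    n = next;
  }
  for (Node* n = head; n; n = n->next) printf("node %d\n", n->id); // 1, 3
  return 0;
}
```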
@@ -131,42 +143,42 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, // Collective-specific message handling. if (req->type == RAS_BC_DEADPEER) { bool done = false; - rasMsgHandleBCDeadPeer(req, &done); + rasMsgHandleBCDeadPeer(&reqMod, &reqLen, &done); if (done) goto exit; } } // req->type < RAS_COLL_CONNS - for (int connIdx = 0; connIdx < nRasConns; connIdx++) - rasConns[connIdx].linkFlag = false; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) + conn->linkFlag = false; - (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx); - (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx); + (void)rasLinkSendCollReq(&rasNextLink, coll, reqMod, reqLen, fromConn); + (void)rasLinkSendCollReq(&rasPrevLink, coll, reqMod, reqLen, fromConn); if (coll && pAllDone) *pAllDone = (coll->nFwdSent == coll->nFwdRecv); exit: + if (reqMod != req) + free(reqMod); return ncclSuccess; } // Sends the collective message through all connections associated with this link (with the exception of the one // the message came from, if any). static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, - const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; - if (!conn->linkFlag) { - // We send collective messages through fully established and operational connections only. - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { - if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr) - coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx; - } // if (conn->sockIdx != -1 && RAS_SOCK_READY) - conn->linkFlag = true; - } // if (!conn->linkFlag) - } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) - } // for (i) + const struct rasCollRequest* req, size_t reqLen, + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { + if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) { + // We send collective messages through fully established and operational connections only. + if (linkConn->conn->sock && linkConn->conn->sock->status == RAS_SOCK_READY && + !linkConn->conn->experiencingDelays) { + if (rasConnSendCollReq(linkConn->conn, req, reqLen) == ncclSuccess && coll != nullptr) + coll->fwdConns[coll->nFwdSent++] = linkConn->conn; + } // linkConn->conn is fully established and operational. + linkConn->conn->linkFlag = true; + } // if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) + } // for (linkConn) return ncclSuccess; }
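rasLinkSendCollReq relies on the per-connection linkFlag, cleared for all connections beforehand, so that a connection reachable through both the next and the prev link still receives the request only once. A distilled sketch (Conn and the link arrays are illustrative):

```
// Sketch of the linkFlag de-duplication: the same connection may appear on
// more than one link, and the origin connection must also be skipped.
#include <cstdio>

struct Conn { int id; bool linkFlag; };

static void sendViaLink(Conn** conns, int n, Conn* fromConn) {
  for (int i = 0; i < n; i++) {
    Conn* c = conns[i];
    if (c && c != fromConn && !c->linkFlag) {
      printf("sending via conn %d\n", c->id);
      c->linkFlag = true; // visited: later links skip this connection
    }
  }
}

int main() {
  Conn a{0, false}, b{1, false}, c{2, false};
  Conn* nextLink[] = {&a, &b};
  Conn* prevLink[] = {&b, &c}; // b appears on both links
  sendViaLink(nextLink, 2, /*fromConn=*/&c);
  sendViaLink(prevLink, 2, /*fromConn=*/&c); // b already flagged, c is origin
  return 0; // only conns 0 and 1 were sent to
}
```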
@@ -190,8 +202,8 @@ static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct // in which case it can immediately send the response. ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { bool allDone = false; - int collIdx = -1; - assert(sock->connIdx != -1); + struct rasCollective* coll = nullptr; + assert(sock->conn); // First check if we've already handled this request (through another connection). for (int i = 0; i < nRasCollHistory; i++) { @@ -202,7 +214,7 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Send an empty response so that the sender can account for it. The non-empty response has already been // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); } goto exit; @@ -211,31 +223,29 @@ if (msg->collReq.type >= RAS_COLL_CONNS) { // Check if we're currently handling this collective request. - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && msg->collReq.rootId == coll->rootId) { assert(msg->collReq.type == coll->type); // Send an empty response so that the sender can account for it. The non-empty response will be // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); goto exit; } // if match - } // for (i) + } // for (coll) } // if (msg->collReq.type >= RAS_COLL_CONNS) // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. - NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + NCCLCHECK(rasNetSendCollReq(&msg->collReq, &allDone, &coll, sock->conn)); if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { - assert(collIdx != -1); + assert(coll); // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer // has more than one connection so there should always be _some_ other peer to forward the request to. - NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + NCCLCHECK(rasCollReadyResp(coll)); } exit: return ncclSuccess; @@ -245,9 +255,9 @@ exit: // Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't // any peers (unlikely), the peers sent their responses (likely), or we timed out). static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { - if (coll->fromConnIdx != -1) { + if (coll->fromConn) { // For remotely-initiated collectives, send the response back. - NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + NCCLCHECK(rasConnSendCollResp(coll->fromConn, &coll->rootAddr, coll->rootId, coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); // Add the identifying info to the collective message history. @@ -302,18 +312,15 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, // the data from the response into the accumulated data. If all the responses have been accounted for, sends the // accumulated response back.
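Before the response handler below, the request/response accounting in a nutshell: each connection a request was forwarded to occupies a slot in fwdConns; a response (or a declared timeout) clears its slot and bumps nFwdRecv, and the aggregated response fires once the counters meet. A hypothetical sketch of that accounting:

```
// Hedged sketch of the fwdConns bookkeeping used by the collective tracking;
// Conn and Coll are illustrative stand-ins for the RAS structures.
#include <cstdio>

struct Conn { int id; };
struct Coll {
  Conn* fwdConns[8]; // connections we forwarded the request to
  int nFwdSent, nFwdRecv;
};

static void accountResponse(Coll* coll, Conn* from) {
  for (int i = 0; i < coll->nFwdSent; i++) {
    if (coll->fwdConns[i] == from) {
      coll->fwdConns[i] = nullptr; // this connection is accounted for
      coll->nFwdRecv++;
      break;
    }
  }
  if (coll->nFwdSent == coll->nFwdRecv)
    printf("all responses in -- sending the aggregated response\n");
}

int main() {
  Conn a{0}, b{1};
  Coll coll{{&a, &b}, 2, 0};
  accountResponse(&coll, &a);
  accountResponse(&coll, &b); // triggers the aggregated response
  return 0;
}
```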
ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { - int collIdx; - struct rasCollective* coll = nullptr; + struct rasCollective* coll; char line[SOCKET_NAME_MAXLEN+1]; - for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { - coll = rasCollectives+collIdx; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && msg->collResp.rootId == coll->rootId) break; } - if (collIdx == nRasCollectives) { + if (coll == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, ncclSocketToString(&sock->sock.addr, rasLine)); @@ -321,11 +328,11 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { } coll->nLegTimeouts += msg->collResp.nLegTimeouts; - assert(sock->connIdx != -1); - // Account for the received response in our collective operation tracking. + assert(sock->conn); + // Account for the received response in our collective operations tracking. for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] == sock->connIdx) { - coll->fwdConns[i] = -1; + if (coll->fwdConns[i] == sock->conn) { + coll->fwdConns[i] = nullptr; break; } } @@ -353,46 +360,53 @@ exit: // Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being // terminated. -void rasCollsPurgeConn(int connIdx) { - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE) { - char line[SOCKET_NAME_MAXLEN+1]; - if (coll->fromConnIdx == connIdx) { - INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", - ncclSocketToString(&coll->rootAddr, line), coll->rootId, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - rasCollFree(coll); - } else { - for (int j = 0; j < coll->nFwdSent; j++) { - if (coll->fwdConns[j] == connIdx) { - coll->fwdConns[j] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - if (coll->nFwdSent == coll->nFwdRecv) - (void)rasCollReadyResp(coll); - break; - } - } // for (j) - } // coll->fromConnIdx != connIdx - } // !RAS_MSG_NONE - } // for (i) +void rasCollsPurgeConn(struct rasConnection* conn) { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConn == conn) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&conn->addr, rasLine)); + rasCollFree(coll); + } else { + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == conn) { + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), coll->rootId, + coll->nFwdSent, 
coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (i) + } // coll->fromConn != conn + coll = collNext; + } // for (coll) } // Frees a rasCollective entry and any memory associated with it. void rasCollFree(struct rasCollective* coll) { + if (coll == nullptr) + return; + free(coll->fwdConns); - coll->fwdConns = nullptr; free(coll->peers); - coll->peers = nullptr; free(coll->data); - coll->data = nullptr; - coll->fromConnIdx = -1; - coll->type = RAS_MSG_NONE; + + if (coll == rasCollectivesHead) + rasCollectivesHead = rasCollectivesHead->next; + if (coll == rasCollectivesTail) + rasCollectivesTail = rasCollectivesTail->prev; + if (coll->prev) + coll->prev->next = coll->next; + if (coll->next) + coll->next->prev = coll->prev; + free(coll); } // Invoked from the main RAS thread loop to handle timeouts of the collectives. @@ -407,64 +421,64 @@ void rasCollFree(struct rasCollective* coll) { // and send back whatever we have. Unfortunately, the peer that the RAS client is connected to will in all likelihood // time out first, so at that point any delayed responses that eventually arrive are likely to be too late... void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { - struct rasCollective* coll = rasCollectives+collIdx; - if (coll->type == RAS_MSG_NONE || coll->timeout == 0) - continue; - - if (now - coll->startTime > coll->timeout) { - // We've exceeded the leg timeout. For all outstanding responses, check their connections. - if (!coll->timeoutWarned) { - INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", - ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, - (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->timeoutWarned = true; - } - for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] != -1) { - struct rasConnection* conn = rasConns+coll->fwdConns[i]; - char line[SOCKET_NAME_MAXLEN+1]; - if (!conn->experiencingDelays && conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - // Ensure that the connection is fully established and operational, and that the socket hasn't been - // re-created during the handling of the collective (which would suggest that the request may have been - // lost). - if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) - continue; - } - // In all other cases we declare a timeout so that we can (hopefully) recover. - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - coll->fwdConns[i] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - } // if (coll->fwdConns[i] != -1) - } // for (i) - if (coll->nFwdSent == coll->nFwdRecv) { - (void)rasCollReadyResp(coll); - } else { - // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they - // must be due to delays at other processes. Presumably those processes will give up waiting soon and the - // (incomplete) responses will arrive shortly, so we should wait a little longer. - if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { - // We've exceeded even the longer timeout, which is unexpected. 
Try to return whatever we have (though - // the originator of the collective, if it's not us, may have timed out already anyway). - INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + if (coll->timeout > 0) { + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; - coll->nFwdRecv = coll->nFwdSent; + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i]) { + struct rasConnection* conn = coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sock) { + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (conn->sock->status == RAS_SOCK_READY && conn->sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i]) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { (void)rasCollReadyResp(coll); } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); - } - } // conn->nFwdRecv < conn->nFwdSent - } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); - } - } // for (collIdx) + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). 
+ INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; + coll->nFwdRecv = coll->nFwdSent; + (void)rasCollReadyResp(coll); + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); + } + } // coll->nFwdRecv < coll->nFwdSent + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); + } + } // if (coll->timeout > 0) + + coll = collNext; + } // for (coll) } @@ -476,15 +490,16 @@ void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well // as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen, // but the system clocks may not be perfectly in sync). -static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN}; struct rasCollConns* pConnsData; + *pReqLen = rasCollDataLength(RAS_COLL_CONNS); + // Update the statistical data first and in the process also calculate how much connection-specific space we // will need. - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeCount > 0) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeCount > 0) { if (connsData.travelTimeMin > conn->travelTimeMin) connsData.travelTimeMin = conn->travelTimeMin; if (connsData.travelTimeMax < conn->travelTimeMax) @@ -502,9 +517,9 @@ static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { pConnsData = (struct rasCollConns*)*pData; memcpy(pConnsData, &connsData, sizeof(*pConnsData)); if (connsData.nNegativeMins > 0) { - for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeMin < 0) { + int negMinsIdx = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeMin < 0) { struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx; memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source)); memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest)); @@ -560,10 +575,26 @@ static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* // Initializes the accumulated data with just the local data for now. // For this particular collective, we keep for every communicator information about every rank, to help identify // the missing ones and the discrepancies between the ones that did respond. -static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { +// For any new (previously unseen) communicator we also save the basic identification data about every rank that is +// "missing" (i.e., not part of this process). During merging, this should be replaced by the actual data from +// those ranks, if they are responsive. We want to provide this information to the user (so that we can say more +// than "rank xyz missing").
+// Every "new" communicator is also recorded in the (updated) request, so that when that request is forwarded to our +// peers, those peers don't needlessly send us the same data. +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { + ncclResult_t ret = ncclSuccess; struct rasCollComms* commsData; - int nComms = 0, nRanks = 0; + int nComms = 0, nRanks = 0, nMissingRanks = 0; + bool skipMissing = false; std::lock_guard lock(ncclCommsMutex); + struct rasCollComms::comm* comm; + struct rasCollRequest* req = nullptr; + struct rasPeerInfo** peersReSorted = nullptr; + int firstNewSkipMissingIdx = -1; + + *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + (*pReq)->comms.nSkipMissingRanksComms * sizeof(*(*pReq)->comms.skipMissingRanksComms); + *pData = nullptr; // Start by counting the communicators so that we know how much space to allocate. // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case @@ -572,77 +603,152 @@ static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare); ncclCommsSorted = true; } - for (int i = 0; i < nNcclComms; i++) { - if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting. + for (int commIdx = 0; commIdx < nNcclComms; commIdx++) { + if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting. break; - if (i == 0) { - nComms = 1; - } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - nComms++; - } - nRanks++; - } + // A process may manage multiple GPUs and thus have multiple communicators with the same commHash. + // Comparing just the commHash is OK though within communicators that are part of the same process. + if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) { + skipMissing = rasCollCommsSkipMissing(*pReq, ncclComms[commIdx]); + if (!skipMissing) { + // Add this communicator to the request so that the processes we forward the request to know not to fill in + // the missing rank info. + struct rasCommId* skipComm; + if (req == nullptr) { + // We pessimistically allocate space for all the remaining communicators so that we don't need to reallocate. + int newSize = *pReqLen + (nNcclComms-commIdx) * sizeof(*req->comms.skipMissingRanksComms); + NCCLCHECKGOTO(ncclCalloc((char**)&req, newSize), ret, fail); + memcpy(req, *pReq, *pReqLen); + *pReq = req; + firstNewSkipMissingIdx = req->comms.nSkipMissingRanksComms; + } + skipComm = req->comms.skipMissingRanksComms + req->comms.nSkipMissingRanksComms++; + skipComm->commHash = ncclComms[commIdx]->commHash; + skipComm->hostHash = ncclComms[commIdx]->peerInfo->hostHash; + skipComm->pidHash = ncclComms[commIdx]->peerInfo->pidHash; - // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent + nMissingRanks += ncclComms[commIdx]->nRanks; + } // if (!skipMissing) + nComms++; + } // if encountered a new communicator + nRanks++; + if (!skipMissing) + nMissingRanks--; + } // for (commIdx) + + // rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent // pointer manipulations somewhat unwieldy... 
- *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); - NCCLCHECK(ncclCalloc(pData, *pNData)); + // This is extra complicated because of the "hidden" array of struct rasCollCommsMissingRank following the + // ranks array for each communicator. + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks) + + nMissingRanks * sizeof(struct rasCollCommsMissingRank); + NCCLCHECKGOTO(ncclCalloc(pData, *pNData), ret, fail); commsData = (struct rasCollComms*)*pData; commsData->nComms = nComms; // comm points at the space in the accumulated data where the info about the current communicator is to be stored. - struct rasCollComms::comm* comm = commsData->comms; - for (int i = 0; i < nNcclComms; i++) { - struct rasCollComms::comm::rank* rank; - ncclResult_t asyncError; - if (ncclComms[i] == nullptr) - break; - if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - if (i > 0) - comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); - comm->commHash = ncclComms[i]->commHash; - comm->commNRanks = ncclComms[i]->nRanks; - comm->nRanks = 0; - } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { - INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - ncclComms[i]->rank, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - if (comm->nRanks == comm->commNRanks) { - INFO(NCCL_RAS, - "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", - comm->commNRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - rank = comm->ranks+comm->nRanks; - rank->commRank = ncclComms[i]->rank; - // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially - // always 0. It will increase after we send this response back to the peer we got the request from. - rank->peerIdx = 0; - rank->collOpCount = ncclComms[i]->collOpCount; - rank->status.initState = ncclComms[i]->initState; - if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; - rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); - rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); - rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); - rank->cudaDev = ncclComms[i]->cudaDev; - rank->nvmlDev = ncclComms[i]->nvmlDev; - comm->nRanks++; - } - assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + comm = commsData->comms; + // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. + for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { + struct ncclComm* ncclComm = ncclComms[commIdx]; - return ncclSuccess; + comm->commId.commHash = ncclComm->commHash; + comm->commId.hostHash = ncclComm->peerInfo->hostHash; + comm->commId.pidHash = ncclComm->peerInfo->pidHash; + comm->commNRanks = ncclComm->nRanks; + comm->nRanks = comm->nMissingRanks = 0; + + // Fill in the comm->ranks array. 
+ for (; commIdx < nNcclComms && ncclComms[commIdx] && ncclComms[commIdx]->commHash == comm->commId.commHash; + commIdx++) { + ncclComm = ncclComms[commIdx]; + struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; + ncclResult_t asyncError; + rank->commRank = ncclComm->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); + rank->status.initState = ncclComm->initState; + if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComm->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComm->cudaDev; + rank->nvmlDev = ncclComm->nvmlDev; + comm->nRanks++; + } // for (commIdx) + + if (firstNewSkipMissingIdx != -1 && + memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { + // Fill in the missingRanks array that follows the comm->ranks. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + if (peersReSorted == nullptr) { + // Create a lookup table to rasPeers that is sorted by hostHash and pidHash, to reduce the complexity of the + // lookups in the missingRankIdx loop below. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) + peersReSorted[peerIdx] = rasPeers+peerIdx; + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesCompare); + } + + comm->nMissingRanks = comm->commNRanks - comm->nRanks; + for (int missingRankIdx = 0, rankIdx = 0; missingRankIdx < comm->nMissingRanks; missingRankIdx++) { + struct rasCollCommsMissingRank* missingRank; + struct ncclPeerInfo* info; + struct rasPeerInfo** peer; + uint64_t key[2]; + // Look for the next "hole" in the ranks array. + while (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == rankIdx+missingRankIdx) + rankIdx++; + + missingRank = missingRanks + missingRankIdx; + missingRank->commRank = rankIdx + missingRankIdx; + info = ncclComm->peerInfo + missingRank->commRank; + key[0] = info->hostHash - ncclComm->commHash; + key[1] = info->pidHash - ncclComm->commHash; + peer = (struct rasPeerInfo**)bsearch(key, peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesSearch); + if (peer) + memcpy(&missingRank->addr, &(*peer)->addr, sizeof(missingRank->addr)); + missingRank->cudaDev = info->cudaDev; + missingRank->nvmlDev = info->nvmlDev; + } // for (missingRankIdx) + + if (++firstNewSkipMissingIdx == req->comms.nSkipMissingRanksComms) + firstNewSkipMissingIdx = -1; + } // if need to fill in the missingRanks + + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks) + + comm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + } // for (collCommIdx) + assert(((char*)comm) - (char*)commsData <= *pNData); + + if (req) { + // Finish updating the request. 
+ *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + req->comms.nSkipMissingRanksComms * sizeof(*req->comms.skipMissingRanksComms); + qsort(req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare); + } +ret: + free(peersReSorted); + return ret; +fail: + if (req) { + free(req); + *pReq = nullptr; + } + free(*pData); + *pData = nullptr; + goto ret; } // Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { - struct rasCollComms* collData; - struct rasCollComms* msgData; + struct rasCollComms* collData; // Data previously stored (locally) by our process. + struct rasCollComms* msgData; // Data just received from another process. int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); ALIGN_SIZE(dataOffset, alignof(int64_t)); @@ -650,7 +756,7 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* collData = (struct rasCollComms*)coll->data; if (msgData->nComms > 0) { - struct rasCollComms* newData = nullptr; + struct rasCollComms* newData = nullptr; // Destination buffer for the merged data. // Allocate the new buffer pessimistically (sized as the sum of the two old ones). NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); @@ -661,25 +767,28 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { int cmp; if (collIdx < collData->nComms && msgIdx < msgData->nComms) - cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + cmp = rasCommIdCompare(&collComm->commId, &msgComm->commId); else cmp = (collIdx < collData->nComms ? -1 : 1); if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + "possible hash collision (0x%lx, 0x%lx, 0x%lx)", collComm->commNRanks, msgComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); - // We try to preserve both separately, although the input data might already be messed up anyway... + // We try to preserve them both separately... } if (cmp == 0) { // Merge the comms. - newComm->commHash = collComm->commHash; + memcpy(&newComm->commId, &collComm->commId, sizeof(newComm->commId)); newComm->commNRanks = collComm->commNRanks; if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { INFO(NCCL_RAS, - "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", - collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->nRanks + msgComm->nRanks, newComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); + newComm->nRanks = newComm->commNRanks; // We'll skip the extras in the loop below. 
} else { newComm->nRanks = collComm->nRanks + msgComm->nRanks; @@ -691,16 +800,18 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* int cmpRank; if (newRankIdx == newComm->commNRanks) break; // Short of failing, the best we can do is skip... - if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) { cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); - else + } else { cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + } // There shouldn't be any overlaps in ranks between different sources. if (cmpRank == 0) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - collComm->ranks[collRankIdx].commRank, newComm->commHash); + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->ranks[collRankIdx].commRank, + newComm->commId.commHash, newComm->commId.hostHash, newComm->commId.pidHash); msgRankIdx++; // Short of failing, the best we can do is skip... } memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : @@ -708,23 +819,63 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* if (cmpRank > 0) { // peerIdx values from msgComm need to shift after merge. newComm->ranks[newRankIdx].peerIdx += coll->nPeers; - } + + if (collComm->nMissingRanks > 0) { + // Remove the corresponding entry from missingRanks. + struct rasCollCommsMissingRank* missingRank; + missingRank = (struct rasCollCommsMissingRank*)bsearch(&newComm->ranks[newRankIdx].commRank, + collComm->ranks+collComm->nRanks, + collComm->nMissingRanks, + sizeof(struct rasCollCommsMissingRank), + rasCollCommsMissingRankSearch); + if (missingRank) { + // Mark the entry as no longer needed. + memset(&missingRank->addr, '\0', sizeof(missingRank->addr)); + } else { + INFO(NCCL_RAS, "RAS failed to find missingRank data -- internal error?"); + } + } // if (collComm->nMissingRanks > 0) + } // if (cmpRank > 0) } // for (newRankIdx) - newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); - collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + if (collComm->nMissingRanks > 0) { + // Copy the missingRanks to newComm, skipping over any no longer needed entries. 
+ union ncclSocketAddress emptyAddr; + struct rasCollCommsMissingRank* collMissingRanks; + struct rasCollCommsMissingRank* newMissingRanks; + int newRankIdx; + + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + collMissingRanks = (struct rasCollCommsMissingRank*)(collComm->ranks+collComm->nRanks); + newMissingRanks = (struct rasCollCommsMissingRank*)(newComm->ranks+newComm->nRanks); + newRankIdx = 0; + for (int collRankIdx = 0; collRankIdx < collComm->nMissingRanks; collRankIdx++) { + if (memcmp(&collMissingRanks[collRankIdx].addr, &emptyAddr, sizeof(emptyAddr))) { + memcpy(newMissingRanks + newRankIdx++, collMissingRanks + collRankIdx, sizeof(*newMissingRanks)); + } + } + newComm->nMissingRanks = newRankIdx; + assert(newComm->nRanks + newComm->nMissingRanks == newComm->commNRanks); + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks) + + newComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); collIdx++; - msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); msgIdx++; } else if (cmp < 0) { // Copy from collComm. - int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, collComm, commSize); newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); collIdx++; } else { // cmp > 0 // Copy from msgComm. - int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, msgComm, commSize); for (int i = 0; i < newComm->nRanks; i++) { // peerIdx values from msgComm need to shift after merge. @@ -745,18 +896,87 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* return ncclSuccess; } +// Checks if a given communicator is in the skipMissingRanksComms array of the request. +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm) { + struct rasCommId id; + id.commHash = comm->commHash; + id.hostHash = comm->peerInfo->hostHash; + id.pidHash = comm->peerInfo->pidHash; + return (bsearch(&id, req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare) != nullptr); +} + // Sorting callback for the ncclComms array. static int ncclCommsCompare(const void* p1, const void* p2) { - const ncclComm** pc1 = (const ncclComm**)p1; - const ncclComm** pc2 = (const ncclComm**)p2; + const ncclComm* comm1 = *(const ncclComm**)p1; + const ncclComm* comm2 = *(const ncclComm**)p2; // Put nullptr's at the end. - if (*pc1 == nullptr || *pc2 == nullptr) - return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + if (comm1 == nullptr || comm2 == nullptr) + return (comm1 != nullptr ? -1 : (comm2 != nullptr ? 
1 : 0)); - if ((*pc1)->commHash == (*pc2)->commHash) { - return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0)); + if (comm1->commHash == comm2->commHash) { + return (comm1->rank < comm2->rank ? -1 : (comm1->rank > comm2->rank ? 1 : 0)); } else { - return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1); + return (comm1->commHash < comm2->commHash ? -1 : 1); } } + +// Sorting callback for a lookup table to rasPeers. Sorts by the hostHash (primary) and pidHash (secondary). +static int peersHashesCompare(const void* p1, const void* p2) { + const struct rasPeerInfo* pi1 = *(const struct rasPeerInfo**)p1; + const struct rasPeerInfo* pi2 = *(const struct rasPeerInfo**)p2; + + if (pi1->hostHash == pi2->hostHash) { + return (pi1->pidHash < pi2->pidHash ? -1 : (pi1->pidHash > pi2->pidHash ? 1 : 0)); + } else { + return (pi1->hostHash < pi2->hostHash ? -1 : 1); + } +} + +// Search callback for a lookup table to rasPeers. Searches by the hostHash and pidHash. The key is an array +// containing the hostHash at index 0 and the pidHash at index 1. +static int peersHashesSearch(const void* k, const void* e) { + const uint64_t* key = (const uint64_t*)k; + const struct rasPeerInfo* elem = *(const struct rasPeerInfo**)e; + + if (key[0] == elem->hostHash) { + return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0)); + } else { + return (key[0] < elem->hostHash ? -1 : 1); + } +} + +// Sorting/searching callback for struct rasCommId. Sorts by commHash, then hostHash, then pidHash. +static int rasCommIdCompare(const void* p1, const void* p2) { + const struct rasCommId* i1 = (const struct rasCommId*)p1; + const struct rasCommId* i2 = (const struct rasCommId*)p2; + if (i1->commHash == i2->commHash) { + if (i1->hostHash == i2->hostHash) { + return (i1->pidHash < i2->pidHash ? -1 : (i1->pidHash > i2->pidHash ? 1 : 0)); + } else { + return (i1->hostHash < i2->hostHash ? -1 : 1); + } + } else { + return (i1->commHash < i2->commHash ? -1 : 1); + } +} + +// Search callback for rasCollComms::comm rasCollCommsMissingRank array. The key is the commRank. +static int rasCollCommsMissingRankSearch(const void* k, const void* e) { + int key = *(const int*)k; + const struct rasCollCommsMissingRank* elem = (const struct rasCollCommsMissingRank*)e; + + return (key < elem->commRank ? -1 : (key > elem->commRank ? 1 : 0)); +} + +// Invoked during RAS termination to release all the allocated resources. +void rasCollectivesTerminate() { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + rasCollFree(coll); + coll = collNext; + } + + // rasCollectivesHead and rasCollectivesTail are taken care of by rasCollFree(). 
+} diff --git a/src/ras/peers.cc b/src/ras/peers.cc index f2692d3..8573209 100644 --- a/src/ras/peers.cc +++ b/src/ras/peers.cc @@ -40,10 +40,11 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1); static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1); + struct rasRankInit* ranks = nullptr, int nranks = 0, + struct rasConnection* fromConn = nullptr); static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx); + struct rasConnection* fromConn); static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks); ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock); @@ -146,6 +147,8 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks rankPeer->pid = rank->pid; rankPeer->cudaDevs = (1UL << rank->cudaDev); rankPeer->nvmlDevs = (1UL << rank->nvmlDev); + rankPeer->hostHash = rank->hostHash; + rankPeer->pidHash = rank->pidHash; rankPeerIdx++; // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how @@ -357,12 +360,12 @@ int rasPeerFind(const union ncclSocketAddress* addr) { // ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members // of the new communicator being established), and who thus don't need to be notified. updateDeadPeers can // be used, however, to request at least the propagation of rasDeadPeers to such peers. -// fromConnIdx -- if provided -- identified the connection used to receive this update; there's no need to +// fromConn -- if provided -- identifies the connection used to receive this update; there's no need to // propagate the update back through it. // Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new // connections as needed. static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks, int nranks, int fromConnIdx) { + struct rasRankInit* ranks, int nranks, struct rasConnection* fromConn) { ncclResult_t ret = ncclSuccess; // Do we actually have anything to do? @@ -371,8 +374,8 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN // Start by propagating the update through the RAS network links. We consider any errors during this process // to be non-fatal (we can re-sync later around a keep-alive exchange). - (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); - (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); + (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); // Calculate new link peers and open new connections if needed.
NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); @@ -388,15 +391,13 @@ fail: // for the explanation of the function arguments. static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { // Note that we don't send the update via the connection that we received this notification from in the first // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; + if (linkConn->conn && linkConn->conn != fromConn) { // Failed propagations are not considered fatal (we will retry after a keep-alive). - (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + (void)rasConnPropagateUpdate(linkConn->conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); } } @@ -407,7 +408,7 @@ static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct ra // arguments. static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { // If we have the rank info, check if the peer on the other side of this connection has participated in the new // communicator. int connRank = -1; @@ -462,7 +463,8 @@ ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct ras msg->peersUpdate.deadPeersHash = rasDeadPeersHash; msg->peersUpdate.nDeadPeers = nDeadPeers; memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); - memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + if (nDeadPeers > 0) + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); if (nPeers > 0) conn->lastSentPeersHash = rasPeersHash; @@ -485,8 +487,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) ncclResult_t ret = ncclSuccess; struct rasMsg* newMsg = nullptr; int newMsgLen = 0; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; + assert(sock->conn); int nPeers, nDeadPeers; int deadPeersOffset = 0; bool updatePeers, updateDeadPeers; @@ -496,8 +497,8 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); - conn->lastRecvPeersHash = msg->peersUpdate.peersHash; - conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + sock->conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + sock->conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need // to send it. We'll find out by comparing the hash values after the merge. 
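In `rasConnSendPeersUpdate` above, a single allocation carries two variable-length arrays: the `peers` array (a trailing zero-length member) and the `deadPeers` array placed at an aligned offset behind it, which is why the `memcpy` is now guarded by `nDeadPeers > 0`. The sizing arithmetic can be sketched as follows (stand-in types; this `ALIGN_SIZE` is reproduced only for illustration):

```
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Illustrative stand-ins, not the real NCCL types.
struct peerInfo { char name[13]; };     // Odd size, so alignment matters.
struct deadPeer { uint64_t addr[4]; };  // Requires 8-byte alignment.

// Round size up to a multiple of align (mirrors NCCL's ALIGN_SIZE idea).
#define ALIGN_SIZE(size, align) ((size) = (((size) + (align) - 1) / (align)) * (align))

struct peersUpdateMsg {
  int nPeers, nDeadPeers;
  struct peerInfo peers[0];             // Variable length; deadPeers follows.
};

int main(void) {
  int nPeers = 3, nDeadPeers = 2;

  // Header plus the trailing peers array...
  size_t msgLen = sizeof(struct peersUpdateMsg) + nPeers * sizeof(struct peerInfo);
  // ...then round up so deadPeers starts at a properly aligned offset.
  ALIGN_SIZE(msgLen, _Alignof(struct deadPeer));
  size_t deadPeersOffset = msgLen;
  msgLen += nDeadPeers * sizeof(struct deadPeer);

  struct peersUpdateMsg* msg = calloc(1, msgLen);
  msg->nPeers = nPeers;
  msg->nDeadPeers = nDeadPeers;
  if (nDeadPeers > 0)                   // Same guard as in the patch.
    memset((char*)msg + deadPeersOffset, 0xff, nDeadPeers * sizeof(struct deadPeer));

  printf("header %zu, deadPeersOffset %zu, total %zu\n",
         sizeof(struct peersUpdateMsg), deadPeersOffset, msgLen);
  free(msg);
  return 0;
}
```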
@@ -545,15 +546,15 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) rasDeadPeersDump(); // If post-merge the hashes are still different, send our (dead) peers back. - updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); - updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && - conn->lastRecvDeadPeersHash != rasDeadPeersHash); + updatePeers = (sock->conn->lastSentPeersHash != rasPeersHash && sock->conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (sock->conn->lastSentDeadPeersHash != rasDeadPeersHash && + sock->conn->lastRecvDeadPeersHash != rasDeadPeersHash); if (updatePeers || updateDeadPeers) { newMsg->peersUpdate.peersHash = rasPeersHash; newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; if (updatePeers) { assert(nPeers > 0); - conn->lastSentPeersHash = rasPeersHash; + sock->conn->lastSentPeersHash = rasPeersHash; } else { // If hashes match, make sure that we don't send the rasPeers back. newMsg->peersUpdate.nPeers = 0; @@ -564,14 +565,14 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) if (updateDeadPeers) { assert(nRasDeadPeers > 0); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); deadPeersOffset = newMsgLen; newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; } else { newMsg->peersUpdate.nDeadPeers = 0; @@ -580,13 +581,13 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); - rasConnEnqueueMsg(conn, newMsg, newMsgLen); + rasConnEnqueueMsg(sock->conn, newMsg, newMsgLen); newMsg = nullptr; } // if (updatePeers || updateDeadPeers) // Propagate the changes through our RAS network links. NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, - sock->connIdx), ret, fail); + sock->conn), ret, fail); } exit: @@ -603,7 +604,7 @@ fail: // Reinitializes the connection(s) of a particular link, following a peers update. // Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. -// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in rasLinkConn // structures, so it's better to drop it all and recalculate from scratch. // We recalculate the primary peer; if an active connection to it already exists, then we're done. If there // is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and @@ -611,77 +612,51 @@ fail: // External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). 
static ncclResult_t rasLinkReinitConns(struct rasLink* link) { struct rasLinkConn* linkConn; - struct rasConnection* conn = nullptr; int newPeerIdx = myPeerIdx; - if (link->connsSize == 0) { - link->connsSize = RAS_INCREMENT; - NCCLCHECK(ncclCalloc(&link->conns, link->connsSize)); + if (link->conns) { + // Free the old contents but keep the first entry for convenience (though wipe it). + for (struct rasLinkConn* linkConn = link->conns->next; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + memset(link->conns, '\0', sizeof(*link->conns)); + link->lastUpdatePeersTime = 0; + } else { // link->conns == nullptr + NCCLCHECK(ncclCalloc(&link->conns, 1)); } - link->nConns = 0; - // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays. - while (newPeerIdx != -1) { - if (link->nConns == link->connsSize) { - NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); - link->connsSize += RAS_INCREMENT; - } + // Fill in the entry for the primary connection. + linkConn = link->conns; + linkConn->peerIdx = newPeerIdx = rasLinkCalculatePeer(link, myPeerIdx, /*isFallback*/false); + linkConn->conn = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : nullptr); + linkConn->external = false; - newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1); - if (newPeerIdx == -1) { - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); - if (link->nConns > 0) - break; - } - linkConn = link->conns+link->nConns; - linkConn->peerIdx = newPeerIdx; - linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1); - linkConn->external = false; - - // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration. - // Depending on the circumstances, we may first need to create that connection. - if (linkConn->connIdx == - 1) { - if (link->nConns == 0) { - if (linkConn->peerIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", - link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), - ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) - // to avoid races and the creation of duplicate connections. - if (myPeerIdx < linkConn->peerIdx) { - NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); - } - else { // If we didn't initiate the connection, start the timeout. 
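The rule in the branch just above deserves a note: both endpoints compute the same link topology, so letting only the side with the lower peer index dial (while the other side merely arms a timeout) avoids the race where both peers connect simultaneously and end up with duplicate connections. Reduced to its essence (hypothetical helper name):

```
#include <stdbool.h>
#include <stdio.h>

// Both sides evaluate the same predicate, so exactly one of them dials
// and the other waits, with a timeout as a safety net.
static bool shouldInitiate(int myPeerIdx, int remotePeerIdx) {
  return myPeerIdx < remotePeerIdx;
}

int main(void) {
  int a = 3, b = 7;
  printf("peer %d dials: %d\n", a, shouldInitiate(a, b));  // 1: lower index dials.
  printf("peer %d dials: %d\n", b, shouldInitiate(b, a));  // 0: higher index waits.
  return 0;
}
```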
- link->lastUpdatePeersTime = clockNano(); - } - } // if (linkConn->peerIdx != -1) - } else { // link->nConns > 0 - INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx)); - } // link->nConns > 0 - } else { // linkConn->connIdx != -1 - if (link->nConns == 0) { - INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", - link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + if (linkConn->conn == nullptr) { + if (linkConn->peerIdx != -1) { + // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) + // to avoid races and the creation of duplicate connections. + INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", + link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + if (myPeerIdx < linkConn->peerIdx) { + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn)); } - } - link->nConns++; - if (linkConn->connIdx == -1) - break; - conn = rasConns+linkConn->connIdx; - - // We check if the connection already went through the fallback calculation; if so, we'll need to create a new - // fallback in the next iteration, to ensure that RAS will keep retrying. - if (!conn->experiencingDelays) - break; + else { // If we didn't initiate the connection, start the timeout. + link->lastUpdatePeersTime = clockNano(); + } + } // if (linkConn->peerIdx != -1) + } else { // linkConn->conn + INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", + link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } // linkConn->conn + if (linkConn->conn && linkConn->conn->experiencingDelays) { INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9, - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + linkConn->conn->experiencingDelays, (clockNano()-linkConn->conn->startRetryTime)/1e9, + (linkConn->conn->sock ? linkConn->conn->sock->status : - 1)); + NCCLCHECK(rasLinkAddFallback(link, linkConn->conn)); } return ncclSuccess; @@ -701,39 +676,37 @@ int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallbac if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) { // peerIdx is a fallback and it is not running on the same node as us. int tryPeerIdx = newPeerIdx; - int tryConnIdx = -1; + struct rasConnection* tryConn = nullptr; // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a // little suboptimal one. 
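The loop below steps around the peer ring with modulo arithmetic, and both the old and the new stepping expressions add `nRasPeers` before applying `%`. That is deliberate: C's division truncates toward zero, so a negative left operand (possible when `direction` is -1) would yield a negative remainder. A quick illustration:

```
#include <stdio.h>

// Step around a ring of n peers in the given direction (+1 or -1).
// Adding n first keeps the left operand non-negative, since C's %
// returns a negative remainder for a negative left operand.
static int ringStep(int idx, int direction, int n) {
  return (idx + direction + n) % n;
}

int main(void) {
  int n = 5;
  printf("%d\n", ringStep(0, -1, n));  // 4, not -1.
  printf("%d\n", ringStep(4, +1, n));  // 0: wraps forward.
  printf("%d\n", (0 - 1) % n);         // -1 under C99's truncating division.
  return 0;
}
```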
while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) { if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) { - tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr); - if (tryConnIdx != -1) { - struct rasConnection* tryConn = rasConns+tryConnIdx; + tryConn = rasConnFind(&rasPeers[tryPeerIdx].addr); + if (tryConn) { // Check if the connection is fully established and operational, i.e., if the underlying socket // is ready and there's been recent communication on it. - if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY && - !tryConn->experiencingDelays) { + if (tryConn->sock && tryConn->sock->status == RAS_SOCK_READY && !tryConn->experiencingDelays) { // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in // this case. This is the only case when tryConnIdx != -1 after this loop. break; } - } // if (tryConnIdx != -1) + } // if (tryConn) } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) - tryConnIdx = -1; - tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers; + tryConn = nullptr; + tryPeerIdx = (tryPeerIdx + link->direction + nRasPeers) % nRasPeers; if (tryPeerIdx == myPeerIdx) break; } - if (tryConnIdx == -1) + if (tryConn == nullptr) newPeerIdx = tryPeerIdx; if (tryPeerIdx == myPeerIdx) break; } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) - + if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) { newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers; } @@ -932,7 +905,8 @@ bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSock static void rasPeersDump() { for (int p = 0; p < nRasPeers; p++) { const struct rasPeerInfo* peer = rasPeers+p; - INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : "")); + INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), + (p == myPeerIdx ? " [this process]" : "")); } if (nRasPeers > 0) INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash); @@ -958,3 +932,17 @@ static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nr rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2))); return result; } + +// Invoked during RAS termination to release all the allocated resources. +void rasPeersTerminate() { + free(rasPeers); + rasPeers = nullptr; + nRasPeers = 0; + rasPeersHash = 0; + myPeerIdx = -1; + + free(rasDeadPeers); + rasDeadPeers = nullptr; + nRasDeadPeers = rasDeadPeersSize = 0; + rasDeadPeersHash = 0; +} diff --git a/src/ras/ras.cc b/src/ras/ras.cc index 4905d7a..8ef551c 100644 --- a/src/ras/ras.cc +++ b/src/ras/ras.cc @@ -4,8 +4,10 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out during development only! -#include <cassert> +// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. +#include <bits/c++config.h> +#undef _GLIBCXX_VISIBILITY +#define _GLIBCXX_VISIBILITY(V) #include #include #include @@ -65,8 +67,8 @@ int nNcclComms = 0; bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank.
static ncclResult_t rasLocalNotify(const struct rasNotification* msg); -static ncclResult_t rasLocalHandle(); -static void rasLocalHandleTerminate(); +static ncclResult_t rasLocalHandle(bool* terminate); +static void rasThreadCleanup(); static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); @@ -74,6 +76,8 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); +static void rasTerminate() __attribute__((destructor)); + NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); ////////////////////////////////////////////////// @@ -105,7 +109,6 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); ncclSetThreadName(rasThread, "NCCL RAS"); - (void)pthread_detach(rasThread); rasInitialized = true; } @@ -157,18 +160,27 @@ ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { } } } - if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { - struct rasNotification msg; - msg.type = RAS_TERMINATE; - NCCLCHECK(rasLocalNotify(&msg)); - } + ncclAtomicRefCountDecrement(&rasInitRefCount); return ncclSuccess; } +// Global destructor. Notifies the RAS thread to release all the resources +// and terminate. Waits for the thread to terminate. +static void rasTerminate() { + struct rasNotification msg; + if (!rasInitialized) + return; + memset(&msg, '\0', sizeof(msg)); + msg.type = RAS_TERMINATE; + if (rasLocalNotify(&msg) == ncclSuccess) + (void)pthread_join(rasThread, nullptr); +} + // Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within // the communicator. ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { struct rasNotification msg; + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_ADD_RANKS; msg.addRanks.ranks = ranks; msg.addRanks.nranks = nranks; @@ -199,7 +211,7 @@ static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { ///////////////////////////////////////////////////////////////////////////////// // Handles asynchronous local notifications arriving from regular NCCL threads. -static ncclResult_t rasLocalHandle() { +static ncclResult_t rasLocalHandle(bool* terminate) { struct rasNotification msg; size_t done = 0; @@ -212,9 +224,11 @@ static ncclResult_t rasLocalHandle() { } if (msg.type == RAS_ADD_RANKS) { - NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + (void)rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks); + // Not great if the above fails, but it shouldn't be critical; better to keep going. } else if (msg.type == RAS_TERMINATE) { - rasLocalHandleTerminate(); + INFO(NCCL_RAS, "RAS handling local termination request"); + *terminate = true; } else { WARN("RAS received unknown notification type %d", msg.type); return ncclInternalError; @@ -223,10 +237,35 @@ static ncclResult_t rasLocalHandle() { return ncclSuccess; } -// Handles local RAS_TERMINATE notification. -static void rasLocalHandleTerminate() { - INFO(NCCL_RAS, "RAS handling local termination request"); - // For now we don't do anything. +// Cleans up local RAS state, normally in response to a RAS_TERMINATE notification. 
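The termination path above relies on a GCC/Clang destructor: `rasTerminate` runs at process exit or library unload, pokes the RAS thread through the notification pipe, and then joins it, hence the removed `pthread_detach`. The pattern in miniature (hypothetical names; build with `-pthread`):

```
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int notifyPipe[2];
static pthread_t worker;
static int initialized = 0;

static void* workerMain(void* arg) {
  char cmd;
  // Block until the destructor sends the terminate command.
  while (read(notifyPipe[0], &cmd, 1) == 1) {
    if (cmd == 'T') break;  // Equivalent of RAS_TERMINATE.
  }
  printf("worker: cleaning up and exiting\n");
  return NULL;
}

// Runs automatically at process exit / library unload.
__attribute__((destructor)) static void terminate(void) {
  if (!initialized) return;
  char cmd = 'T';
  if (write(notifyPipe[1], &cmd, 1) == 1)
    (void)pthread_join(worker, NULL);  // Wait for the worker's cleanup.
}

int main(void) {
  if (pipe(notifyPipe) != 0) return 1;
  if (pthread_create(&worker, NULL, workerMain, NULL) != 0) return 1;
  initialized = 1;
  printf("main: returning; the destructor stops the worker\n");
  return 0;
}
```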
+static void rasThreadCleanup() { + rasClientSupportTerminate(); + rasNetTerminate(); + rasCollectivesTerminate(); + rasPeersTerminate(); + + { + std::lock_guard<std::mutex> lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + // rasClientListeningSocket is taken care of by rasClientSupportTerminate(). + rasNotificationPipe[0] = rasNotificationPipe[1] = -1; + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitRefCount = 0; + rasInitialized = false; + } + + { + std::lock_guard<std::mutex> lock(ncclCommsMutex); + free(ncclComms); + ncclComms = nullptr; + nNcclComms = 0; + ncclCommsSorted = false; + } + + free(rasPfds); + rasPfds = nullptr; + nRasPfds = 0; } @@ -270,10 +309,10 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms else ncclIntruQueueEnqueue(&conn->sendQ, meta); - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { - rasPfds[sock->pfd].events |= POLLOUT; + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY || + (conn->sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[conn->sock->pfd].events |= POLLOUT; ready = true; } } @@ -283,31 +322,31 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", msg->type, ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + (conn->sock ? conn->sock->status : -1)); } } // Attempts to send the queued RAS messages to another RAS thread. ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { - struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; struct rasMsgMeta* meta; *closed = 0; while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { - if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + if (conn->sock->status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { // We don't send anything beyond the handshake at this point. meta = nullptr; break; } if (meta->offset < sizeof(meta->length)) { // Send the length of the message. - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, &meta->length, sizeof(meta->length), + &meta->offset, closed)); if (*closed) return ncclSuccess; if (meta->offset < sizeof(meta->length)) break; } // Send the body of the message.
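Each queued message travels as a 32-bit length followed by the body, and `meta->offset` counts bytes across prefix and body together, so the body send below resumes exactly where the prefix send stopped. The bookkeeping in isolation (`progressSend` is a hypothetical stand-in for `ncclSocketProgress` that just consumes up to `cap` bytes):

```
#include <stdio.h>
#include <string.h>

// Stand-in for a non-blocking send: consumes up to cap bytes, advances *offset.
static void progressSend(const char* buf, int size, int* offset, int cap) {
  int n = size - *offset;
  if (n > cap) n = cap;
  // A real implementation would write() buf+*offset here; we only track progress.
  *offset += n;
}

int main(void) {
  int length = 10;                      // Body length, sent first.
  char frame[4 + 10];                   // Length prefix followed by the body.
  memcpy(frame, &length, sizeof(length));
  memset(frame + 4, 'x', length);

  int offset = 0;                       // Spans prefix and body together.
  while (offset < (int)sizeof(frame)) {
    if (offset < (int)sizeof(length))
      progressSend(frame, sizeof(length), &offset, 3);  // Still in the prefix.
    else
      progressSend(frame, sizeof(frame), &offset, 6);   // Into the body.
    printf("offset now %d\n", offset);  // 3, 4, 10, 14.
  }
  return 0;
}
```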
- NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, ((char*)&meta->msg)-sizeof(meta->length), meta->length+sizeof(meta->length), &meta->offset, closed)); if (*closed) return ncclSuccess; @@ -377,7 +416,7 @@ ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { ncclResult_t ret = ncclSuccess; struct rasConnection* conn = nullptr; - int connIdx, peerIdx; + int peerIdx; struct rasMsg* newMsg = nullptr; int newMsgLen; char line[SOCKET_NAME_MAXLEN+1]; @@ -406,19 +445,16 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc } // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). - connIdx = rasConnFind(&msg->connInit.listeningAddr); - if (connIdx != -1) { - conn = rasConns+connIdx; - + conn = rasConnFind(&msg->connInit.listeningAddr); + if (conn) { INFO(NCCL_RAS, "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); - if (conn->sockIdx != -1) { - struct rasSocket* connSock = rasSockets+conn->sockIdx; + if (conn->sock) { INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", - connSock->status, (clockNano()-connSock->createTime)/1e9); + conn->sock->status, (clockNano()-conn->sock->createTime)/1e9); // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have // a race where both sides attempt to establish a connection at roughly the same time, so the other side's // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. @@ -433,21 +469,19 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc goto exit; } else { INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); - rasSocketTerminate(connSock); + rasSocketTerminate(conn->sock); } } - } - if (!conn) { + } else { // conn == nullptr NCCLCHECK(getNewConnEntry(&conn)); memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); - connIdx = conn - rasConns; } sock->status = RAS_SOCK_READY; // rasConnResume will reset any experiencingDelays, startRetryTime, etc. - conn->sockIdx = sock-rasSockets; - sock->connIdx = connIdx; + conn->sock = sock; + sock->conn = conn; memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); // Make sure that the connection is part of the right links forming the RAS network. At this point we only @@ -456,8 +490,8 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before // the peers update. if (peerIdx != -1) { - (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); - (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + (void)rasLinkConnUpdate(&rasNextLink, conn, peerIdx); + (void)rasLinkConnUpdate(&rasPrevLink, conn, peerIdx); } // Send a confirmation to the server that requested the connection (so that the resilience code can mark @@ -504,12 +538,13 @@ static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct ras } // Handles the deadPeer broadcast. 
-void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { - INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&(*pReq)->deadPeer.addr, rasLine)); - if (!rasPeerIsDead(&req->deadPeer.addr)) { - rasConnDisconnect(&req->deadPeer.addr); - (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pReqLen = rasCollDataLength(RAS_BC_DEADPEER); + if (!rasPeerIsDead(&(*pReq)->deadPeer.addr)) { + rasConnDisconnect(&(*pReq)->deadPeer.addr); + (void)rasPeerDeclareDead(&(*pReq)->deadPeer.addr); *pDone = false; } else { INFO(NCCL_RAS, "RAS already knew it was dead"); @@ -530,6 +565,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock) { INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_MSG_CONNINITACK; msg.connInitAck.nack = 1; offset = 0; @@ -557,16 +593,16 @@ static void* rasThreadMain(void*) { INFO(NCCL_RAS, "RAS thread started"); // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasNotificationPipe[0]; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); - NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, exit); rasPfds[pfd].fd = rasNetListeningSocketFd; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasClientListeningSocket; rasPfds[pfd].events = POLLIN; @@ -595,32 +631,37 @@ static void* rasThreadMain(void*) { if (rasPfds[pollIdx].revents) { nEvents--; if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { - (void)rasLocalHandle(); + bool terminate = false; + NCCLCHECKGOTO(rasLocalHandle(&terminate), ret, exit); + if (terminate) + goto exit; } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { (void)rasNetAcceptNewSocket(); } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { (void)rasClientAcceptNewSocket(); } else { // Check if it's one of the RAS sockets. - int sockIdx; - for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; - if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { - rasSockEventLoop(sockIdx, pollIdx); + struct rasSocket* sock; + for (sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sock, pollIdx); break; } - } // for (sockIdx) + sock = sockNext; + } // for (sock) - if (sockIdx == nRasSockets) { + if (sock == nullptr) { // Try a client socket instead. 
- for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { - struct rasClient* client = rasClients+clientIdx; - if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { - rasClientEventLoop(clientIdx, pollIdx); + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + if (rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(client, pollIdx); break; } - } // for (clientIdx) - } // if (sockIdx == nRasSockets) + client = clientNext; + } // for (client) + } // if (sock == nullptr) } // dynamic fds } // if (revents) } // for (pollIdx) @@ -636,14 +677,9 @@ static void* rasThreadMain(void*) { rasCollsHandleTimeouts(now, &nextWakeup); } // for (;;) -fail: - WARN("fatal error - RAS thread terminating"); - std::lock_guard lock(rasInitMutex); - (void)close(rasNotificationPipe[1]); - (void)close(rasNotificationPipe[0]); - (void)close(rasClientListeningSocket); - (void)ncclSocketClose(&rasNetListeningSocket); - rasInitialized = false; +exit: + rasThreadCleanup(); + INFO(NCCL_RAS, "RAS thread terminating"); return nullptr; } diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 715fff4..17326c3 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -42,6 +42,14 @@ typedef enum { RAS_COLL_COMMS = 1002, // Collect data about all communicators. } rasCollectiveType; +// Unique communicator identifier. commHash by itself is definitely not guaranteed to be unique. +// Combined with the two other hashes, the chance is much better... +// All three fields are used for sorting. +struct rasCommId { + uint64_t commHash; + uint64_t hostHash, pidHash; // These are the hashes of the *first* rank (comm->peerInfo[0]). +}; + // Payload of a collective request message (RAS_MSG_COLLREQ). struct rasCollRequest { union ncclSocketAddress rootAddr; @@ -56,6 +64,10 @@ struct rasCollRequest { struct { } conns; struct { + int nSkipMissingRanksComms; // Number of elements in the array below. + // Communicators for which we do *not* need the missingRanks data in the responses + // (see struct rasCollCommsMissingRank later). + struct rasCommId skipMissingRanksComms[0]; // Variable length, sorted. } comms; }; }; @@ -69,8 +81,8 @@ struct rasCollResponse { int nPeers; int nData; // Size of data in bytes. union ncclSocketAddress peers[0]; // Variable length. - // The peersAddrs array is followed by: - //alignas(int64_t) char data[0]; // Variable length, collective-dependent. + // The peers array is followed by: + // alignas(int64_t) char data[0]; // Variable length, collective-dependent. }; // Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each @@ -80,6 +92,8 @@ struct rasPeerInfo { pid_t pid; uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. + uint64_t hostHash, pidHash; // Taken from ncclComm, but with the commHash subtracted to make it + // communicator-independent. }; // Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host @@ -112,7 +126,7 @@ struct rasMsg { int nPeers; int nDeadPeers; struct rasPeerInfo peers[0]; // Variable length. - // The peers array is followed by the following: + // The peers array is followed by: //union ncclSocketAddress deadPeers[0]; // Variable length. 
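A note on the new `hostHash`/`pidHash` fields in `rasPeerInfo` above: the per-rank hashes stored in `ncclComm` have the communicator's `commHash` mixed in, so RAS stores them with the `commHash` subtracted to get communicator-independent values that can be matched across communicators. Assuming additive mixing, as the comment implies, the idea reduces to:

```
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t hostHashBase = 0xabcdef12345678ULL;  // Hypothetical per-host hash.
  uint64_t commHashA = 1111, commHashB = 2222;  // Two communicators.

  // As seen inside each ncclComm: the stored hashes differ per communicator.
  uint64_t hostHashInA = hostHashBase + commHashA;
  uint64_t hostHashInB = hostHashBase + commHashB;

  // Subtracting the commHash recovers a communicator-independent value,
  // so entries originating from different comms compare equal.
  printf("%d\n", (hostHashInA - commHashA) == (hostHashInB - commHashB));  // 1
  return 0;
}
```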
} peersUpdate; struct { @@ -218,6 +232,9 @@ struct rasMsgMeta { // Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response). // For every collective operation, each participating RAS thread will create its own. struct rasCollective { + struct rasCollective* next; + struct rasCollective* prev; + union ncclSocketAddress rootAddr; uint64_t rootId; @@ -227,15 +244,16 @@ struct rasCollective { bool timeoutWarned; int64_t startTime; // For timeout calculations. - int fromConnIdx; // The connection we received the request from. + struct rasConnection* fromConn; // The connection we received the request from. - int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive. + struct rasConnection** fwdConns; // Connections we forwarded the request to; replaced by nullptr's as the + // responses arrive. int nFwdSent; // Count of the above (local process only). int nFwdRecv; // Count of the responses received or timeouts (local process only). int nLegTimeouts; // Collective (from this process and the responses we received). - union ncclSocketAddress* peers; // Collective (from this process and the responses we received). + union ncclSocketAddress* peers; // Collective (from this process and the responses we received). Unsorted. int nPeers; char* data; // Collective (from this process and the responses we received). @@ -261,13 +279,14 @@ struct rasCollConns { struct rasCollComms { int nComms; struct comm { - uint64_t commHash; - int commNRanks; - int nRanks; // number of elements in the array below, *not* in the communicator. + struct rasCommId commId; + int commNRanks; // >= nRanks + nMissingRanks + int nRanks; // Number of elements in the ranks array below, *not* in the communicator. + int nMissingRanks; // Number of elements in the missingRanks array below. struct rank { int commRank; int peerIdx; // Index within rasCollective->peers, *not* rasPeers. - uint64_t collOpCount; + uint64_t collOpCounts[NCCL_NUM_FUNCTIONS]; struct { ncclResult_t initState:4; ncclResult_t asyncError:4; @@ -278,34 +297,47 @@ struct rasCollComms { char cudaDev; char nvmlDev; } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process. - } comms[0]; // Variable length. Sorted by commHash. + // The ranks array is followed by: + // struct rasCollCommsMissingRank missingRanks[0]; // Variable length. Sorted by commRank. + } comms[0]; // Variable length. Sorted by commId. +}; + +// Provides info about missing ranks. An array of these structures can be part of struct rasCollComms above. +// Because the arrays are of variable length, we can't describe them in C. To ensure that adding +// rasCollCommsMissingRank structures doesn't mess up the alignment, we explicitly request one. +struct alignas(struct rasCollComms) rasCollCommsMissingRank { + int commRank; + union ncclSocketAddress addr; + // We don't need pid here as we can look it up in rasPeers via addr. + char cudaDev; + char nvmlDev; }; // Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one // or one of the fallbacks). struct rasLinkConn { + struct rasLinkConn* next; int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates). - int connIdx; // Index in the rasConns array of the connection to the above peer. 
Could be -1 (a placeholder - // for a connection to be started by the remote peer). + struct rasConnection* conn; // The connection to the above peer. Could be nullptr (a placeholder for a connection + // to be started by the remote peer). bool external; // true if the entry exists only due to an external request (requested by a remote peer, most // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a // valid primary connection, in order to ensure that keep-alive messages are sent. }; // Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in -// case of 1-D topology) rather than a particular destination. The are implemented using rasConnections, but +// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but // they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS // network is reconfigured or a peer dies. struct rasLink { int direction; // 1 for nextLink, -1 for prevLink. - // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having - // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have - // the lowest indices). + // First element is the primary connection; any additional ones are fallbacks (that get created if we are having + // problems with the primary connection). The highest-preference elements come first; the list is de-facto sorted + // by peerIdx, though peerIdx values can wrap around (given the ring/torus topology) and they can also be -1 + // (the latter are stored at the end). struct rasLinkConn* conns; - int nConns; - int connsSize; // Array size; could be larger than nConns. // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect // the peer on the other side to do so) but that peer failed to initiate. @@ -315,15 +347,15 @@ struct rasLink { // Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile // socket (described by the rasSocket structure), which can be affected by transient network issues. struct rasConnection { - bool inUse; + struct rasConnection* next; + struct rasConnection* prev; union ncclSocketAddress addr; - // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // Pointer to the current rasSocket. Note that multiple rasSocket entries may point back // to a single entry here, for sockets that are in the process of being terminated and re-established. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. - // -1 if there is no such socket. - int sockIdx; + // nullptr if there is no such socket. + struct rasSocket* sock; // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. @@ -371,16 +403,18 @@ typedef enum { // Describes a socket implementing communication between two peers. struct rasSocket { + struct rasSocket* next; + struct rasSocket* prev; + struct ncclSocket sock; rasSocketStatus status; int pfd; // Index in the rasPfds array. - // Index of the corresponding entry in the rasConns array. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. 
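Since `rasLink` now keeps its conns as a linked list ordered by preference (primary first, fallbacks after, de-facto sorted by `peerIdx`), the usual pointer-to-pointer insertion idiom applies. A reduced sketch that ignores the wraparound and `peerIdx == -1` cases the real code handles:

```
#include <stdio.h>
#include <stdlib.h>

// Simplified stand-in for rasLinkConn: a singly linked list kept sorted.
struct linkConn {
  struct linkConn* next;
  int peerIdx;
};

// Insert keeping ascending peerIdx order. Walking a pointer-to-pointer
// handles the empty-list and insert-at-head cases without special-casing.
static void insertOrdered(struct linkConn** head, struct linkConn* conn) {
  struct linkConn** pp = head;
  while (*pp && (*pp)->peerIdx < conn->peerIdx) pp = &(*pp)->next;
  conn->next = *pp;
  *pp = conn;
}

int main(void) {
  struct linkConn* head = NULL;
  int peers[] = {7, 2, 5};
  for (int i = 0; i < 3; i++) {
    struct linkConn* c = calloc(1, sizeof(*c));
    c->peerIdx = peers[i];
    insertOrdered(&head, c);
  }
  for (struct linkConn* c = head; c; c = c->next) printf("%d ", c->peerIdx);  // 2 5 7
  printf("\n");
  while (head) { struct linkConn* n = head->next; free(head); head = n; }
  return 0;
}
```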
- // -1 if there is no connection (normal condition on the accept side before the connInit message). - int connIdx; + // Pointer to the corresponding entry in the rasConns array. + // nullptr if there is no connection (a normal condition on the accept side before the connInit message). + struct rasConnection* conn; int64_t createTime; int64_t lastSendTime; @@ -404,7 +438,10 @@ typedef enum { // Describes a RAS client. struct rasClient { - int sock; + struct rasClient* next; + struct rasClient* prev; + + int sock; // File descriptor rasClientStatus status; @@ -420,7 +457,7 @@ struct rasClient { int64_t timeout; // State stored during asynchronous operations such as collectives. - int collIdx; // Index to the onging rasCollective. + struct rasCollective* coll; }; @@ -440,31 +477,33 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent); ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed); ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock); -void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone); ncclResult_t rasGetNewPollEntry(int* index); // rasnet.cc extern struct rasLink rasNextLink, rasPrevLink; -extern struct rasConnection* rasConns; -extern int nRasConns; -extern struct rasSocket *rasSockets; -extern int nRasSockets; +extern struct rasConnection* rasConnsHead; +extern struct rasConnection* rasConnsTail; +extern struct rasSocket *rasSocketsHead; +extern struct rasSocket *rasSocketsTail; ncclResult_t getNewConnEntry(struct rasConnection** pConn); -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx); -int rasConnFind(const union ncclSocketAddress* addr); +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn); +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr); void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasConnDisconnect(const union ncclSocketAddress* addr); ncclResult_t rasNetAcceptNewSocket(); void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0, bool retry = true); -void rasSockEventLoop(int sockIdx, int pollIdx); +void rasSockEventLoop(struct rasSocket* sock, int pollIdx); void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup); ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock); -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false, - bool insert = false, bool pretend = false, int* pLinkIdx = nullptr); +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn); +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx); +void rasNetTerminate(); + // peers.cc extern struct rasPeerInfo* rasPeers; @@ -483,29 +522,35 @@ ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr); bool rasPeerIsDead(const union ncclSocketAddress* addr); int ncclSocketsCompare(const void* p1, const void* p2); bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2); +void rasPeersTerminate(); // collectives.cc -extern struct rasCollective* rasCollectives; +extern struct rasCollective* rasCollectivesHead; +extern struct 
rasCollective* rasCollectivesTail; void rasCollReqInit(struct rasCollRequest* req); -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr, - int* pCollIdx = nullptr, int fromConnIdx = -1); +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone = nullptr, + struct rasCollective** pColl = nullptr, struct rasConnection* fromConn = nullptr); ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock); ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock); -void rasCollsPurgeConn(int connIdx); +void rasCollsPurgeConn(struct rasConnection* conn); void rasCollFree(struct rasCollective* coll); void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup); +void rasCollectivesTerminate(); + // client_support.cc extern int rasClientListeningSocket; -extern struct rasClient* rasClients; -extern int nRasClients; +extern struct rasClient* rasClientsHead; +extern struct rasClient* rasClientsTail; + ncclResult_t rasClientInitSocket(); ncclResult_t rasClientAcceptNewSocket(); ncclResult_t rasClientResume(struct rasCollective* coll); -void rasClientEventLoop(int clientIdx, int pollIdx); +void rasClientEventLoop(struct rasClient* client, int pollIdx); const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size); +void rasClientSupportTerminate(); #endif // !NCCL_RAS_CLIENT diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 441ad19..43aa042 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -13,90 +13,106 @@ struct rasLink rasNextLink = {1}, rasPrevLink = {-1}; // Connections on the RAS network. -struct rasConnection* rasConns; -int nRasConns; +struct rasConnection* rasConnsHead; +struct rasConnection* rasConnsTail; // Sockets implementing the RAS network. -struct rasSocket *rasSockets; -int nRasSockets; +struct rasSocket *rasSocketsHead; +struct rasSocket *rasSocketsTail; // Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but // I didn't want to use -1 because it has a special meaning for us. 
#define POLL_FD_IGNORE -2 +static void freeConnEntry(struct rasConnection* conn); static void rasConnOpen(struct rasConnection* conn); static ncclResult_t rasConnPrepare(struct rasConnection* conn); static void rasConnTerminate(struct rasConnection* conn); static ncclResult_t getNewSockEntry(struct rasSocket** pSock); +static void freeSockEntry(struct rasSocket* sock); static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup); -static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup); +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup); static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false); -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx); static void rasConnResume(struct rasConnection* conn); static void rasLinkSanitizeFallbacks(struct rasLink* link); -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1); -static int rasLinkFindConn(const struct rasLink* link, int connIdx); +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend = false, + int* pLinkIdx = nullptr, struct rasLinkConn** pLinkConn = nullptr, + bool insert = true); +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx); +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external = false); +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx = nullptr); /////////////////////////////////////////////// // Functions related to the RAS connections. // /////////////////////////////////////////////// -// Allocates an entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasConnections list. ncclResult_t getNewConnEntry(struct rasConnection** pConn) { struct rasConnection* conn; - int i; - for (i = 0; i < nRasConns; i++) - if (!rasConns[i].inUse) - break; - if (i == nRasConns) { - NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT)); - nRasConns += RAS_INCREMENT; - } - conn = rasConns+i; - memset(conn, '\0', sizeof(*conn)); - conn->inUse = true; - conn->sockIdx = -1; + NCCLCHECK(ncclCalloc(&conn, 1)); + ncclIntruQueueConstruct(&conn->sendQ); conn->travelTimeMin = INT64_MAX; conn->travelTimeMax = INT64_MIN; + if (rasConnsHead) { + rasConnsTail->next = conn; + conn->prev = rasConnsTail; + rasConnsTail = conn; + } else { + rasConnsHead = rasConnsTail = conn; + } + *pConn = conn; return ncclSuccess; } +// Frees an entry from the rasConns list. +static void freeConnEntry(struct rasConnection* conn) { + if (conn == nullptr) + return; + + if (conn == rasConnsHead) + rasConnsHead = rasConnsHead->next; + if (conn == rasConnsTail) + rasConnsTail = rasConnsTail->prev; + if (conn->prev) + conn->prev->next = conn->next; + if (conn->next) + conn->next->prev = conn->prev; + free(conn); +} + // Creates a new RAS network connection to a remote peer address. -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn) { ncclResult_t ret = ncclSuccess; - struct rasConnection* conn = nullptr; + struct rasConnection* conn; // First check if a connection entry for this peer already exists. 
- int connIdx = rasConnFind(addr); - if (connIdx != -1) { - conn = rasConns+connIdx; - } + conn = rasConnFind(addr); - if (conn && conn->sockIdx != -1) { + if (conn && conn->sock) { // An entry exists and has a socket associated with it -- nothing left for us to do. - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; goto exit; } - if (!conn) { + if (conn == nullptr) { NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); memcpy(&conn->addr, addr, sizeof(conn->addr)); // We are establishing a new connection -- start the timeout. conn->startRetryTime = clockNano(); - connIdx = conn - rasConns; } - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; rasConnOpen(conn); @@ -107,7 +123,7 @@ exit: // Opens a connection to a remote peer. static void rasConnOpen(struct rasConnection* conn) { ncclResult_t ret; // Not used. - struct rasSocket* sock; + struct rasSocket* sock = nullptr; bool closeSocketOnFail = false; int ready; @@ -120,10 +136,8 @@ static void rasConnOpen(struct rasConnection* conn) { NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures - // we don't need to clean them up. - conn->sockIdx = sock-rasSockets; - sock->connIdx = conn-rasConns; + conn->sock = sock; + sock->conn = conn; rasPfds[sock->pfd].fd = sock->sock.fd; // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because @@ -141,6 +155,7 @@ exit: fail: if (closeSocketOnFail) (void)ncclSocketClose(&sock->sock); + freeSockEntry(sock); goto exit; } @@ -166,16 +181,13 @@ static ncclResult_t rasConnPrepare(struct rasConnection* conn) { } // Searches through rasConns for a connection with a provided address. -int rasConnFind(const union ncclSocketAddress* addr) { - // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way) - // so binary search won't do... - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) - return i; +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) + return conn; } - return -1; + return nullptr; } // Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled @@ -184,58 +196,56 @@ int rasConnFind(const union ncclSocketAddress* addr) { // This is also where we declare peers as dead, etc. // Invoked from the main RAS event loop. void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int connIdx = 0; connIdx < nRasConns; connIdx++) { - struct rasConnection* conn = rasConns+connIdx; - - if (!conn->inUse) - continue; - - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + if (conn->sock) { bool sockTerminated = false; // Retry the socket connections that have been refused. 
- if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) { - if (now - sock->lastSendTime > RAS_CONNECT_RETRY) { + if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) { + if (now - conn->sock->lastSendTime > RAS_CONNECT_RETRY) { int ready; - if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + if (ncclSocketReady(&conn->sock->sock, &ready) != ncclSuccess) { INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s", - ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true); + ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true); // We will retry below in the same loop. sockTerminated = true; } else { // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations. - sock->lastSendTime = clockNano(); - if (!ready && sock->sock.state == ncclSocketStateConnecting) - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + conn->sock->lastSendTime = clockNano(); + if (!ready && conn->sock->sock.state == ncclSocketStateConnecting) + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); else - rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop. + rasPfds[conn->sock->pfd].fd = conn->sock->sock.fd; // Enable the handling via the main loop. } // if (ncclSocketReady) } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); } - } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) + } // if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) // For connections that have data to send but that we've been unable to send a message on for a while, // consider their sockets lost and terminate them. - if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) { - if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { + if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) { + if (now - std::max(conn->sock->lastSendTime, + ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s", - (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / - CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT); + (now - std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / + CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/false, RAS_STUCK_TIMEOUT); // We will retry below in the same loop. 
} else { - *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, - ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT); + *nextWakeup = std::min(*nextWakeup, + std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+ + RAS_STUCK_TIMEOUT); } - } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (!ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) // For connections that are being (re-)established, irrespective of whether there's a valid socket associated - // with them (conn->startIdx != -1), we need to check if any connection-level timeout has expired. + // with them, we need to check if any connection-level timeout has expired. if (conn->startRetryTime) { + bool connTerminated = false; // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead // so that we don't try again. if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { @@ -248,82 +258,83 @@ void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { rasCollReqInit(&bCast); bCast.type = RAS_BC_DEADPEER; memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); - (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + (void)rasNetSendCollReq(&bCast); - continue; + connTerminated = true; } else { *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); } // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via - // the conn->sockIdx == -1 test). + // the conn->sock == nullptr test). - // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try - // to establish fallback connections. - if (now - conn->startRetryTime > RAS_CONNECT_WARN) { - if (!conn->experiencingDelays) { - INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", - (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + if (!connTerminated) { + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); - // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback - // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish - // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. - conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns may have been reallocated by the above calls. - conn = rasConns+connIdx; + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); - // Stop collectives from waiting for a response over it. 
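The connection-level timeouts in this hunk form an escalation ladder: reopen the socket every `RAS_CONNECT_RETRY`, warn and start fallbacks after `RAS_CONNECT_WARN`, and declare the peer dead after `RAS_PEER_DEAD_TIMEOUT`; in parallel, `nextWakeup` is lowered to the earliest pending deadline so the event loop never sleeps past it. A sketch of that deadline bookkeeping (constants are illustrative, not NCCL's actual values):

```
#include <stdint.h>
#include <stdio.h>

#define CONNECT_RETRY   1000  // Illustrative deadlines, not NCCL's values.
#define CONNECT_WARN    5000
#define PEER_DEAD      60000

static int64_t minI64(int64_t a, int64_t b) { return a < b ? a : b; }

int main(void) {
  int64_t now = 10000, startRetryTime = 6000, lastRetryTime = 9500;
  int64_t nextWakeup = INT64_MAX;

  if (now - startRetryTime > PEER_DEAD) {
    printf("declare peer dead\n");
  } else {
    nextWakeup = minI64(nextWakeup, startRetryTime + PEER_DEAD);
    if (now - startRetryTime > CONNECT_WARN) printf("warn, add fallbacks\n");
    else nextWakeup = minI64(nextWakeup, startRetryTime + CONNECT_WARN);
    if (now - lastRetryTime > CONNECT_RETRY) printf("reopen socket\n");
    else nextWakeup = minI64(nextWakeup, lastRetryTime + CONNECT_RETRY);
  }
  printf("sleep until %lld\n", (long long)nextWakeup);  // 10500: earliest deadline.
  return 0;
}
```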
-          rasCollsPurgeConn(connIdx);
-        } // if (!conn->experiencingDelays)
-      } else {
-        *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN);
-      }
-
-      // If a socket was terminated (or never opened, due to some error), try to open it now.
-      // We retry once a second.
-      if (conn->sockIdx == -1) {
-        if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) {
-          INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)",
-               ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays,
-               (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0));
-          rasConnOpen(conn);
+            // Stop collectives from waiting for a response over it.
+            rasCollsPurgeConn(conn);
+          } // if (!conn->experiencingDelays)
+        } else {
+          *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN);
         }
-        if (conn->sockIdx == -1)
-          *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY);
-      }
+
+        // If a socket was terminated (or never opened, due to some error), try to open it now.
+        // We retry once a second.
+        if (conn->sock == nullptr) {
+          if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) {
+            INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)",
+                 ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays,
                 (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0));
+            rasConnOpen(conn);
+          }
+          if (conn->sock == nullptr)
+            *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY);
+        }
+      } // if (!connTerminated)
     } // if (conn->startRetryTime)
-  } // for (connIdx)
+
+    conn = connNext;
+  } // for (conn)
 }
 
 // Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the
 // RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead.
 void rasConnDisconnect(const union ncclSocketAddress* addr) {
-  int connIdx = rasConnFind(addr);
-  if (connIdx != -1) {
-    (void)rasLinkAddFallback(&rasNextLink, connIdx);
-    (void)rasLinkAddFallback(&rasPrevLink, connIdx);
-    rasLinkDropConn(&rasNextLink, connIdx);
-    rasLinkDropConn(&rasPrevLink, connIdx);
+  struct rasConnection* conn = rasConnFind(addr);
+  if (conn) {
+    (void)rasLinkAddFallback(&rasNextLink, conn);
+    (void)rasLinkAddFallback(&rasPrevLink, conn);
+    rasLinkConnDrop(&rasNextLink, conn);
+    rasLinkConnDrop(&rasPrevLink, conn);
 
-    rasConnTerminate(rasConns+connIdx);
+    rasConnTerminate(conn);
  }
 }
 
 // Terminates a connection and frees the rasConns entry.
 static void rasConnTerminate(struct rasConnection* conn) {
-  int connIdx = conn - rasConns;
   // Make sure there are no lingering rasSockets pointing to it.
-  for (int i = 0; i < nRasSockets; i++) {
-    struct rasSocket* sock = rasSockets+i;
-    if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx)
+  for (struct rasSocket* sock = rasSocketsHead; sock;) {
+    struct rasSocket* sockNext = sock->next;
+    if (sock->conn == conn)
       rasSocketTerminate(sock, /*finalize*/true);
+    sock = sockNext;
   }
 
   // Also check any ongoing collectives.
-  rasCollsPurgeConn(connIdx);
+  rasCollsPurgeConn(conn);
 
   while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) {
     free(meta);
@@ -331,8 +342,7 @@ static void rasConnTerminate(struct rasConnection* conn) {
   INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine));
 
-  conn->inUse = false;
-  conn->sockIdx = -1; // Should be that way already, but just to be extra sure...
+  freeConnEntry(conn);
 }
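A note on the loops above and in rasConnTerminate: the refactor replaces index-based arrays with linked lists, so any traversal that may free the element it currently stands on snapshots the next pointer first (sockNext, connNext). A minimal standalone sketch of that discipline; Node and its dead flag are illustrative stand-ins, not NCCL types:

```
#include <cstdlib>

struct Node { Node* next; bool dead; };

// Sweep a singly-linked list, freeing flagged nodes. 'next' is read before
// any action, because free(n) would invalidate n->next.
static void sweep(Node** head) {
  Node* prev = nullptr;
  for (Node* n = *head; n;) {
    Node* next = n->next;                  // Must be saved first.
    if (n->dead) {
      (prev ? prev->next : *head) = next;  // Unlink, then free.
      free(n);
    } else {
      prev = n;
    }
    n = next;
  }
}

int main() {
  Node* a = (Node*)calloc(1, sizeof(Node));
  Node* b = (Node*)calloc(1, sizeof(Node));
  a->next = b; b->dead = true;
  Node* head = a;
  sweep(&head);  // b is unlinked and freed; a remains.
  free(a);
  return 0;
}
```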
@@ -344,7 +354,7 @@ static void rasConnTerminate(struct rasConnection* conn) {
 // corresponding rasConnection can't be established without knowing the peer's address.
 ncclResult_t rasNetAcceptNewSocket() {
   ncclResult_t ret = ncclSuccess;
-  struct rasSocket* sock;
+  struct rasSocket* sock = nullptr;
   int ready;
   bool socketInitialized = false;
   NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail);
@@ -370,91 +380,98 @@ exit:
 fail:
   if (socketInitialized)
     NCCLCHECK(ncclSocketClose(&sock->sock));
+  freeSockEntry(sock);
   goto exit;
 }
 
-// Returns the index of the first available entry in the rasConns array, enlarging the array if necessary.
+// Allocates a new entry in the rasSockets list.
 static ncclResult_t getNewSockEntry(struct rasSocket** pSock) {
   struct rasSocket* sock;
-  int i;
-  for (i = 0; i < nRasSockets; i++)
-    if (rasSockets[i].status == RAS_SOCK_CLOSED)
-      break;
-  if (i == nRasSockets) {
-    NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT));
-    nRasSockets += RAS_INCREMENT;
-  }
-  sock = rasSockets+i;
-  memset(sock, '\0', sizeof(*sock));
+  NCCLCHECK(ncclCalloc(&sock, 1));
+
   sock->pfd = -1;
-  sock->connIdx = -1;
   sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano();
+
+  if (rasSocketsHead) {
+    rasSocketsTail->next = sock;
+    sock->prev = rasSocketsTail;
+    rasSocketsTail = sock;
+  } else {
+    rasSocketsHead = rasSocketsTail = sock;
+  }
+
   *pSock = sock;
   return ncclSuccess;
 }
 
+// Frees an entry from the rasSockets list.
+static void freeSockEntry(struct rasSocket* sock) {
+  if (sock == nullptr)
+    return;
+
+  if (sock == rasSocketsHead)
+    rasSocketsHead = rasSocketsHead->next;
+  if (sock == rasSocketsTail)
+    rasSocketsTail = rasSocketsTail->prev;
+  if (sock->prev)
+    sock->prev->next = sock->next;
+  if (sock->next)
+    sock->next->prev = sock->prev;
+  free(sock);
+}
+
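getNewSockEntry/freeSockEntry above replace the old realloc'ed rasSockets array with a doubly-linked list anchored by head and tail pointers. A reduced sketch of the same bookkeeping, with hypothetical names (Entry, listHead, listTail) standing in for the real structures:

```
#include <cstdlib>

struct Entry { Entry* prev; Entry* next; };
static Entry* listHead;
static Entry* listTail;

static Entry* allocEntry() {
  Entry* e = (Entry*)calloc(1, sizeof(Entry));
  if (e == nullptr) return nullptr;
  if (listHead) {            // Non-empty list: append after the current tail.
    listTail->next = e;
    e->prev = listTail;
    listTail = e;
  } else {                   // Empty list: e becomes both head and tail.
    listHead = listTail = e;
  }
  return e;
}

static void freeEntry(Entry* e) {
  if (e == nullptr) return;
  if (e == listHead) listHead = e->next;  // Fix up both anchors first,
  if (e == listTail) listTail = e->prev;
  if (e->prev) e->prev->next = e->next;   // then splice out of the chain.
  if (e->next) e->next->prev = e->prev;
  free(e);
}

int main() {
  Entry* a = allocEntry();
  Entry* b = allocEntry();
  freeEntry(a);  // Head advances to b.
  freeEntry(b);  // List is empty again.
  return 0;
}
```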
 // Invoked from the main RAS event loop to handle RAS socket timeouts.
 void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
-  for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
-    struct rasSocket* sock = rasSockets+sockIdx;
+  for (struct rasSocket* sock = rasSocketsHead; sock;) {
+    struct rasSocket* sockNext = sock->next;
 
-    if (sock->status == RAS_SOCK_CLOSED)
-      continue;
-
-    // For socket connections that are still being established, give up on the ones that take too long to initialize.
     if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) {
+      // For socket connections that are still being established, give up on the ones that take too long to initialize.
       if (now - sock->createTime > RAS_STUCK_TIMEOUT) {
-        if (sock->connIdx == -1) {
+        if (sock->conn == nullptr) {
           INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s",
                (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
         } else {
-          struct rasConnection* conn = rasConns+sock->connIdx;
           INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s "
                "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)",
                (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine),
-               conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0),
-               sock->status);
+               sock->conn->experiencingDelays,
+               (sock->conn->startRetryTime ? (now-sock->conn->startRetryTime)/1e9 : 0.0), sock->status);
         }
         rasSocketTerminate(sock, /*finalize*/true);
         // We may retry later.
-        continue;
       } else {
         *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT);
       }
-    } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE)
-
-    // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long.
-    if (sock->status == RAS_SOCK_TERMINATING) {
+    } else if (sock->status == RAS_SOCK_TERMINATING) {
+      // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long.
       if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) {
         INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s",
              (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&sock->sock.addr, rasLine));
         rasSocketTerminate(sock, /*finalize*/true);
         // This socket is presumably already being re-established, if needed.
-        continue;
       } else {
         *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT);
       }
-    } // if (sock->status == RAS_SOCK_TERMINATING)
-
-    // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything
-    // important due to shorter timeouts on RAS network connections, but in case of weird situations like process
-    // suspend, rasSocketTerminate will do additional checking.
-    if (sock->status == RAS_SOCK_READY) {
+    } else if (sock->status == RAS_SOCK_READY) {
+      // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything
+      // important due to shorter timeouts on RAS network connections, but in case of weird situations like process
+      // suspend, rasSocketTerminate will do additional checking.
       if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) {
         INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s",
              (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&sock->sock.addr, rasLine));
         rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false);
-        continue;
         // The RAS network timeout handler will terminate the conn it was associated with, if any.
       } else {
         *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT);
       }
     } // if (sock->status == RAS_SOCK_READY)
-  } // for (sockIdx)
+
+    sock = sockNext;
+  } // for (sock)
 }
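Every branch of rasSocksHandleTimeouts follows the same scheduling pattern: an expired deadline triggers an action, and an unexpired one lowers *nextWakeup so the main loop's poll() wakes up no later than the earliest pending deadline. A compact sketch of that pattern, with illustrative names rather than the RAS API:

```
#include <algorithm>
#include <cstdint>
#include <cstdio>

// If 'since + timeout' has passed, the caller acts on it; otherwise
// *nextWakeup is pulled forward so the event loop sleeps just long enough.
static bool expired(int64_t now, int64_t since, int64_t timeout, int64_t* nextWakeup) {
  if (now - since > timeout) return true;
  *nextWakeup = std::min(*nextWakeup, since + timeout);
  return false;
}

int main() {
  int64_t now = 1000, nextWakeup = INT64_MAX;
  if (!expired(now, /*since*/900, /*timeout*/500, &nextWakeup))
    printf("sleep until %ld\n", (long)nextWakeup);  // Prints 1400.
  return 0;
}
```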
 // Handles the termination of a RAS socket.
@@ -464,19 +481,19 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
 // For not fully established sockets, we can terminate immediately as there's no useful data to extract.
 void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) {
   assert(sock->status != RAS_SOCK_CLOSED);
-  if (sock->connIdx != -1) {
-    struct rasConnection* conn = rasConns+sock->connIdx;
-    // If the sockIdx of the connection points back to us, it means that we are the current socket of this
+  if (sock->conn) {
+    struct rasConnection* conn = sock->conn;
+    // If the sock of the connection points back to us, it means that we are the current socket of this
     // connection, so we have additional work to do before we can terminate it.
-    if (conn->sockIdx == sock-rasSockets) {
+    if (conn->sock == sock) {
       // Reset it to indicate there's no valid socket associated with that connection anymore.
-      conn->sockIdx = -1;
+      conn->sock = nullptr;
 
       // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably
       // deliberately closed them. Make an exception for sockets that are part of the RAS network links.
       if ((retry && clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) <
            RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) ||
-          rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) {
+          rasLinkConnFind(&rasNextLink, sock->conn) || rasLinkConnFind(&rasPrevLink, sock->conn)) {
         // For connections that were fine until now, the connection-level timeout starts at termination, and possibly
         // even earlier, depending on what event triggered the termination -- if it was another timeout expiring, then
         // we need to include that timeout as well.
@@ -507,11 +524,11 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
       } // if (retry)
 
       // Stop collectives from waiting for a response over this connection.
-      rasCollsPurgeConn(sock->connIdx);
-    } // if (conn->sockIdx == sock-rasSockets)
-  } // if (sock->connIdx != -1)
+      rasCollsPurgeConn(sock->conn);
+    } // if (conn->sock == sock)
+  } // if (sock->conn)
 
-  if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
+  if (sock->status != RAS_SOCK_CONNECTING && sock->conn && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
     if (sock->status != RAS_SOCK_TERMINATING) {
       // The receiving side is still open -- close just the sending side.
       (void)ncclSocketShutdown(&sock->sock, SHUT_WR);
@@ -525,20 +542,15 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
   } else {
     // Either the caller requested finalization or we cannot receive on it.
     (void)ncclSocketClose(&sock->sock);
-    sock->status = RAS_SOCK_CLOSED;
     rasPfds[sock->pfd].fd = -1;
     rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0;
-    sock->pfd = sock->connIdx = -1;
-    sock->recvOffset = sock->recvLength = 0;
     free(sock->recvMsg);
-    sock->recvMsg = nullptr;
+    freeSockEntry(sock);
   }
 }
 
 // Handles a ready socket FD from the main event loop.
-void rasSockEventLoop(int sockIdx, int pollIdx) {
-  struct rasSocket* sock = rasSockets+sockIdx;
-
+void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
   if (sock->status == RAS_SOCK_CONNECTING) {
     int ready;
     // Socket is not yet fully established. Continue the OS or NCCL-level handshake.
@@ -554,15 +566,15 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
       (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano();
       sock->status = RAS_SOCK_HANDSHAKE;
       if (connectSide) {
-        assert(sock->connIdx != -1);
-        if (rasConns[sock->connIdx].sockIdx == sockIdx) {
-          if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) {
+        assert(sock->conn);
+        if (sock->conn->sock == sock) {
+          if (rasConnPrepare(sock->conn) != ncclSuccess) {
             INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s",
                  ncclSocketToString(&sock->sock.addr, rasLine));
             rasSocketTerminate(sock);
             // We may retry further down.
           }
-        } else {
+        } else { // sock->conn->sock != sock
           // The connection this socket is associated with no longer considers it to be the current one.
           // This could possibly happen due to a race condition. Simply terminate it.
          INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!",
@@ -581,10 +593,9 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
     if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) {
       int closed = 0;
       bool allSent = false;
-      assert(sock->connIdx != -1);
-      struct rasConnection* conn = rasConns+sock->connIdx;
-      assert(conn->sockIdx == sockIdx);
-      if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) {
+      assert(sock->conn);
+      assert(sock->conn->sock == sock);
+      if (rasConnSendMsg(sock->conn, &closed, &allSent) != ncclSuccess) {
         INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s",
              ncclSocketToString(&sock->sock.addr, rasLine));
         rasSocketTerminate(sock);
@@ -612,9 +623,9 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
           // We may retry further down.
         } else if (closed) {
           const char* socketType;
-          if (sock->connIdx == -1)
+          if (sock->conn == nullptr)
             socketType = "incoming";
-          else if (rasConns[sock->connIdx].sockIdx != sockIdx)
+          else if (sock->conn->sock != sock)
             socketType = "old";
           else if (sock->status == RAS_SOCK_HANDSHAKE)
             socketType = "new";
@@ -624,25 +635,21 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
                socketType, ncclSocketToString(&sock->sock.addr, rasLine));
           rasSocketTerminate(sock, /*finalize*/true);
           // We may retry further down.
-        } else {
+        } else { // !closed
           sock->lastRecvTime = clockNano();
           if (msg) {
             (void)rasMsgHandle(msg, sock);
             free(msg);
-            // Message handlers can terminate a socket in certain cases; we need to check for
-            // that here so that we don't try to receive from a closed socket.
-            // No handlers are currently believed to create new sockets but better to be safe than sorry
-            // and re-init the sock variable.
-            sock = rasSockets+sockIdx;
-            if (sock->status == RAS_SOCK_CLOSED)
+            // Message handlers can terminate a socket in various cases. We re-check rasPfds.events to ensure that
+            // this hasn't happened here (rasSocketTerminate will reset it when finalizing a socket).
+            if (!(rasPfds[pollIdx].revents & POLLIN))
               break;
           }
-          if (sock->connIdx != -1) {
-            struct rasConnection* conn = rasConns+sock->connIdx;
-            if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays))
-              rasConnResume(conn);
+          if (sock->conn) {
+            if (sock->conn->sock == sock && (sock->conn->startRetryTime || sock->conn->experiencingDelays))
+              rasConnResume(sock->conn);
           }
-        }
+        } // !closed
       } while (msg);
     } // if (POLLIN)
   } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING
@@ -658,109 +665,95 @@ void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) {
   // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each
   // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish
   // connections that are part of a link from those that are not.
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
   (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup);
   (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup);
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
-    struct rasConnection* conn = rasConns+connIdx;
-    if (conn->inUse && !conn->linkFlag) {
+  for (struct rasConnection* conn = rasConnsHead; conn;) {
+    struct rasConnection* connNext = conn->next;
+    if (!conn->linkFlag) {
       // The connection is not part of any link. Check if it should be terminated.
-      if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) {
+      if (conn->sock == nullptr && ncclIntruQueueEmpty(&conn->sendQ))
         rasConnTerminate(conn);
-        continue;
-      }
     }
+    conn = connNext;
  }
 }
 
 // Checks for and handles timeouts at the link level; primarily the keep-alives for link connections.
 static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1) {
-      if (!rasConns[linkConn->connIdx].linkFlag) {
-        rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup);
-        // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here.
-        // For the same reason we re-init linkConn.
-        linkConn = link->conns+i;
-        rasConns[linkConn->connIdx].linkFlag = true;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn) {
+      if (!linkConn->conn->linkFlag) {
+        rasConnHandleNetTimeouts(linkConn->conn, now, nextWakeup);
+        linkConn->conn->linkFlag = true;
       }
-    } else if (i == 0 && link->lastUpdatePeersTime != 0) {
+    } else if (linkConn == link->conns && link->lastUpdatePeersTime != 0) {
       // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address
       // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action.
       if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) {
         INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s",
              (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
-        if (linkConn->connIdx != -1) {
-          rasConns[linkConn->connIdx].linkFlag = true;
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn));
+        if (linkConn->conn) {
+          linkConn->conn->linkFlag = true;
         }
-        // We used to connect to the first fallback but I think trying to connect to the calculated primary first
-        // in this case is more intuitive.
-        //(void)rasLinkTryFallback(link, -1);
         link->lastUpdatePeersTime = 0;
       } else {
         *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN);
       }
-    } // if (i == 0 && link->lastUpdatePeersTime != 0)
-  } // for (i)
+    } // if (linkConn == link->conns && link->lastUpdatePeersTime != 0)
+  } // for (linkConn)
   return ncclSuccess;
 }
 
 // Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links.
-static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) {
-  struct rasConnection* conn = rasConns+connIdx;
-  if (conn->sockIdx != -1) {
-    struct rasSocket* sock = rasSockets+conn->sockIdx;
-
-    if (sock->status == RAS_SOCK_READY) {
+static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup) {
+  if (conn->sock) {
+    if (conn->sock->status == RAS_SOCK_READY) {
       // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued.
       if (ncclIntruQueueEmpty(&conn->sendQ)) {
-        if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) {
+        if (now - conn->sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) {
           rasConnSendKeepAlive(conn);
         } else {
-          *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL);
+          *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_KEEPALIVE_INTERVAL);
        }
      }
 
       // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections.
-      if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) {
+      if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) {
         if (!conn->experiencingDelays) {
           INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s",
-               (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+               (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine));
 
           // At this point, it's mostly just a precaution; we will continue with the primary connection until
           // RAS_PEER_DEAD_TIMEOUT expires.
           conn->experiencingDelays = true;
-          (void)rasLinkAddFallback(&rasNextLink, connIdx);
-          (void)rasLinkAddFallback(&rasPrevLink, connIdx);
-          // rasConns and rasSockets may have been reallocated by the above calls.
-          conn = rasConns+connIdx;
-          sock = rasSockets+conn->sockIdx;
+          (void)rasLinkAddFallback(&rasNextLink, conn);
+          (void)rasLinkAddFallback(&rasPrevLink, conn);
 
-          // Stop collectives from waiting for a response over it.
-          rasCollsPurgeConn(connIdx);
+          // Stop ongoing collectives from waiting for a response over this connection.
+          rasCollsPurgeConn(conn);
         }
       } else {
-        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN);
+        *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN);
      }
 
       // For long timeouts we need to act.
-      if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) {
+      if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) {
         INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s",
-             (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
-        rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR);
+             (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine));
+        rasSocketTerminate(conn->sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR);
         *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait.
       } else {
-        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
+        *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
       }
-    } // if (sock->status == RAS_SOCK_READY)
-  } // if (conn->sockIdx != -1)
+    } // if (conn->sock->status == RAS_SOCK_READY)
+  } // if (conn->sock)
 }
 
 // Sends a keep-alive message to a peer on the RAS network.
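rasConnHandleNetTimeouts above is effectively a three-rung ladder driven by send and receive silence. A condensed model of the decision logic; note that only the 5s warning threshold is stated in this patch's comments, so the interval and error values below are placeholders, not the real RAS constants:

```
#include <cstdint>

enum KeepAliveAction { KA_NONE, KA_SEND, KA_WARN_AND_FALLBACK, KA_TERMINATE };

// Purely illustrative thresholds (nanoseconds).
constexpr int64_t kInterval = 1000000000LL;   // send silence -> emit keep-alive
constexpr int64_t kWarn     = 5000000000LL;   // recv silence -> warn, open fallbacks
constexpr int64_t kError    = 20000000000LL;  // prolonged recv silence -> drop socket

static KeepAliveAction keepAliveCheck(int64_t now, int64_t lastSend, int64_t lastRecv,
                                      bool sendQEmpty, bool experiencingDelays) {
  if (now - lastRecv > kError) return KA_TERMINATE;
  // The experiencingDelays latch makes the warning fire only once per outage.
  if (now - lastRecv > kWarn && !experiencingDelays) return KA_WARN_AND_FALLBACK;
  if (sendQEmpty && now - lastSend > kInterval) return KA_SEND;
  return KA_NONE;
}
```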
@@ -768,17 +761,17 @@ static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
   struct rasMsg* msg = nullptr;
   int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
   if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
-    int linkIdx;
+    struct rasLinkConn* linkConn;
     msg->type = RAS_MSG_KEEPALIVE;
     msg->keepAlive.peersHash = rasPeersHash;
     msg->keepAlive.deadPeersHash = rasDeadPeersHash;
     msg->keepAlive.nack = (nack ? 1 : 0);
 
-    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
-    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasNextLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
-    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
-    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasPrevLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
 
     (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
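As a reading aid for the linkMask logic above: bit value 2 marks the sender's rasNextLink and bit value 1 its rasPrevLink, and because one side's next link is the other side's prev link, the receiver tests the bits mirrored. A toy encode/decode, assuming this interpretation:

```
#include <cstdio>

enum { LINK_PREV = 1, LINK_NEXT = 2 };

int main() {
  // Sender: this connection is on its next link only.
  int linkMask = 0;
  bool onMyNextLink = true, onMyPrevLink = false;
  if (onMyNextLink) linkMask |= LINK_NEXT;  // Becomes the peer's prev link.
  if (onMyPrevLink) linkMask |= LINK_PREV;  // Becomes the peer's next link.

  // Receiver: mirrored decode.
  bool partOfMyNextLink = (linkMask & LINK_PREV) != 0;  // Their prev == my next.
  bool partOfMyPrevLink = (linkMask & LINK_NEXT) != 0;  // Their next == my prev.
  printf("next %d prev %d\n", (int)partOfMyNextLink, (int)partOfMyPrevLink);
  return 0;
}
```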
@@ -793,46 +786,51 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
   int64_t travelTime;
   int peerIdx;
 
-  assert(sock->connIdx != -1);
-  struct rasConnection* conn = rasConns+sock->connIdx;
+  assert(sock->conn);
 
   SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
   travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
                (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
 
-  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
-    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  if (msg->keepAlive.peersHash != sock->conn->lastRecvPeersHash) {
+    sock->conn->lastRecvPeersHash = msg->keepAlive.peersHash;
   }
-  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
-    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  if (msg->keepAlive.deadPeersHash != sock->conn->lastRecvDeadPeersHash) {
+    sock->conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
   }
 
   // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
   // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
   // needed).
-  peerIdx = rasPeerFind(&conn->addr);
+  peerIdx = rasPeerFind(&sock->conn->addr);
   // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
   // the peers update.
-  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
-  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  if (msg->keepAlive.linkMask & 1)
+    (void)rasLinkConnAddExternal(&rasNextLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasNextLink, sock->conn, /*external*/true);
+  if (msg->keepAlive.linkMask & 2)
+    (void)rasLinkConnAddExternal(&rasPrevLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasPrevLink, sock->conn, /*external*/true);
 
   // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
   // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
-  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
-  // will have wiped any external fallbacks, so anything that remains must be needed.
+  // and because we stopped sending the keep-alives, our peer doesn't know about it. The rasLinkConnDrop calls
+  // above will have wiped any external fallbacks, so anything that remains must be needed.
   if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) {
-    if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) {
+    if (rasLinkConnFind(&rasNextLink, sock->conn) == nullptr && rasLinkConnFind(&rasPrevLink, sock->conn) == nullptr) {
       // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the
       // special nack flag in the message to distinguish it from regular keep-alives.
-      rasConnSendKeepAlive(conn, /*nack*/true);
+      rasConnSendKeepAlive(sock->conn, /*nack*/true);
     }
   }
 
-  if (conn->travelTimeMin > travelTime)
-    conn->travelTimeMin = travelTime;
-  if (conn->travelTimeMax < travelTime)
-    conn->travelTimeMax = travelTime;
-  conn->travelTimeSum += travelTime;
-  conn->travelTimeCount++;
+  if (sock->conn->travelTimeMin > travelTime)
+    sock->conn->travelTimeMin = travelTime;
+  if (sock->conn->travelTimeMax < travelTime)
+    sock->conn->travelTimeMax = travelTime;
+  sock->conn->travelTimeSum += travelTime;
+  sock->conn->travelTimeCount++;
 
   if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) {
     // This could happen due to a short-lived race condition between the peers propagation
@@ -842,7 +840,7 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
     INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)",
          ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash);
     INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash);
-    NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers));
+    NCCLCHECK(rasConnSendPeersUpdate(sock->conn, rasPeers, nRasPeers));
   }
   return ncclSuccess;
 }
@@ -857,100 +855,104 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
 // External connections are generally ignored by this whole process: in particular, we don't add fallbacks for
 // timing out external connections. However, we will use an active external connection if it would be a better
 // option than whatever we can come up with.
-static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) {
-  int peerIdx = -1;
-  int linkIdx = -1;
+ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn) {
+  struct rasLinkConn* foundLinkConn = nullptr;
+  struct rasLinkConn* firstExtLinkConn = nullptr;
   int firstExtLinkIdx = -1;
-  int newPeerIdx;
+  int newPeerIdx, i;
 
   // First check if the connection is part of this link. In the process also check if any of the link's connections
   // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out.
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) {
     if (linkConn->peerIdx == -1) {
-      // Such elements are always at the very end of the array and we can't use them so we can just as well break.
+      // Such elements are always at the end and we can't use them so we can just as well break.
       break;
    }
 
     // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing
     // delays).
-    if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) {
-      struct rasConnection* conn = rasConns+linkConn->connIdx;
-      if (!conn->experiencingDelays) {
-        if (!linkConn->external)
+    if (linkConn->conn && linkConn->conn != conn) {
+      if (!linkConn->conn->experiencingDelays) {
+        if (!linkConn->external) {
           goto exit; // We don't need to do anything if there's a non-external connection.
-        else if (linkConn->peerIdx != -1) {
+        } else if (linkConn->peerIdx != -1) {
           // Record the location of the first potentially viable external connection in the chain; we may prefer it
           // over anything we can come up with.
-          if (firstExtLinkIdx == -1)
+          if (firstExtLinkConn == nullptr) {
+            firstExtLinkConn = linkConn;
             firstExtLinkIdx = i;
-          if (linkIdx != -1)
+          }
+          if (foundLinkConn)
             break; // Break out of the loop if we already have all the data we might need.
         } // linkConn->external && linkConn->peerIdx != -1
-      } // if (!conn->experiencingDelays)
-    } // if (linkConn->connIdx != -1)
+      } // if (!linkConn->conn->experiencingDelays)
+    } // if (linkConn->conn && linkConn->conn != conn)
 
-    if (linkConn->connIdx == connIdx) {
+    if (linkConn->conn == conn) {
       if (linkConn->external)
         goto exit; // We don't add fallbacks for external connections...
-      peerIdx = linkConn->peerIdx;
-      linkIdx = i;
+      foundLinkConn = linkConn;
       // We are not breaking out of the loop here because we want to check for active connections on *all* potentially
       // viable elements (in particular, there could be some external ones beyond this one).
     }
  }
 
-  if (linkIdx == -1)
+  if (foundLinkConn == nullptr)
     goto exit;
 
   // We found an existing element so the connection is part of the link. No existing non-external connections of this
   // link are active, so a fallback is needed.
-  assert(peerIdx != -1);
-  newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0);
+  assert(foundLinkConn->peerIdx != -1);
+  newPeerIdx = rasLinkCalculatePeer(link, foundLinkConn->peerIdx, /*isFallback*/(foundLinkConn != link->conns));
 
   // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists
   // and is also experiencing delays, we need to keep iterating.
   while (newPeerIdx != -1) {
-    int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr);
+    struct rasConnection* newConn = rasConnFind(&rasPeers[newPeerIdx].addr);
+    int linkIdx;
+    struct rasLinkConn* newLinkConn;
 
     // If we previously found a potential external fallback connection, check if it's better than what we just found.
-    if (firstExtLinkIdx != -1) {
+    if (firstExtLinkConn) {
       linkIdx = -1;
       // Calculate the index that the newly found fallback would have (pretend mode).
-      NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true,
-                                  &linkIdx));
+      NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/true, &linkIdx));
       assert(linkIdx != -1);
       if (firstExtLinkIdx < linkIdx) {
         // The external connection *is* better -- use it as a fallback instead and be done.
-        link->conns[firstExtLinkIdx].external = false;
+        firstExtLinkConn->external = false;
         goto exit;
       }
     }
 
-    NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false,
-                                &linkIdx));
-    if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx)
-      firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index.
+    NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/false, &linkIdx, &newLinkConn));
+    if (firstExtLinkConn && linkIdx <= firstExtLinkIdx)
+      firstExtLinkIdx++; // Adjust if we inserted a new entry ahead of this one.
 
     INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s",
-         link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"),
+         link->direction, (newConn == nullptr ? "opening new" : "calculated existing"),
          linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine));
 
     // Note that we don't follow here our convention of "lower address is the one establishing connections" --
     // that convention is for optimizing regular operations, but we don't want to take chances during fault
     // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those.
-    if (newConnIdx == -1)
-      NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx));
+    if (newConn == nullptr) {
+      NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &newConn));
+      newLinkConn->conn = newConn;
+    }
 
-    struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx;
     // If the fallback connection is also experiencing delays, we need to keep trying.
-    if (!conn->experiencingDelays)
+    if (!newConn->experiencingDelays)
       break;
 
     INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d",
-         conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0),
-         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+         newConn->experiencingDelays, (newConn->startRetryTime ? (clockNano()-newConn->startRetryTime)/1e9 : 0.0),
+         (newConn->sock ? newConn->sock->status : -1));
 
     newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true);
   }
-  if (newPeerIdx == -1)
-    INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns);
+  if (newPeerIdx == -1) {
+    int nConns = 0;
+    for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next)
+      nConns++;
+    INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (total %d)", link->direction, nConns);
+  }
 
 exit:
   return ncclSuccess;
 }
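The loop above walks candidate peers in ring order, reusing an existing connection where one exists and opening a new one otherwise, and keeps going while the candidate itself is experiencing delays. A toy reduction of that walk; the 4-peer table and helpers are hypothetical stand-ins for rasConnFind/rasConnCreate/rasLinkCalculatePeer:

```
#include <cstdio>

struct Conn { bool experiencingDelays; };

// Toy 4-peer ring: peer 1 already has a connection, but it is delayed.
static Conn conns[4] = {{false}, {true}, {false}, {false}};
static bool exists[4] = {false, true, false, false};

static int nextPeer(int peer) { return peer < 3 ? peer + 1 : -1; }  // Stand-in for rasLinkCalculatePeer.

static Conn* addFallback(int fromPeer) {
  for (int peer = nextPeer(fromPeer); peer != -1; peer = nextPeer(peer)) {
    Conn* conn = exists[peer] ? &conns[peer] : nullptr;              // "rasConnFind"
    if (conn == nullptr) { exists[peer] = true; conn = &conns[peer]; }  // "rasConnCreate"
    if (!conn->experiencingDelays) return conn;  // Viable fallback; stop here.
    // This candidate is struggling too; keep walking the ring.
  }
  return nullptr;  // Candidates exhausted.
}

int main() {
  printf("fallback found: %d\n", addFallback(0) != nullptr ? 1 : 0);  // Lands on peer 2.
  return 0;
}
```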
@@ -958,7 +960,7 @@ exit:
 // Invoked when we receive a message over a connection that was just activated or was experiencing delays.
 // Cleans up the fallbacks, timers, etc, as appropriate.
 static void rasConnResume(struct rasConnection* conn) {
-  if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) {
+  if (conn->sock && conn->sock->status == RAS_SOCK_READY) {
     INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)",
          (conn->experiencingDelays && conn->startRetryTime == 0 ? "recovered" : "established"),
          ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "),
@@ -972,218 +974,362 @@ static void rasConnResume(struct rasConnection* conn) {
     rasLinkSanitizeFallbacks(&rasPrevLink);
 
     if (!ncclIntruQueueEmpty(&conn->sendQ))
-      rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT;
+      rasPfds[conn->sock->pfd].events |= POLLOUT;
   }
 }
 
 // Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed).
 static void rasLinkSanitizeFallbacks(struct rasLink* link) {
-  if (link->nConns > 0 && link->conns[0].connIdx != -1) {
-    struct rasConnection* conn = rasConns+link->conns[0].connIdx;
-    if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) {
+  if (link->conns && link->conns->conn) {
+    struct rasConnection* conn = link->conns->conn;
+    if (conn->sock && conn->sock->status == RAS_SOCK_READY && !conn->experiencingDelays) {
       // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the
       // keepAlive messages).
-      for (int i = 1; i < link->nConns; i++) {
+      int i = 1;
+      for (struct rasLinkConn* linkConn = link->conns->next; linkConn; i++) {
+        struct rasLinkConn* linkConnNext = linkConn->next;
         INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
-             link->direction, (link->conns[i].external ? "external " : ""), i,
-             ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine));
+             link->direction, (linkConn->external ? "external " : ""), i,
+             ncclSocketToString(&linkConn->conn->addr, rasLine));
+        free(linkConn);
+        linkConn = linkConnNext;
       }
-      link->nConns = 1;
+      link->conns->next = nullptr;
       link->lastUpdatePeersTime = 0;
     }
  }
 }
 
-// Attempt to drop a connection from a link.
-static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) {
-  if (linkIdx == -1)
-    linkIdx = rasLinkFindConn(link, connIdx);
-  if (linkIdx != -1) {
-    if (linkIdx == 0) {
-      INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s",
-           link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine));
-    } else {
-      INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
-           link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx,
-           ncclSocketToString(&rasConns[connIdx].addr, rasLine));
-    }
-    memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns));
-    if (link->nConns > 1)
-      link->nConns--;
-    else {
-      link->conns[0].peerIdx = link->conns[0].connIdx = -1;
-    }
-
-    if (linkIdx == 0) {
-      // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if
-      // the remote peer loses interest in it).
-      link->conns[0].external = false;
-      if (link->conns[0].connIdx != -1) {
-        INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary",
-             link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine));
-      }
-      rasLinkSanitizeFallbacks(link);
-    }
-  }
-}
-
-// Checks if a given connection is a member of this link and if so, returns its entry index.
-// Returns -1 if connection not found.
-static int rasLinkFindConn(const struct rasLink* link, int connIdx) {
-  for (int i = 0; i < link->nConns; i++) {
-    if (link->conns[i].connIdx == connIdx)
-      return i;
-  }
-  return -1;
-}
-
-// Note: the behavior of this function has become super-complex and so it should be considered for refactoring.
-// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is
-// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also
-// be -1 (the latter are stored at the end).
-// external provides an updated value for the entry's external field. A false value, if requested, is always set;
-// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry
-// already exists and the function is invoked with external == true, the new value will be ignored.
-// If insert is set, it will, if necessary, insert a new entry if one is not already there.
-// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate.
-// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored.
-// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external).
-// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed
-// (the entry's external must match the argument external for it to be removed).
-ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert,
-                               bool pretend, int* pLinkIdx) {
+// Adds an entry to a RAS network link (or updates one, if it already exists).
+// conn can be nullptr if the connection doesn't exist (yet).
+// peerIdx *cannot* be -1 when this function is invoked.
+// If pretend is true, the function will not modify the list and will just set *pLinkIdx and *pLinkConn as appropriate.
+// pLinkIdx and pLinkConn are (optional) pointers to the results; the index/address of the added/updated entry are
+// stored there.
+// insert (true by default) determines whether this is an "add" function (as implied by the name) or an "update" --
+// if set to false, it will refuse to add a new entry (but will update an existing one as needed).
+// Note: there is some code duplication between this function and rasLinkConnAddExternal so changes to one of them
+// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the
+// logic was extremely difficult to follow then.
+static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend,
+                                   int* pLinkIdx, struct rasLinkConn** pLinkConn, bool insert) {
+  struct rasLinkConn* oldLinkConn = nullptr;
+  struct rasLinkConn* linkConnPrev = nullptr;
   int i, oldLinkIdx = -1;
 
-  if (external && connIdx != -1)
-    insert = true;
+  assert(peerIdx != -1);
+  if (conn) {
+    // Start by checking if we already have an element with this conn.
+    oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx);
+    if (oldLinkConn) {
+      if (pLinkConn)
+        *pLinkConn = oldLinkConn;
+      if (oldLinkConn->peerIdx != -1) {
+        assert(oldLinkConn->peerIdx == peerIdx);
 
-  if (connIdx != -1) {
-    // Start by checking if we already have an element with this connIdx.
-    oldLinkIdx = rasLinkFindConn(link, connIdx);
-    if (oldLinkIdx != -1) {
-      struct rasLinkConn* linkConn = link->conns+oldLinkIdx;
-      if (linkConn->peerIdx != -1)
-        assert(linkConn->peerIdx == peerIdx);
-
-      if (linkConn->peerIdx == peerIdx) {
-        if (!external && !pretend)
-          linkConn->external = false; // Ensure that external is cleared if so requested.
+        if (!pretend)
+          oldLinkConn->external = false; // Ensure that external is cleared.
         if (pLinkIdx)
           *pLinkIdx = oldLinkIdx;
-        goto exit; // Nothing more to do if both connIdx and peerIdx are up to date.
-      }
+        goto exit; // Nothing more to do if both conn and peerIdx are up to date.
+      } // if (oldLinkConn->peerIdx != -1)
-      // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong
-      // place in the array -- we need to find the right spot. linkConn->peerIdx == -1 can only happen for external
-      // connections.
-      assert(external);
-    }
-  }
+      // Otherwise oldLinkConn->peerIdx == -1. The oldLinkConn is in a wrong place in the list -- we need to find
+      // the right spot. This can happen only for external connections.
+    } // if (oldLinkConn)
+  } // if (conn)
 
-  if (peerIdx != -1) {
-    // Search for the right spot in the conns array.
-    for (i = 0; i < link->nConns; i++) {
-      struct rasLinkConn* linkConn = link->conns+i;
-      if (peerIdx != -1 && linkConn->peerIdx == peerIdx) {
-        // The exact conn element already exists.
-        if (connIdx == -1 && !insert) {
-          // Drop the connection from the link.
-          if (linkConn->external == external) {
-            if (!pretend)
-              rasLinkDropConn(link, linkConn->connIdx, i);
-            else if (pLinkIdx)
-              *pLinkIdx = i;
-          }
-        } else { // connIdx != -1 || insert
-          if (!pretend) {
-            if (linkConn->connIdx != -1)
-              assert(linkConn->connIdx == connIdx);
-            else
-              linkConn->connIdx = connIdx;
-            if (!external)
-              linkConn->external = false; // Ensure that external is cleared if so requested.
-            if (i == 0) {
-              // We received a connection from the remote peer that matches the primary connection we've been
-              // waiting for.
-              rasLinkSanitizeFallbacks(link);
-            }
-          } // if (!pretend)
-          if (pLinkIdx)
-            *pLinkIdx = i;
-        } // connIdx != -1 || insert
-        goto exit;
-      } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx)
-      if (!insert)
-        continue;
-      // Ensure that the i-1 index is also valid.
-      if (i == 0)
-        continue;
-      // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them.
-      if (peerIdx != -1 && linkConn->peerIdx == -1)
-        break;
-      // Detect a roll-over and handle it specially.
-      if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) {
-        if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 ||
-            link->direction * (peerIdx - linkConn->peerIdx) < 0)
-          break;
-      } else { // Regular, monotonic case with the peerIdx value between two existing elements.
-        if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 &&
-            link->direction * (peerIdx - linkConn->peerIdx) < 0)
-          break;
-      }
-    } // for (i)
-  } else {
-    // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections.
+  // Search for the right spot in the conns list.
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (linkConn->peerIdx == peerIdx) {
+      // The exact linkConn element already exists.
+      if (linkConn->conn)
+        assert(linkConn->conn == conn);
+      if (!pretend) {
+        if (linkConn->conn == nullptr)
+          linkConn->conn = conn;
+        linkConn->external = false; // Ensure that external is cleared.
+        if (linkConn == link->conns) {
+          // We received a connection from the remote peer that matches the primary connection we've been
+          // waiting for.
+          rasLinkSanitizeFallbacks(link);
+        }
+      } // if (!pretend)
+      if (pLinkIdx)
+        *pLinkIdx = i;
+      if (pLinkConn)
+        *pLinkConn = linkConn;
+      goto exit;
+    } // if (linkConn->peerIdx == peerIdx)
+
+    // Ensure that the previous element is valid.
+    if (linkConnPrev == nullptr)
+      continue;
+    // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done.
+    if (linkConn->peerIdx == -1)
+      break;
+    // Detect a roll-over and handle it specially.
+    if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) {
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 ||
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 &&
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    }
+  } // for (linkConn)
-    assert(external && oldLinkIdx == -1);
-    i = link->nConns;
-  }
-  if (!insert)
-    goto exit;
 
-  // i holds the index at which to insert a new element.
-  if (pretend) {
-    if (pLinkIdx)
-      *pLinkIdx = i;
-    goto exit;
-  }
-
-  if (oldLinkIdx == -1) {
-    struct rasLinkConn* linkConn;
-    if (link->nConns == link->connsSize) {
-      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
-      link->connsSize += RAS_INCREMENT;
-    }
-    linkConn = link->conns+i;
-    // Shift existing conns with indices >= i to make room for the new one.
-    memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns));
-    linkConn->peerIdx = peerIdx;
-    linkConn->connIdx = connIdx;
-    linkConn->external = external;
-    if (external) {
-      INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i,
-           ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine));
-    }
-    link->nConns++;
-  }
-  else { // oldLinkIdx > -1
-    // We already have the conn, we just need to move it to a new spot.
-    struct rasLinkConn* linkConn = link->conns+i;
-    assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1.
-    if (i != oldLinkIdx) {
-      struct rasLinkConn tmp;
-      struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler.
-      // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns
-      // with indices in the range [i, oldLinkIdx).
-      memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp));
-      memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn));
-      memcpy(linkConn, &tmp, sizeof(*linkConn));
-    }
-    if (!external)
-      linkConn->external = false; // Ensure that external is cleared if so requested.
-  } // oldLinkIdx > -1
+  // The new element should be inserted after linkConnPrev (which is at index i-1).
   if (pLinkIdx)
     *pLinkIdx = i;
+  if (pretend)
+    goto exit;
+
+  if (oldLinkConn) {
+    if (i != oldLinkIdx) {
+      // We already have the entry, but we need to move it to a new spot (which must be earlier in the list).
+      assert(i < oldLinkIdx);
+      // Remove oldLinkConn from its old spot.
+      for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) {
+        if (linkConn->next == oldLinkConn) {
+          linkConn->next = oldLinkConn->next;
+          break;
+        }
+      } // for (linkConn)
+      // Insert it at its new spot.
+      oldLinkConn->next = linkConnPrev->next;
+      linkConnPrev->next = oldLinkConn;
+    } // if (i != oldLinkIdx)
+    oldLinkConn->peerIdx = peerIdx;
+    oldLinkConn->external = false;
+  } else if (insert) {
+    struct rasLinkConn* linkConn;
+    NCCLCHECK(ncclCalloc(&linkConn, 1));
+    if (linkConnPrev) {
+      linkConn->next = linkConnPrev->next;
+      linkConnPrev->next = linkConn;
+    } else {
+      assert(link->conns == nullptr); // We never add an element that would replace an existing primary.
+      link->conns = linkConn;
+      // linkConn->next is already nullptr.
+    }
+    linkConn->peerIdx = peerIdx;
+    linkConn->conn = conn;
+    linkConn->external = false;
+    if (pLinkConn)
+      *pLinkConn = linkConn;
+  } // oldLinkConn == nullptr && insert
 
 exit:
   return ncclSuccess;
 }
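The ordering test inside the search loop above is the subtlest part of rasLinkConnAdd: entries are sorted by peerIdx in the link's direction, but indices wrap around the ring, so a descending step between two neighbors marks the wrap point and inverts the membership test. A self-contained sketch of that predicate with a few checks; insertBetween is a hypothetical helper, not part of the sources:

```
#include <cassert>

// Should a new entry with peer index 'newPeer' be inserted between two
// adjacent entries 'prevPeer' and 'nextPeer'? 'dir' is the link direction
// (+1 or -1), matching link->direction in the code above.
static bool insertBetween(int prevPeer, int nextPeer, int newPeer, int dir) {
  if (dir * (prevPeer - nextPeer) > 0) {
    // Roll-over between prev and next: newPeer fits if it lies beyond prev
    // or before next in the direction of travel.
    return dir * (newPeer - prevPeer) > 0 || dir * (newPeer - nextPeer) < 0;
  }
  // Regular, monotonic case: newPeer must lie strictly between the two.
  return dir * (newPeer - prevPeer) > 0 && dir * (newPeer - nextPeer) < 0;
}

int main() {
  assert(insertBetween(3, 5, 4, +1));   // 3 < 4 < 5: fits.
  assert(!insertBetween(3, 5, 7, +1));  // 7 lies past 5: keep scanning.
  assert(insertBetween(7, 1, 0, +1));   // Wrap: 7 -> 0 -> 1 on a small ring.
  return 0;
}
```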
+
+// Adds an external entry to a RAS network link (or updates one, if one already exists).
+// conn *cannot* be nullptr when this function is invoked.
+// peerIdx can be -1 if unknown (possible in case of a race condition between keepAlive and peers update).
+// Note: there is some code duplication between this function and rasLinkConnAdd so changes to one of them
+// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the
+// logic was extremely difficult to follow then.
+static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx) {
+  struct rasLinkConn* oldLinkConn = nullptr;
+  struct rasLinkConn* linkConnPrev = nullptr;
+  int i, oldLinkIdx = -1;
+
+  assert(conn);
+  oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx);
+  if (oldLinkConn) {
+    if (oldLinkConn->peerIdx != -1)
+      assert(oldLinkConn->peerIdx == peerIdx);
+
+    if (oldLinkConn->peerIdx == peerIdx)
+      goto exit; // Nothing more to do if both conn and peerIdx are up to date. Note that we neither check nor
+                 // update the value of external here.
+
+    // Otherwise (oldLinkConn->peerIdx == -1 && peerIdx != -1) oldLinkConn, due to its -1 peerIdx, is in
+    // a wrong place in the list -- we need to find the right spot. oldLinkConn->peerIdx == -1 can only happen for
+    // external connections.
+  } // if (oldLinkConn)
+
+  // Search for the right spot in the conns list.
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (peerIdx == -1) {
+      // We simply want to find the end of the list so that we can insert a new entry with -1 peerIdx there.
+      continue;
+    }
+    if (linkConn->peerIdx == peerIdx) {
+      // The exact linkConn element already exists.
+      if (linkConn->conn)
+        assert(linkConn->conn == conn);
+      if (linkConn->conn == nullptr)
+        linkConn->conn = conn;
+      if (linkConn == link->conns) {
+        // We received a connection from the remote peer that matches the primary connection we've been
+        // waiting for. This shouldn't trigger for external connections (rasLinkConnUpdate should be invoked first,
+        // which will update the entry's conn, so rasLinkConnFind invoked at the top of this function should succeed),
+        // but better safe than sorry...
+        rasLinkSanitizeFallbacks(link);
+      }
+      goto exit;
+    } // if (linkConn->peerIdx == peerIdx)
+
+    // Ensure that the previous element is valid.
+    if (linkConnPrev == nullptr)
+      continue;
+    // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done.
+    if (linkConn->peerIdx == -1)
+      break;
+    // Detect a roll-over and handle it specially.
+    if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) {
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 ||
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 &&
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    }
+  } // for (linkConn)
+
+  // The new element should be inserted after linkConnPrev (which is at index i-1).
+  if (oldLinkConn) {
+    if (i != oldLinkIdx) {
+      // We already have the entry, but we need to move it to a new spot (which must be earlier in the list).
+      assert(i < oldLinkIdx);
+      INFO(NCCL_RAS, "RAS link %d: moving %sfallback connection with %s from %d to %d", link->direction,
+           (oldLinkConn->external ? "external " : ""), ncclSocketToString(&conn->addr, rasLine), oldLinkIdx, i);
+      // Remove oldLinkConn from its old spot.
+      for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) {
+        if (linkConn->next == oldLinkConn) {
+          linkConn->next = oldLinkConn->next;
+          break;
+        }
+      } // for (linkConn)
+      // Insert it at its new spot.
+      oldLinkConn->next = linkConnPrev->next;
+      linkConnPrev->next = oldLinkConn;
+    } // if (i != oldLinkIdx)
+    oldLinkConn->peerIdx = peerIdx;
+    oldLinkConn->external = false;
+  } else { // oldLinkConn == nullptr
+    struct rasLinkConn* linkConn;
+    NCCLCHECK(ncclCalloc(&linkConn, 1));
+    if (linkConnPrev) {
+      INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i,
+           ncclSocketToString(&conn->addr, rasLine));
+      linkConn->next = linkConnPrev->next;
+      linkConnPrev->next = linkConn;
+      linkConn->external = true;
+    } else {
+      INFO(NCCL_RAS, "RAS link %d: adding external fallback with %s as a new primary connection", link->direction,
+           ncclSocketToString(&conn->addr, rasLine));
+      linkConn->next = link->conns;
+      link->conns = linkConn;
+      linkConn->external = false; // Primary connections are never external.
+    }
+    linkConn->peerIdx = peerIdx;
+    linkConn->conn = conn;
+  } // oldLinkConn == nullptr
+
+exit:
+  return ncclSuccess;
+}
+
+// Updates an existing entry in a RAS network link, if any.
+// Basically an easy-to-use variant of rasLinkConnAdd.
+// For this function, conn cannot be a nullptr and peerIdx cannot be -1.
+ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx) {
+  assert(conn && peerIdx != -1);
+
+  NCCLCHECK(rasLinkConnAdd(link, conn, peerIdx, /*pretend*/false, /*pLinkIdx*/nullptr, /*pLinkConn*/nullptr,
+                           /*insert*/false));
+  return ncclSuccess;
+}
+
+// Attempts to drop a connection from a link.
+// If the optional external argument is true, it will drop a connection only if its external flag is set
+// (otherwise the flag is ignored and a connection is always dropped if found).
+static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external) {
+  struct rasLinkConn* linkConnPrev = nullptr;
+  int i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (linkConn->conn == conn && (!external || linkConn->external)) {
+      if (linkConnPrev) {
+        INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
+             link->direction, (linkConn->external ? "external " : ""), i,
+             ncclSocketToString(&conn->addr, rasLine));
+        linkConnPrev->next = linkConn->next;
+        free(linkConn);
+      } else { // linkConnPrev == nullptr
+        INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s",
+             link->direction, ncclSocketToString(&conn->addr, rasLine));
+        if (linkConn->next) {
+          link->conns = linkConn->next;
+          // Ensure that the conn becoming the primary is not marked as external (we don't want to lose it if
+          // the remote peer loses interest in it).
+          link->conns->external = false;
+          if (link->conns->conn)
+            INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary",
+                 link->direction, ncclSocketToString(&link->conns->conn->addr, rasLine));
+          rasLinkSanitizeFallbacks(link);
+          free(linkConn);
+        } else { // linkConn->next == nullptr
+          // We prefer the primary entry to always be present, even if empty.
+ linkConn->peerIdx = -1; + linkConn->conn = nullptr; + } // linkConn->next == nullptr + } // linkConnPrev == nullptr + break; + } // if (linkConn->conn == conn) + } // for (linkConn) +} + +// Checks if a given connection is a member of this link and if so, returns its link entry. +// Optionally returns the position of the connection in the conns list. +// Returns nullptr if connection not found. +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx) { + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { + if (linkConn->conn == conn) { + if (pLinkIdx) + *pLinkIdx = i; + return linkConn; + } + } + if (pLinkIdx) + *pLinkIdx = -1; + return nullptr; +} + +// Invoked during RAS termination to release all the allocated resources. +void rasNetTerminate() { + for (struct rasLinkConn* linkConn = rasNextLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + for (struct rasLinkConn* linkConn = rasPrevLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + rasNextLink.conns = rasPrevLink.conns = nullptr; + rasNextLink.lastUpdatePeersTime = rasPrevLink.lastUpdatePeersTime = 0; + + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + rasConnTerminate(conn); + conn = connNext; + } + // rasConnsHead and rasConnsTail are taken care of by rasConnTerminate(). + + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + rasSocketTerminate(sock); + sock = sockNext; + } + // rasSocketsHead and rasSocketsTail are taken care of by rasSocketTerminate(). +} diff --git a/src/register/register.cc b/src/register/register.cc index 9e8f6ea..930367a 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -92,8 +92,8 @@ static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) { } } if (reg->state & NVLS_REG_COMPLETE) { - if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) { - WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize); + if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize) != ncclSuccess) { + WARN("rank %d deregister NVLS buffer %p dev %d ucsize %ld mcsize %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize); } reg->regAddr = (CUdeviceptr)NULL; } diff --git a/src/transport.cc b/src/transport.cc index 5629ce7..f98b77a 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -11,11 +11,12 @@ #include "timer.h" #include "transport.h" -struct ncclTransport* ncclTransports[NTRANSPORTS] = { +struct ncclTransport* ncclTransports[NTRANSPORTS+1] = { &p2pTransport, &shmTransport, &netTransport, - &collNetTransport + &collNetTransport, + &profilerTransport // Not really used for transport, only to create proxy ops polling on profiler counters. 
}; template @@ -111,12 +112,14 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* gettimeofday(&timeStart, NULL); timeLast = timeStart; // struct copy bool timeReported = false; + cudaStream_t hostStream, deviceStream; NCCLCHECK(ncclCalloc(&data, maxPeers)); NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail); NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); @@ -195,7 +198,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -214,7 +217,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. 
*/ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -286,8 +289,9 @@ exit: if (sendData) free(sendData); if (recvData) free(recvData); - NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); + NCCLCHECK(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); return ret; fail: goto exit; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 6718012..c1ccfca 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -103,7 +103,7 @@ struct sendResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; @@ -124,7 +124,7 @@ struct recvResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; @@ -143,9 +143,19 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG return ncclSuccess; } +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + struct setupReq { int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; @@ -168,8 +178,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -192,8 +202,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? 
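The switch from `ncclStrongStreamWaitStream` to `ncclStreamWaitStream` with a shared `scratchEvent` presumably reduces to the standard CUDA record-and-wait idiom. A self-contained sketch of that idiom using the plain CUDA runtime (not NCCL's strong-stream wrappers):

```
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t hostStream, deviceStream;
  cudaEvent_t scratchEvent;
  cudaStreamCreate(&hostStream);
  cudaStreamCreate(&deviceStream);
  cudaEventCreateWithFlags(&scratchEvent, cudaEventDisableTiming);

  // ... work enqueued on hostStream, e.g. the cudaMemcpyAsync calls above ...

  cudaEventRecord(scratchEvent, hostStream);          // mark end of hostStream work
  cudaStreamWaitEvent(deviceStream, scratchEvent, 0); // deviceStream may not pass it

  cudaStreamSynchronize(deviceStream); // deviceStream now drains after hostStream
  printf("done\n");

  cudaEventDestroy(scratchEvent);
  cudaStreamDestroy(hostStream);
  cudaStreamDestroy(deviceStream);
  return 0;
}
```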
"/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -454,6 +464,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big"); @@ -505,16 +516,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -525,10 +537,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; @@ -574,16 +594,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -600,7 +621,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { @@ -737,7 +765,7 @@ static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -779,7 +807,7 @@ static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, stru } static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); recvParts.mhandle = recvMhandle; @@ -796,7 +824,7 @@ static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = 
args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -835,7 +863,7 @@ static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, } static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); sendParts.mhandle = sendMhandle; @@ -1150,6 +1178,7 @@ struct collnetRegInfo { static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; + int gdrEnable = -1; if (regRecord) { if (regRecord->state & COLLNET_REG_COMPLETE) { // reuse previous registration @@ -1165,6 +1194,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + gdrEnable = 1; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; @@ -1174,7 +1204,8 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } } else { - WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); + gdrEnable = 0; + goto fail; } } } @@ -1183,6 +1214,7 @@ exit: fail: *outRegBufFlag = 0; *outHandle = NULL; + INFO(NCCL_REG, "rank %d - COLLNET failed to register userbuff %p, buffSize %ld, type %s, GDR %d", comm->rank, userbuff, buffSize, type == collNetRecv ? 
"Recv" : "Send", gdrEnable); goto exit; } @@ -1268,17 +1300,20 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1301,17 +1336,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1600,4 +1638,4 @@ struct ncclTransport collNetTransport = { canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; \ No newline at end of file +}; diff --git a/src/transport/net.cc b/src/transport/net.cc index 8760b42..40d334f 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -92,7 +92,7 @@ struct sendNetResources { int tpLocalRank; int tpRemoteRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; @@ -123,7 +123,7 @@ struct recvNetResources { int tpRemoteRank; int tpRemoteProxyRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; int maxRecvs; @@ -168,7 +168,7 @@ struct setupReq { int tpRemoteRank; int shared; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; int channelId; int connIndex; @@ -180,6 +180,16 @@ static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large // Forward 
declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { @@ -204,11 +214,14 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + proxyRank, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); @@ -247,18 +260,19 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? 
"/Shared" : ""); return ncclSuccess; } -static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) { - NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); +static ncclResult_t netMapShm(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct connectMapMem* mem) { + NCCLCHECK(ncclShmImportShareableBuffer(comm, proxyConn->rank, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); return ncclSuccess; } static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) { - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); + NCCLCHECK(ncclShmAllocateShareableBuffer(mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); return ncclSuccess; } @@ -292,6 +306,7 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; + int trafficClass; }; struct netRecvConnectArgs { @@ -315,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); + args.trafficClass = comm->config.trafficClass; NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -343,7 +359,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { - if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM)); + if (!map->sameProcess) NCCLCHECK(netMapShm(comm, &send->proxyConn, map->mems + NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank, @@ -692,9 +708,11 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; + commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass; NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers @@ -714,15 +732,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); + if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -748,7 +766,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); + NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { @@ -765,7 +783,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); @@ -820,7 +838,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
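On the application side, the value that arrives in `req->trafficClass` above is taken from the communicator config in `sendConnect`. A minimal sketch, assuming an NCCL build whose `ncclConfig_t` exposes the `trafficClass` field (as the `comm->config.trafficClass` reference suggests); the value 105 is an arbitrary, plugin-defined example:

```
#include <nccl.h>

// Hypothetical helper: create a communicator that requests a specific QoS
// level from the network plugin.
ncclResult_t initCommWithQos(ncclComm_t* comm, int nranks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.trafficClass = 105; // meaningful values are plugin-defined
  return ncclCommInitRankConfig(comm, nranks, id, rank, &config);
}
```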
NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path @@ -904,7 +922,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); + NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { @@ -915,14 +933,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (proxyState->allocP2pNetLLBuffers) { - NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*devMem*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } @@ -964,7 +982,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path @@ -1175,11 +1193,12 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. 
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); sub->transSize += size; sub->transmitted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; @@ -1280,6 +1299,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; + void* phandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; int postedStepId = sub->posted; @@ -1323,6 +1343,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; + phandles[subCount] = sub; subCount++; } } @@ -1332,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; - NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; @@ -1341,6 +1362,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } @@ -1558,7 +1580,7 @@ exit: return ret; fail: *outRegBufFlag = 0; - WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + INFO(NCCL_REG, "rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); goto exit; } @@ -1639,7 +1661,7 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, 
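Note that NCCL passes the proxy sub-op pointer (`sub`) to the plugin as the opaque `pHandle`. A plugin that wants network-defined profiler events threads it back into the callback received at init time. A hedged sketch with stub types; the callback signature below is mirrored from the usage in this patch, not from an official header:

```
#include <cstddef>

// Stub declarations standing in for the real NCCL net/profiler headers.
typedef int ncclResult_t;
const ncclResult_t ncclSuccess = 0;
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle,
                                               int pluginId, void* extData);

static ncclProfilerCallback_t gProfilerCb; // saved from init(logFunction, profFunction)

struct myEventDescr { int kind; size_t bytes; }; // plugin-defined event payload
const int kMyPluginId = 0x1234;                  // hypothetical plugin id

// Start a plugin-defined event tied to NCCL's opaque per-operation handle.
// The returned eHandle must be kept so the completion path can stop the event
// with gProfilerCb(&eHandle, 1, NULL, 0, NULL).
static ncclResult_t startSendEvent(void* pHandle, size_t size, void** eHandle) {
  *eHandle = nullptr;
  if (gProfilerCb == nullptr || pHandle == nullptr) return ncclSuccess;
  myEventDescr descr = { /*kind=*/0, /*bytes=*/size };
  return gProfilerCb(eHandle, /*start=*/0, pHandle, kMyPluginId, &descr);
}
```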
(CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; @@ -1673,7 +1695,7 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bc54133..bfff6e5 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -11,6 +11,7 @@ #include "graph.h" #include "utils.h" #include "param.h" +#include "profiler/net_ib.h" #include #include @@ -85,6 +86,11 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; +#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) + +#define NCCL_IB_SL_DEFAULT 0 +#define NCCL_IB_TC_DEFAULT 0 + NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); @@ -92,8 +98,8 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); -NCCL_PARAM(IbSl, "IB_SL", 0); -NCCL_PARAM(IbTc, "IB_TC", 0); +NCCL_PARAM(IbSl, "IB_SL", -1); +NCCL_PARAM(IbTc, "IB_TC", -1); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); @@ -327,6 +333,9 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, close(fd); if (ret == -1) { + // In containerized environments, read could return EINVAL if the GID index is not mapped to the + // container sysfs. In this case return ncclSuccess and let the caller move to next GID index. 
+ if (errno == EINVAL) return ncclSuccess; WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } @@ -359,7 +368,7 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } int usrRoceVer = roceVer; - int gidRoceVerNum, gidRoceVerNumCandidate; + int gidRoceVerNum, gidRoceVerNumCandidate = -1; const char* deviceName = wrap_ibv_get_device_name(context->device); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum)); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate)); @@ -530,8 +539,8 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclIbDev* dev = ncclIbDevs + props->devs[i]; if (dev->link != dev0->link) { - WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. Try only selecting NICs with one type of link using NCCL_IB_HCA", - dev0->devName, dev0->link, dev->devName, dev->link); + WARN("NET/IB : Attempted to merge incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link)); return ncclInvalidUsage; } } @@ -548,8 +557,11 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return res; } -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { ncclResult_t ret = ncclSuccess; + ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } @@ -571,7 +583,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { struct ibv_device** devices; // Check if user defined which IB device:port to use - char* userIbEnv = getenv("NCCL_IB_HCA"); + const char* userIbEnv = ncclGetEnv("NCCL_IB_HCA"); if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; @@ -634,7 +646,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
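A standalone sketch of that container-tolerant sysfs read: EINVAL from `read()` is reported to the caller as "attribute not present" rather than a hard failure (a hypothetical helper, not the NCCL function):

```
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>

// Read a sysfs attribute into buf. Sets *found = 0 and returns success when
// read() fails with EINVAL, so callers can skip unmapped entries.
static int readSysfsAttr(const char* path, char* buf, size_t len, int* found) {
  *found = 0;
  int fd = open(path, O_RDONLY);
  if (fd == -1) return -1;
  ssize_t n = read(fd, buf, len - 1);
  int savedErrno = errno; // close() below may clobber errno
  close(fd);
  if (n == -1) {
    if (savedErrno == EINVAL) return 0; // e.g. a GID index not mapped into the container
    fprintf(stderr, "read %s failed: %s\n", path, strerror(savedErrno));
    return -1;
  }
  buf[n] = '\0';
  *found = 1;
  return 0;
}
```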
"IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); @@ -666,7 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d = 0; d < ncclNIbDevs; d++) { snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + ncclIbDevs[d].portNum, NCCL_IB_LLSTR(ncclIbDevs[d].link)); } char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", @@ -832,6 +844,8 @@ struct ncclIbConnectionMetadata { char devName[MAX_MERGED_DEV_NAME]; uint64_t fifoAddr; int ndevs; + int tc; + int sl; }; enum ncclIbCommState { @@ -873,12 +887,23 @@ struct ncclIbGidInfo { #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; +#define MAX_QPS_PER_REQ 8 +struct ncclProfilerInfo { + void* qpEventHandles[MAX_QPS_PER_REQ]; + int qpIndex[MAX_QPS_PER_REQ]; + int nEventHandles; + ncclProfilerNetIbDescr_v1_t data; +}; + struct ncclIbRequest { struct ncclIbNetCommBase* base; int type; struct ncclSocket* sock; int events[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; +#ifdef NCCL_ENABLE_NET_PROFILING + struct ncclProfilerInfo pInfo[NCCL_NET_IB_MAX_RECVS]; +#endif int nreqs; union { struct { @@ -1084,7 +1109,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc, int tc, int sl) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -1100,7 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? 
ncclParamIbFifoTc() : tc; } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1122,10 +1147,10 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.hop_limit = 255; } } - qpAttr.ah_attr.sl = ncclParamIbSl(); + qpAttr.ah_attr.sl = sl; qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; - TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u sl: %d tc: %d", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port, qpAttr.ah_attr.sl, qpAttr.ah_attr.grh.traffic_class); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1164,12 +1189,13 @@ fail: goto exit; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; + uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; @@ -1199,7 +1225,7 @@ ib_connect_check: // IB Setup struct ncclIbMergedDev* mergedDev; if (dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", dev); return ncclInternalError; } @@ -1305,8 +1331,17 @@ ib_recv_dev_list: devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; + if (link_layer != devInfo->link_layer) { + int ibDev0 = comm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + commDev->base.ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } meta.fifoAddr = (uint64_t)comm->fifo; + meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; + meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; @@ -1332,13 +1367,16 @@ ib_connect: comm->base.nRemDevs = remMeta.ndevs; - int link_layer; - link_layer = remMeta.devs[0].link_layer; - for (int i = 1; i < remMeta.ndevs; i++) { - if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't connect net devices with different link_layer. 
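The `meta.sl`/`meta.tc` expressions above encode a three-level precedence: an explicit `NCCL_IB_SL`/`NCCL_IB_TC` environment setting wins, then the communicator's trafficClass, then the compiled-in default. The same rule isolated below; the constants are stand-ins for the NCCL definitions:

```
#include <cstdio>

const int kTrafficClassUndef = -1; // stand-in for NCCL_NET_TRAFFIC_CLASS_UNDEF
const int kQosDefault = 0;         // stand-in for NCCL_IB_SL_DEFAULT / NCCL_IB_TC_DEFAULT

static int resolveQos(int envValue, int commTrafficClass) {
  if (envValue != -1) return envValue;          // NCCL_IB_SL / NCCL_IB_TC was set
  if (commTrafficClass != kTrafficClassUndef)
    return commTrafficClass;                    // communicator trafficClass
  return kQosDefault;                           // neither was set
}

int main() {
  printf("%d\n", resolveQos(-1, 105));                // 105: config value used
  printf("%d\n", resolveQos(41, 105));                // 41: env var overrides config
  printf("%d\n", resolveQos(-1, kTrafficClassUndef)); // 0: default
  return 0;
}
```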
i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d",
- i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer);
- return ncclInternalError;
+ // Ensure that the remote devices have the same link layer as the local devices used in the connection.
+ if (comm->base.vProps.ndevs > 0) {
+ int ibDev0 = comm->devs[0].base.ibDevN;
+ link_layer = ncclIbDevs[ibDev0].portAttr.link_layer;
+ for (int i = 0; i < remMeta.ndevs; i++) {
+ if (remMeta.devs[i].link_layer != link_layer) {
+ WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA",
+ NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer));
+ return ncclInternalError;
+ }
}
}
@@ -1373,7 +1411,7 @@ ib_connect:
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu);
- NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
+ NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false, remMeta.tc, remMeta.sl), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
}
@@ -1459,6 +1497,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
struct ncclIbCommStage* stage = &lComm->stage;
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
int ready;
+ int link_layer = IBV_LINK_LAYER_UNSPECIFIED;
*recvComm = NULL;
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
@@ -1497,7 +1536,7 @@ ib_recv_dev_list:
ncclNetVDeviceProps_t remoteVProps;
memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t));
if (lComm->dev >= ncclNMergedIbDevs) {
- WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev);
+ WARN("NET/IB : Trying to use non-existent virtual device %d", lComm->dev);
return ncclInternalError;
}
@@ -1555,6 +1594,13 @@ ib_recv:
ibDev = ncclIbDevs + ibDevN;
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail);
+ if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = ibDev->portAttr.link_layer;
+ if (link_layer != ibDev->portAttr.link_layer) {
+ int ibDev0 = rComm->devs[0].base.ibDevN;
+ WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA",
+ ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer));
+ return ncclInternalError;
+ }
}
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
@@ -1562,6 +1608,12 @@ ib_recv:
rComm->base.remDevs[i] = remMeta.devs[i];
rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id;
rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix;
+ if (remMeta.devs[i].link_layer != link_layer) {
+ int ibDev0 = rComm->devs[0].base.ibDevN;
+ WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. 
Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Stripe QP creation across merged devs @@ -1598,7 +1650,7 @@ ib_recv: meta.qpInfo[q].ece_supported = 0; } - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } @@ -1629,7 +1681,7 @@ ib_recv: devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail); } @@ -1646,6 +1698,8 @@ ib_recv: meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; + meta.sl = remMeta.sl; + meta.tc = remMeta.tc; for (int q = 0; q < rComm->base.nqps; q++) { meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; @@ -1842,7 +1896,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1860,6 +1914,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { wr->wr.rdma.remote_addr = slots[r].addr; wr->next = wr + 1; wr_id += (reqs[r] - comm->base.reqs) << (r*8); +#ifdef NCCL_ENABLE_NET_PROFILING + reqs[r]->pInfo[0].nEventHandles = 0; +#endif } // Write size as immediate data. 
In the case of multi-send, only write @@ -1929,6 +1986,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { } struct ibv_send_wr* bad_wr; +#ifdef NCCL_ENABLE_NET_PROFILING + // QP profiling loop + for (int r=0; rpInfo[0].nEventHandles; + reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + reqs[r]->pInfo[0].data.type = ncclProfileQp; + reqs[r]->pInfo[0].data.qp.device = devIndex; + reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; + reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; + reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; + reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + reqs[r]->pInfo[0].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; rbase.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2018,7 +2093,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot)); + NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2109,7 +2184,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2121,6 +2196,9 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->base.sock; req->nreqs = n; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int r = 0; r < n && phandles; r++) req->pInfo[r].nEventHandles = 0; +#endif for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; @@ -2141,6 +2219,19 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* for (int i = 0; i < nqps; i++) { struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); +#ifdef NCCL_ENABLE_NET_PROFILING + // Start a QP event for every request in the multirecv and every qp + for (int r = 0; r < n && phandles; r++) { + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + req->pInfo[r].data.type = ncclProfileQp; + req->pInfo[r].data.qp.device = qp->devIndex; + req->pInfo[r].data.qp.wr_id = wr.wr_id; + req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + req->pInfo[r].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps; 
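The profiling hooks added around the verbs calls bracket each work request: before `ibv_post_send`/`ibv_post_recv` an event is started per QP and its slot recorded (the `qpIndex` ring indexed by `nEventHandles % MAX_QPS_PER_REQ`), and on completion the `qp_num` from the CQ entry is mapped back to that slot (`getReqQpIndex`, further below) so the event can be stopped with a type-1 callback. A standalone sketch of the bookkeeping, with stand-in names:

```
#define MAX_QPS_PER_REQ 8

struct qpEventRing {
  void* handles[MAX_QPS_PER_REQ]; // one profiler event per posted QP
  int   qpIndex[MAX_QPS_PER_REQ]; // which QP each slot was posted on
  int   n;                        // events started so far
};

// Post side: remember the event handle and QP index in the next ring slot
// (wrapping at MAX_QPS_PER_REQ, as the indexing above does).
static void trackQpEvent(qpEventRing* ring, int qpIndex, void* eHandle) {
  ring->qpIndex[ring->n % MAX_QPS_PER_REQ] = qpIndex;
  ring->handles[ring->n % MAX_QPS_PER_REQ] = eHandle;
  ring->n++;
}

// Completion side: a CQ entry only carries qp_num, so scan the recorded slots
// for the QP with that number (qpNumOf stands in for base->qps[i].qp->qp_num).
static int findEventSlot(const qpEventRing* ring, unsigned qpNum,
                         unsigned (*qpNumOf)(int qpIndex)) {
  for (int i = 0; i < MAX_QPS_PER_REQ; i++)
    if (qpNumOf(ring->qpIndex[i]) == qpNum) return i;
  return 0; // mirror the fallback in getReqQpIndex below
}
```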
   }
@@ -2196,6 +2287,16 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**

 #define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)

+#ifdef NCCL_ENABLE_NET_PROFILING
+static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
+  for (int i = 0; i < MAX_QPS_PER_REQ; i++) {
+    int qpIndex = req->pInfo[request].qpIndex[i];
+    if (req->base->qps[qpIndex].qp->qp_num == qpNumber) return i;
+  }
+  return 0;
+}
+#endif
+
 ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
   *done = 0;
@@ -2205,11 +2306,24 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
       TRACE(NCCL_NET, "r=%p done", r);
       *done = 1;
       if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
-        for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i];
+        for (int i=0; i<r->nreqs; i++) {
+          sizes[i] = r->recv.sizes[i];
+#ifdef NCCL_ENABLE_NET_PROFILING
+          for (int j = 0; j < r->pInfo[i].nEventHandles; j++) {
+            NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL));
+          }
+#endif
+        }
       }
       if (sizes && r->type == NCCL_NET_IB_REQ_SEND) {
         sizes[0] = r->send.size;
+#ifdef NCCL_ENABLE_NET_PROFILING
+        for (int j = 0; j < r->pInfo[0].nEventHandles; j++) {
+          NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL));
+        }
+#endif
       }
+      // Stop all remaining QP events for this request
       NCCLCHECK(ncclIbFreeRequest(r));
       return ncclSuccess;
     }
@@ -2264,6 +2378,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
             return ncclInternalError;
           }
           sendReq->events[i]--;
+#ifdef NCCL_ENABLE_NET_PROFILING
+          // Stop QP event for sendReq
+          NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL));
+#endif
         }
       } else {
         if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
@@ -2276,6 +2394,12 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
           }
         }
         req->events[i]--;
+#ifdef NCCL_ENABLE_NET_PROFILING
+        // Stop QP event for workFifo
+        for (int j = 0; j < req->nreqs; j++) {
+          NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL));
+        }
+#endif
       }
     }

     // Once the IB fatal event is reported in the async thread, we want to propagate this error
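The net_ib.cc changes above all follow one pairing: a QP event is opened (state 0) when work is posted to a queue pair, and closed (state 1) when the matching completion is polled in `ncclIbTest`. The sketch below illustrates that contract with simplified stand-in types; the real `ncclProfilerCallback_t` and event-descriptor definitions live in NCCL's profiler headers and differ in detail.

```
/* Sketch only: stand-in types, not NCCL's real definitions. */
#include <stdio.h>

/* Shape of the callback as used above: state 0 starts an event and returns a
 * handle through eHandle; state 1 stops the event identified by eHandle. */
typedef int (*profilerCallback_t)(void** eHandle, int state, void* pHandle,
                                  int pluginId, void* extData);

static int demoCallback(void** eHandle, int state, void* pHandle,
                        int pluginId, void* extData) {
  (void)pHandle; (void)pluginId; (void)extData;
  if (state == 0) { *eHandle = (void*)0x1; printf("event started\n"); }
  else            { printf("event stopped\n"); }
  return 0;
}

int main(void) {
  profilerCallback_t cb = demoCallback;
  void* qpEvent = NULL;
  void* pHandle = NULL;                      /* handle NCCL passed to isend/irecv */
  cb(&qpEvent, 0, pHandle, /*pluginId=*/0, /*descr=*/NULL); /* before posting  */
  /* ... post the work request, poll the completion queue ... */
  cb(&qpEvent, 1, NULL, 0, NULL);            /* after the completion is seen   */
  return 0;
}
```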
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 235dee8..8034d95 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -9,6 +9,7 @@
 #include "socket.h"
 #include "net.h"
 #include "param.h"
+#include "profiler/net_socket.h"

 #include <...>
 #include <...>
@@ -35,7 +36,10 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) {
   return ncclSuccess;
 }

-ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
+static ncclProfilerCallback_t ncclProfilerFunction;
+
+ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
+  ncclProfilerFunction = profFunction;
   if (ncclNetIfs == -1) {
     pthread_mutex_lock(&ncclNetSocketLock);
     if (ncclNetIfs == -1) {
@@ -158,6 +162,11 @@ struct ncclNetSocketTask {
   ncclResult_t result;
 };

+struct ncclProfilerInfo {
+  void* eHandle;
+  void* pHandle;
+};
+
 struct ncclNetSocketRequest {
   int op;
   void* data;
@@ -168,6 +177,7 @@ struct ncclNetSocketRequest {
   struct ncclNetSocketComm* comm;
   struct ncclNetSocketTask* tasks[MAX_SOCKETS];
   int nSubs;
+  struct ncclProfilerInfo pInfo;
 };

 struct ncclNetSocketTaskQueue {
@@ -180,6 +190,7 @@ struct ncclNetSocketThreadResources {
   struct ncclNetSocketTaskQueue threadTaskQueue;
   int stop;
   struct ncclNetSocketComm* comm;
+  struct ncclProfilerInfo* pInfo;
   pthread_mutex_t threadLock;
   pthread_cond_t threadCond;
 };
@@ -210,6 +221,9 @@ void* persistentSocketThread(void *args_) {
   struct ncclNetSocketComm* comm = resource->comm;
   struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
   int nSocksPerThread = comm->nSocks / comm->nThreads;
+#ifdef NCCL_ENABLE_NET_PROFILING
+  void* eHandle[MAX_REQUESTS*MAX_SOCKETS] = { 0 };
+#endif
   while (1) {
     int idle = 1;
     int mark = myQueue->next; // mark newest task seen
@@ -220,13 +234,33 @@ void* persistentSocketThread(void *args_) {
       for (int j=0; j<nSocksPerThread; j++) {
         struct ncclNetSocketTask* r = myQueue->tasks+i+j;
         if (r != NULL && r->used == 1 && r->offset < r->size) {
+#ifdef NCCL_ENABLE_NET_PROFILING
+          if (!eHandle[i+j]) {
+            ncclProfilerNetSockDescr_v1_t data;
+            data.type = ncclProfileSocket;
+            data.sock.fd = r->sock->fd;
+            data.sock.op = r->op;
+            data.sock.length = r->size;
+            ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data);
+          }
+#endif
           r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
           if (r->result != ncclSuccess) {
+#ifdef NCCL_ENABLE_NET_PROFILING
+            ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL);
+            eHandle[i+j] = NULL;
+#endif
             WARN("NET/Socket : socket progress error");
             return NULL;
           }
           idle = 0;
           if (r->offset < r->size) repeat = 1;
+#ifdef NCCL_ENABLE_NET_PROFILING
+          if (repeat == 0) {
+            ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL);
+            eHandle[i+j] = NULL;
+          }
+#endif
         }
       }
     } while (repeat);
@@ -326,7 +360,7 @@ fail:
   goto exit;
 }

-ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
     return ncclInternalError;
   }
@@ -444,7 +478,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi
   return ncclInternalError;
 }

-ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) {
+ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclProfilerInfo* pInfo, int op, void* data, int size, struct ncclNetSocketTask** req) {
   int tid = comm->nextSock % comm->nThreads;
   struct ncclNetSocketThreadResources* res = comm->threadResources+tid;
   struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue;
@@ -457,6 +491,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void*
     NCCLCHECK(ncclCalloc(&queue->tasks, queue->len));
     queue->next = 0;
     res->comm = comm;
+#ifdef NCCL_ENABLE_NET_PROFILING
+    res->pInfo = pInfo;
+#endif
     pthread_mutex_init(&res->threadLock, NULL);
     pthread_cond_init(&res->threadCond, NULL);
     PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create");
@@ -520,7 +557,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
       int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
       while (chunkOffset < r->size) {
         int chunkSize = std::min(taskSize, r->size-chunkOffset);
-        NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
         chunkOffset += chunkSize;
       }
     }
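In the helper-thread path above, events are keyed by task slot (`i+j`): an event is opened lazily the first time a task makes progress and closed either on error or once the task has fully progressed. A toy model of that bookkeeping follows; all names and values are invented for illustration.

```
/* Toy model of the lazy start/stop slot table; all names invented. */
#include <stdio.h>

#define NSLOTS 3

static void* eHandle[NSLOTS];  /* one potential event per task slot */

int main(void) {
  int size[NSLOTS]   = {4, 2, 3};   /* bytes each fake task must move */
  int offset[NSLOTS] = {0, 0, 0};
  int done = 0;
  while (done < NSLOTS) {
    for (int s = 0; s < NSLOTS; s++) {
      if (offset[s] >= size[s]) continue;
      if (!eHandle[s]) {            /* lazy start, as in the diff */
        eHandle[s] = (void*)1;
        printf("slot %d: start event\n", s);
      }
      offset[s]++;                  /* stands in for ncclSocketProgress() */
      if (offset[s] == size[s]) {   /* completed: stop and clear the slot */
        eHandle[s] = NULL;
        printf("slot %d: stop event\n", s);
        done++;
      }
    }
  }
  return 0;
}
```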
@@ -544,6 +581,16 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
       }
     }
   } else { // progress request using main thread
+#ifdef NCCL_ENABLE_NET_PROFILING
+    if (!r->pInfo.eHandle) {
+      ncclProfilerNetSockDescr_v1_t data;
+      data.type = ncclProfileSocket;
+      data.sock.fd = r->ctrlSock->fd;
+      data.sock.op = r->op;
+      data.sock.length = r->size;
+      ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data);
+    }
+#endif
     if (r->offset < r->size) {
       NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
     }
@@ -551,6 +598,10 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
       if (size) *size = r->size;
       *done = 1;
       r->used = 0;
+#ifdef NCCL_ENABLE_NET_PROFILING
+      ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL);
+      r->pInfo.eHandle = NULL;
+#endif
     }
   }
 }
@@ -562,16 +613,26 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v
 }
 ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }

-ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) {
   struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
   NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request));
+#ifdef NCCL_ENABLE_NET_PROFILING
+  // NCCL core profiler callback
+  struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request;
+  req->pInfo.pHandle = phandle;
+#endif
   return ncclSuccess;
 }

-ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
   struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
   if (n != 1) return ncclInternalError;
   NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request));
+#ifdef NCCL_ENABLE_NET_PROFILING
+  // NCCL core profiler callback
+  struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request;
+  if (phandles) req->pInfo.pHandle = phandles[0];
+#endif
   return ncclSuccess;
 }
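For plugin authors, the two socket entry points above show the minimal v10 obligation: accept the extra `phandle`/`phandles` argument and either stash it for later profiler calls or ignore it. Note that `phandles` may be NULL and must be guarded. A do-nothing pair with the same shape is sketched below; the type names are simplified stand-ins, not the real plugin declarations.

```
/* Sketch: v10-shaped entry points that ignore profiling. Type names are
 * simplified stand-ins for the real NCCL plugin declarations. */
#include <stddef.h>

typedef int result_t;  /* stands in for ncclResult_t */

static result_t myIsend(void* sendComm, void* data, size_t size, int tag,
                        void* mhandle, void* phandle, void** request) {
  (void)sendComm; (void)data; (void)size; (void)tag; (void)mhandle;
  (void)phandle;       /* no profiler events: simply ignore the handle */
  *request = NULL;     /* NULL request means "call me again later" */
  return 0;
}

static result_t myIrecv(void* recvComm, int n, void** data, size_t* sizes,
                        int* tags, void** mhandles, void** phandles,
                        void** request) {
  (void)recvComm; (void)n; (void)data; (void)sizes; (void)tags; (void)mhandles;
  (void)phandles;      /* may be NULL, as the guards in net_socket.cc show */
  *request = NULL;
  return 0;
}

int main(void) {
  void* req;
  myIsend(NULL, NULL, 0, 0, NULL, NULL, &req);
  myIrecv(NULL, 1, NULL, NULL, NULL, NULL, NULL, &req);
  return 0;
}
```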
diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
index 3fe25a3..d99f7cb 100644
--- a/src/transport/nvls.cc
+++ b/src/transport/nvls.cc
@@ -108,29 +108,29 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
   return ncclSuccess;
 }

-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
-  CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size));
-  CUCHECK(cuMemUnmap(ptr, size));
-  CUCHECK(cuMemAddressFree(ptr, size));
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) {
+  CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize));
+  CUCHECK(cuMemUnmap(ptr, mcsize));
+  CUCHECK(cuMemAddressFree(ptr, mcsize));
   CUCHECK(cuMemRelease(*mcHandler));
-  INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size);
+  INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d ucsize %ld mcsize %ld", comm->rank, (void*)ptr, dev, ucsize, mcsize);
   return ncclSuccess;
 }

-ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) {
-  INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr);
+ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr, CUmemGenericAllocationHandle* ucHandle, size_t mcsize, void* mcptr, CUmemGenericAllocationHandle* mcHandle) {
+  INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) ucsize %zu MC handle 0x%llx(%p) mcsize %zd", *ucHandle, ucptr, ucsize, *mcHandle, mcptr, mcsize);
   // Release the UC memory and mapping
   if (ucptr) {
-    CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
-    CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
+    CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, ucsize));
+    CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, ucsize));
     CUCHECK(cuMemRelease(*ucHandle));
   }

   // Release the MC memory and mapping
   if (mcptr) {
-    CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
-    CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
+    CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, mcsize));
+    CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, mcsize));
     CUCHECK(cuMemRelease(*mcHandle));
   }

@@ -197,25 +197,27 @@ fail:
   goto exit;
 }

-static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) {
+static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc* desc, size_t size, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr, size_t* ucsizePtr, size_t* mcsizePtr) {
   char shareableHandle[NVLS_HANDLE_SIZE];
   CUmulticastObjectProp mcprop;
   CUmemAllocationProp ucprop;
   ncclResult_t ret = ncclSuccess;
-  size_t size = *sizePtr;
-  size_t originSize = size;
+  size_t mcsize;
+  size_t ucsize;
   size_t ucgran, mcgran;
   int allocMcHandle = 0;

+  mcsize = ucsize = size;
   *ucptr = *mcptr = NULL;
+  memset(shareableHandle, '\0', sizeof(shareableHandle));
   memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
   mcprop.numDevices = comm->localRanks;
   mcprop.handleTypes = ncclCuMemHandleType;
   mcprop.flags = 0;
   mcprop.size = size;
-  CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail);
-  ALIGN_SIZE(size, mcgran);
-  *sizePtr = mcprop.size = size;
+  CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+  ALIGN_SIZE(mcsize, mcgran);
+  mcprop.size = mcsize;

   if (comm->localRank == 0) {
     NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail);
@@ -235,26 +237,29 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
   ucprop.location.id = comm->cudaDev;
   ucprop.requestedHandleTypes = ncclCuMemHandleType;
   CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
-  // Map a VA for UC memory
-  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail);
+  ALIGN_SIZE(ucsize, ucgran);
+  // Map a VA for UC memory with MC alignment and size
+  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), ret, fail);

   // Alloc local physical mem for this NVLS group
-  CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail);
-  CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail);
-  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail);
-  CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail);
+  CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail);
+  CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail);
+  CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail);

   // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort
   NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
   // Bind physical memory to the Multicast group
   // NB: It will block until all ranks have been added to the Group
-  CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail);
+  CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail);

   // Map mc virtual address
-  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail);
-  CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail);
-  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail);
-  INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize);
+  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail);
+  CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, mcsize, 0, *mcHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, mcsize, desc, 1), ret, fail);
+  *ucsizePtr = ucsize;
+  *mcsizePtr = mcsize;
+  INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld ucsize %ld mcsize %ld (inputsize %ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, ucsize, mcsize, size);

 exit:
   return ret;
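The allocation path now rounds the requested size twice, once to the UC allocation granularity and once to the MC granularity, and reports both results instead of overwriting the caller's size. A worked example of that arithmetic follows; the granularity values are invented, and the macro mirrors the rounding `ALIGN_SIZE` performs above.

```
/* Worked example of the two-size rounding; granularity values are invented. */
#include <stdio.h>
#include <stddef.h>

#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);

int main(void) {
  size_t size   = 5u << 20;  /* caller asks for 5 MiB */
  size_t ucgran = 2u << 20;  /* say UC granularity is 2 MiB */
  size_t mcgran = 4u << 20;  /* say MC granularity is 4 MiB */
  size_t ucsize = size, mcsize = size;
  ALIGN_SIZE(ucsize, ucgran);  /* 5 MiB -> 6 MiB */
  ALIGN_SIZE(mcsize, mcgran);  /* 5 MiB -> 8 MiB */
  printf("ucsize=%zu MiB mcsize=%zu MiB\n", ucsize >> 20, mcsize >> 20);
  return 0;
}
```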
@@ -273,6 +278,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
   size_t nvlsTotalSize = 0;
   struct ncclNvlsSharedRes* resources = NULL;
   int nChannels = -1;
+  cudaStream_t deviceStream, hostStream;

   if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess;
   // initialize after checking comm->nvlsSupport
@@ -288,10 +294,10 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
   INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize);

-  NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail);
-  resources->buffSize = nvlsTotalSize;
+  NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail);

-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail);
   for (int h = 0; h < nHeads; h++) {
     int nvlsPeer = comm->nRanks + 1 + h;
     for (int c = 0; c < nChannels; c++) {
@@ -306,15 +312,16 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
       peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize;
       peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize;

-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
     }
   }

-  NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail);
-  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail);

   // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer
   NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail);
   comm->nvlsResources->inited = true;
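The `ncclStreamWaitStream(deviceStream, hostStream, scratchEvent)` call that replaces `ncclStrongStreamWaitStream` presumably reduces to the classic event-based stream ordering. In plain CUDA runtime terms the pattern is:

```
/* The generic CUDA pattern behind "stream A waits for stream B": record an
 * event at the producer's tail, then make the consumer wait on it. Error
 * checking omitted; this illustrates the synchronization pattern and is not
 * NCCL's implementation. */
#include <cuda_runtime.h>

void streamWaitStream(cudaStream_t consumer, cudaStream_t producer,
                      cudaEvent_t scratch) {
  cudaEventRecord(scratch, producer);         /* mark producer's current tail */
  cudaStreamWaitEvent(consumer, scratch, 0);  /* consumer blocks until then   */
}
```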
@@ -374,6 +381,7 @@ setup:
   size_t memSize = 64;
   size_t creditSize = nChannels * 2 * memSize * nHeads;
   int nvlsStepSize = comm->nvlsChunkSize;
+  cudaStream_t hostStream, deviceStream;

   NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
   comm->nvlsResources->inited = false;
@@ -398,11 +406,11 @@ setup:
   resources->accessDesc.location.id = comm->cudaDev;
   resources->dev = comm->cudaDev;

-  NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail);
-  resources->creditSize = creditSize;
+  NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit, &resources->creditUCSize, &resources->creditMCSize), res, fail);

   // Set up head and tail only for now
-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail);
   for (int h = 0; h < nHeads; h++) {
     int nvlsPeer = comm->nRanks + 1 + h;
     for (int c = 0; c < nChannels; c++) {
@@ -440,14 +448,15 @@ setup:
       peer->send[0].conn.stepSize = nvlsStepSize;
       peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;

-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
     }
   }

-  NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail);
-  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail);
 }

 // MNNVL does not support NVLS buffer registration
@@ -488,13 +497,13 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));

   if (resources->ucCredit || resources->mcCredit) {
-    NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
-    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
+    NCCLCHECK(nvlsGroupUnbind(comm, resources->creditUCSize, &resources->mcCreditHandle));
+    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditUCSize, resources->ucCredit, &resources->ucCreditHandle, resources->creditMCSize, resources->mcCredit, &resources->mcCreditHandle));
   }

   if (comm->nvlsResources->inited) {
-    NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle));
-    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle));
+    NCCLCHECK(nvlsGroupUnbind(comm, resources->buffUCSize, &resources->mcBuffHandle));
+    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffUCSize, resources->ucBuff, &resources->ucBuffHandle, resources->buffMCSize, resources->mcBuff, &resources->mcBuffHandle));
   }

   free(resources);
   comm->nvlsResources = NULL;
@@ -513,7 +522,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   size_t minSize = SIZE_MAX;
   struct localRegData* regData = NULL;
   cudaPointerAttributes attr;
-  size_t ucgran, mcgran;
+  size_t ucgran, mcgran, ucsize, mcsize;

   NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);

@@ -538,13 +547,12 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
     CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
     CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);

-    if (regSize % mcgran == 0) {
-      regRecord->regSize = regSize;
-    } else {
-      regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr);
-    }
-
-    if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) {
+    if (regRecord->addr % ucgran == 0) {
+      if (regSize % ucgran != 0) {
+        regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran);
+      } else {
+        regRecord->regUCSize = regSize;
+      }
       regRecord->state |= NVLS_REG_POSSIBLE;
       memcpy(&regData[comm->localRank].reg, regRecord, sizeof(struct ncclReg));
       regData[comm->localRank].offset = userBuff - regRecord->addr;
@@ -564,13 +572,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
       goto fail;
     }
     /* get minimal reg size of nvls buffers */
-    if (minSize > regData[i].reg.regSize)
-      minSize = regData[i].reg.regSize;
+    if (minSize > regData[i].reg.regUCSize)
+      minSize = regData[i].reg.regUCSize;
   }

   /* start registration */
+  mcsize = ucsize = minSize;
   mcprop.size = minSize;
   CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+  ALIGN_SIZE(mcsize, mcgran);
+  mcprop.size = mcsize;
+
   if (comm->localRank == 0) {
     NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail);
     NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
@@ -583,16 +595,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   // Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked
   // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
   // coverity[var_deref_op]
-  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
+  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail);

   // Create a VA for the NVLS
-  CUCHECKGOTO(cuMemAddressReserve(&regPtr, minSize, mcgran, 0U, 0), ret, fail);
+  CUCHECKGOTO(cuMemAddressReserve(&regPtr, mcsize, mcgran, 0U, 0), ret, fail);

   // Map the VA locally
-  CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail);
-  CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
+  CUCHECKGOTO(cuMemMap(regPtr, mcsize, 0, mcHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess(regPtr, mcsize, &comm->nvlsResources->accessDesc, 1), ret, fail);

   regRecord->regAddr = regPtr;
-  regRecord->regSize = minSize;
+  regRecord->regUCSize = ucsize;
+  regRecord->regMCSize = mcsize;
   regRecord->dev = comm->nvlsResources->dev;
   regRecord->mcHandle = mcHandle;
   regRecord->state |= NVLS_REG_COMPLETE;
@@ -706,7 +719,7 @@ exit:
   return ncclSuccess;
 fail:
   regBufUsed = 0;
-  WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize);
+  INFO(NCCL_REG, "rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize);
   goto exit;
 }

@@ -843,7 +856,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
   return ncclSuccess;
 }

-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) {
   return ncclSuccess;
 }
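Registration-side sizing in `tryRegisterBuffer` follows the same two-size scheme: each local rank reports a UC-aligned size, the group takes the minimum, binds the multicast object over that UC size, and reserves and maps the VA over the MC-aligned size. A small worked example with invented numbers:

```
/* Worked example: min-reduce the per-rank UC sizes, then MC-align the result.
 * All values are invented for illustration. */
#include <stdio.h>
#include <stddef.h>

#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);

int main(void) {
  size_t regUCSize[3] = {6u << 20, 4u << 20, 8u << 20}; /* per-rank UC sizes */
  size_t minSize = (size_t)-1;
  for (int i = 0; i < 3; i++)
    if (regUCSize[i] < minSize) minSize = regUCSize[i];  /* -> 4 MiB */
  size_t mcgran = 3u << 20;            /* example MC granularity */
  size_t ucsize = minSize;             /* used for cuMulticastBindAddr */
  size_t mcsize = minSize;
  ALIGN_SIZE(mcsize, mcgran);          /* -> 6 MiB, used for reserve/map */
  printf("ucsize=%zu MiB mcsize=%zu MiB\n", ucsize >> 20, mcsize >> 20);
  return 0;
}
```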
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index dac7621..aed84c5 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -407,6 +407,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
          comm->peerInfo[intermediateRank].nvmlDev, useReadStr);
   }

+  memset(&req, '\0', sizeof(req));
   req.size = sendSize;
   req.refcount = 0;
   if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
@@ -466,6 +467,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     info->rank = intermediateRank;
   }

+  memset(&req, '\0', sizeof(req));
   req.size = recvSize;
   req.refcount = 0;
   if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
@@ -527,7 +529,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn

   if (useMemcpy) {
     // Attach to peer's SHM segment
-    NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));
+    NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));

     recv->conn.tail = &resources->devShm->recvMem.tail;
     recv->conn.head = &resources->devShm->sendMem.head;
@@ -634,7 +636,7 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
     // Create a SHM segment for the peer to attach to
     shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
-    NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
+    NCCLCHECK(ncclShmAllocateShareableBuffer(shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
     NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));

     memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
@@ -805,7 +807,7 @@ static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size
   ncclResult_t ret = ncclSuccess;
   struct ncclIpcRegInfo* newInfo = NULL;
   uintptr_t* peerRmtAddrs = NULL;
-  bool legacyIpcCap = false;
+  int legacyIpcCap = 0;
   size_t baseSize = 0;
   void* baseAddr = NULL;
   bool needUpdate = false;
@@ -916,13 +918,16 @@ ncclResult_t ret = ncclSuccess;
   if (type == NCCL_IPC_COLLECTIVE) {
     // for collective, store registered remote buffers into dev memory for future reference
     if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
-      NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+      cudaStream_t hostStream, deviceStream;
+      NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
+      NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
       if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
-        NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+        NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
       if (needUpdate)
-        NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
-      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
+        NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
+      NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), ret, fail);
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail);
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), ret, fail);
     }
     peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
   } else {
@@ -941,7 +946,7 @@ fail:
   *offsetOut = 0;
   *peerRmtAddrsOut = NULL;
   if (newInfo) free(newInfo);
-  WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc);
+  INFO(NCCL_REG, "rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %d type %s", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc ? *isLegacyIpc : -1, ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR ? "POSIX_FD" : "FABRIC");
   goto exit;
 }
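The shm signature changes threading through p2p.cc (allocation no longer takes a proxy rank, import now does) work because the exporter's rank travels inside the connect info, as the shm.cc hunks further below show. A toy version of that handshake, with all names invented:

```
/* Toy version of the flow: the exporter stamps its rank into the connect
 * info; the importer passes that rank to the import call so the fd
 * conversion request reaches the right proxy. All names invented. */
#include <stdio.h>

struct toyConnectInfo { int rank; int handle; };

static void exporterSetup(struct toyConnectInfo* info, int myRank) {
  info->rank = myRank;    /* mirrors `info->rank = comm->rank;` in shm.cc */
  info->handle = 42;      /* stands in for the shareable buffer descriptor */
}

static void importerConnect(const struct toyConnectInfo* info) {
  /* mirrors ncclShmImportShareableBuffer(comm, info->rank, &info->desc, ...) */
  printf("import handle %d via proxy of rank %d\n", info->handle, info->rank);
}

int main(void) {
  struct toyConnectInfo info;
  exporterSetup(&info, 3);
  importerConnect(&info);
  return 0;
}
```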
diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc
new file mode 100644
index 0000000..3e32843
--- /dev/null
+++ b/src/transport/profiler.cc
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "transport.h"
+#include "proxy.h"
+#include "profiler.h"
+
+static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  connection->proxyAppendPtr = &connection->proxyAppend;
+  connection->shared = 1;
+  return ncclSuccess;
+}
+
+// The following ncclProxySubArgs are overloaded by the profiler progress function:
+// - base       : is set to the current value of workCounter[channelId]
+// - posted     : is set to sub->nsteps to indicate that the profiler has started the event
+// - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event
+static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
+  if (args->state == ncclProxyOpReady) {
+    for (int s = 0; s < args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs + s;
+      sub->base = sub->workCounter;
+      sub->posted = sub->transmitted = 0;
+    }
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    for (int s = 0; s < args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs + s;
+      uint64_t* workStarted = (uint64_t *)sub->sendbuff;
+      uint64_t* workCompleted = (uint64_t *)sub->recvbuff;
+      if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) {
+        ncclProfilerStartKernelChEvent(args, s);
+        sub->posted = sub->nsteps;
+        continue; // allow events on every channel to start
+      }
+      if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) {
+        ncclProfilerStopKernelChEvent(args, s);
+        sub->transmitted = sub->nsteps;
+        args->done++;
+      }
+    }
+    if (args->done == args->nsubs) args->state = ncclProxyOpNone;
+  }
+  return ncclSuccess;
+}
+
+struct ncclTransport profilerTransport = {
+  "Prof",
+  NULL,
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL }
+};
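`profilerProxyProgress` above reads two monotonic per-channel counters that the kernel advances (`workStarted` via `sub->sendbuff` and `workCompleted` via `sub->recvbuff`) and fires the start and stop of a kernel-channel event once each counter catches up with the `base` snapshot taken when the op became ready. A reduced model of that polling loop, with invented values:

```
/* Reduced model of profilerProxyProgress: two monotonically increasing
 * per-channel counters written by the kernel (started/completed), polled
 * against the baseline captured when the op became ready. Values invented. */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  uint64_t base = 7;                 /* sub->base = workCounter at ready time */
  uint64_t workStarted = 6, workCompleted = 5;
  int posted = 0, transmitted = 0;
  while (!transmitted) {
    workStarted++; workCompleted++;  /* the kernel makes progress */
    if (!posted && base <= workStarted) { posted = 1; printf("start event\n"); }
    if (!transmitted && base <= workCompleted) { transmitted = 1; printf("stop event\n"); }
  }
  return 0;
}
```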
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index d2d6906..aa3e6c4 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -18,6 +18,7 @@ struct shmBuffInfo {
 };

 struct shmConnectInfo {
+  int rank;
   ncclShmIpcDesc_t desc;
   struct shmBuffInfo buf;
 };
@@ -120,6 +121,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn));
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
+  info->rank = comm->rank;

   resources->hostMem = (struct ncclSendMem*)info->buf.hptr;
   resources->devHostMem = (struct ncclSendMem*)info->buf.dptr;
@@ -150,6 +152,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn));
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
+  info->rank = comm->rank;

   resources->hostMem = (struct ncclRecvMem*)info->buf.hptr;
   resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr;
@@ -163,7 +166,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
   struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
   char* buff;

-  NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));

   buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ ... @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
-  NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));

   buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ ... @@ static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, st
-  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
   memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
   connection->transportResources = proxyInfo;
 exit:
@@ -485,7 +488,7 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st

   struct shmProxyInfo* proxyInfo;
   NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
   memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
   connection->transportResources = proxyInfo;
 exit:
@@ -517,9 +520,9 @@ static void initCeOperation() {
   }
 }

-ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
-  if (desc == NULL || hptr == NULL || tpProxyRank < -1) {
-    WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank);
+ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
+  if (desc == NULL || hptr == NULL) {
+    WARN("Invalid argument desc %p, hptr %p", desc, hptr);
     return ncclInvalidArgument;
   }
 #if CUDART_VERSION >= 12020
@@ -532,7 +535,6 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
     if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
       // Return the native cuMem handle for later Export/Import via UDS
       memcpy(&desc->shmci.data, &handle, sizeof(handle));
-      desc->shmci.tpProxyRank = tpProxyRank;
     } else {
       CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0));
     }
@@ -560,7 +562,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
   return ncclSuccess;
 }

-ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
   if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) {
     WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut);
     return ncclInvalidArgument;
@@ -584,7 +586,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_
     // UDS fd support
     int fd = -1;
     // Send cuMem handle to remote for conversion to an fd
-    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd));
+    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, proxyRank, &desc->shmci.data, &fd));
     CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
     (void) close(fd);
   } else {
@@ -625,7 +627,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_
     descOut->shmci.ptr = *hptr = (void *)hostptr;
     descOut->legacy = false;
     if (dptr) *dptr = (void *)hostptr;
-    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
+    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from proxyRank %d size %zi ptr %p, granularity %ld", proxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
   } else {
     char shmPath[SHM_PATH_MAX];
     snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);