From f44ac759fee12ecb3cc6891e9e739a000f66fd70 Mon Sep 17 00:00:00 2001
From: Kamil Iskra
Date: Wed, 12 Mar 2025 13:46:21 -0700
Subject: [PATCH] NCCL 2.26.2-1

Profiler improvements
 * Add events for CUDA kernel start and end.
 * Allow network plugins to generate profiling events (see the sketch
   following these notes).
 * Enable profiling on a per-operation basis, rather than per-communicator.
 * Add support for graph capturing.

Add implicit launch order
 * Prevent deadlocks when using multiple NCCL communicators per device by
   implicitly ordering NCCL operations using the host program order.
   Disabled by default; set NCCL_LAUNCH_ORDER_IMPLICIT=1 to enable.
 * Add a complementary mechanism to detect host threads racing to launch
   to the same device. Enabled by default; set NCCL_LAUNCH_RACE_FATAL=0
   to disable.

Optimize the PAT algorithm
 * Separate the computation and execution of PAT steps onto different
   warps, allowing up to 16 PAT steps to run in parallel, which
   significantly accelerates PAT and reduces its linear part.

Add support for setting QoS per communicator
 * Add a new trafficClass field to the communicator configuration to allow
   the application to select a particular traffic class for a given
   communicator. The meaning of the traffic class is network-specific and
   should be set in accordance with the network configuration (see the
   sketch following these notes).
 * For the IB/RoCE plugin, existing config variables such as NCCL_IB_SL
   and NCCL_IB_TC take precedence.

Allow enabling GPU Direct RDMA specifically on C2C platforms
 * Disabled by default; set NCCL_NET_GDR_C2C=1 to enable.

Do not disable user buffer registration unless PXN is actually in use
 * Only disable UB when a communicator has more than one rank per node on
   any node.

RAS subsystem improvements
 * Report operation counts separately for each collective operation type.
 * Provide details about missing communicator ranks and reliably
   distinguish ranks that are no longer a given communicator's members
   (now reported as NOCOMM) from those that failed to respond.

Add support for timestamps in NCCL diagnostic messages
 * On by default for WARN messages; NCCL_DEBUG_TIMESTAMP_LEVELS can be
   used to enable them for other debug levels as well.
 * The format can be changed using the NCCL_DEBUG_TIMESTAMP_FORMAT config
   variable.

Reduce the memory usage with NVLink SHARP (NVLS)
 * Potentially save hundreds of MBs of device memory by considering the
   multicast buffer size granularity separately from the address
   alignment.

Update performance tuning for recent Intel CPUs
 * Improve algorithm/protocol selection on recent CPUs such as Emerald
   Rapids and Sapphire Rapids.

Improve channel scheduling when mixing LL and Simple operations
 * Make LL operations account for 4x more traffic to ensure LL and Simple
   operations complete at the same time.

Refactor the plugin code
 * Clean up and harmonize the support code across the network, tuner, and
   profiler plugins.

Add support for comment lines (starting with #) in the nccl.conf file
 * Issue #1540 (see the sketch following these notes).

Make user buffer registration problems print an INFO instead of a WARN.

Drop support for network plugin interface version 5.

Fix a race condition with split-shared communicators
 * NCCL could hang during connection setup if multiple communicators that
   share resources were grouped together.

Fix a performance regression when using NCCL_CROSS_NIC=1
 * NCCL would unnecessarily alternate rings, breaking the GPU-NIC
   associations.

Make the GID index detection code more resilient
 * The dynamic GID detection code was giving up too soon if the detected
   index was not available (e.g., was not mapped into the container's
   sysfs).
 * Issues #1538, #1573.

Fix a race condition with non-blocking operations
 * Fix an issue when creating a non-blocking communicator after a
   non-blocking collective operation on another communicator.

Fix shared memory usage on recent Blackwell GPUs.
 * Issues NVIDIA/nccl-tests#287, NVIDIA/nccl-tests#291, #1637.

Fix an error with NIC fusion and IB SHARP when recreating communicators
 * Disable the unloading of network plugins.

Make auto-merge failures in NIC fusion non-fatal
 * These could happen when trying to merge IB and RoCE devices.

Fixes to ncclCommAbort
 * Fix hangs caused by the progress thread spinning indefinitely on
   network progress.
 * Reduce the abort time by up to two orders of magnitude.

Fix a crash when libnccl.so was dynamically unloaded
 * The RAS subsystem was missing a clean-up handler.

Fix a hang if the network plugin's test() call returns an error.

Fix a hang on heterogeneous architectures
 * Harmonize the tuning across ranks to avoid divergent tuning choices,
   which could cause a hang.

Fix a double-free on failed ncclCommInitRank and ncclCommFinalize.

Fix a potential list traversal bug during a group launch of multiple
communicators
 * Issue #1599.

Unify the handling of NCCL configuration variables
 * Under rare circumstances, some variables specified in the config file
   could be ignored.
---
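The per-communicator QoS setting above would be used roughly as follows. This
is a minimal sketch, assuming the new trafficClass field is exposed through
ncclConfig_t as the notes describe; the value 105 is an arbitrary placeholder
whose interpretation is network-specific.

```
#include <nccl.h>

/* Minimal sketch: request a QoS level for one communicator. Assumes the new
 * trafficClass field is exposed via ncclConfig_t; rank, nRanks, and id come
 * from the usual bootstrap (ncclGetUniqueId plus an out-of-band exchange). */
ncclResult_t initCommWithQos(int rank, int nRanks, ncclUniqueId id, ncclComm_t* comm) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.trafficClass = 105;  /* placeholder; meaning is network-specific */
  /* For the IB/RoCE plugin, NCCL_IB_SL / NCCL_IB_TC take precedence if set. */
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}
```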
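Similarly, the network-plugin profiling hook follows the
ncclProfilerCallback_t typedef added to net.h in this patch. Below is a hedged
sketch of how a v10 plugin might record a network-defined event around isend;
the type, pluginId, and extData values are placeholders, since their
conventions are not defined in this excerpt.

```
/* Hedged sketch of a v10 net plugin recording a plugin-defined event. Types
 * come from nccl/net.h and __hidden is defined as in ext-net/example; the
 * callback signature matches the typedef added in this patch, but the
 * type/pluginId/extData values below are placeholders, not NCCL constants. */
static ncclProfilerCallback_t profCb;

__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction,
                                 ncclProfilerCallback_t profFunction) {
  profCb = profFunction;  /* may be NULL when profiling is disabled */
  return ncclSuccess;
}

__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size,
                                  int tag, void* mhandle, void* phandle,
                                  void** request) {
  void* eHandle = NULL;
  if (profCb && phandle)
    (void)profCb(&eHandle, /*type=*/0, phandle, /*pluginId=*/0, /*extData=*/NULL);
  /* ...post the send; keep eHandle with the request so the event can be
   * closed when test() reports completion... */
  return ncclInternalError;  /* stub, as in the example plugin */
}
```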
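The nccl.conf comment support combines with the new timestamp variables as
sketched here; the NCCL_DEBUG_TIMESTAMP_LEVELS value is an assumption about
the accepted level names, not taken from this patch.

```
# Lines starting with '#' are now treated as comments (issue #1540).
NCCL_DEBUG=WARN
# Hypothetical: extend timestamps (on by default for WARN only) to INFO.
NCCL_DEBUG_TIMESTAMP_LEVELS=INFO
```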
 ext-net/README.md                           |   30 +-
 ext-net/example/nccl/net.h                  |   13 +-
 ext-net/example/nccl/net_device.h           |    3 +-
 ext-net/example/nccl/net_v10.h              |  101 ++
 ext-net/example/nccl/net_v2.h               |    4 +-
 ext-net/example/nccl/net_v3.h               |    4 +-
 ext-net/example/nccl/net_v4.h               |    4 +-
 ext-net/example/nccl/net_v5.h               |    4 +-
 ext-net/example/nccl/net_v6.h               |    6 +-
 ext-net/example/nccl/net_v7.h               |    6 +-
 ext-net/example/nccl/net_v8.h               |    6 +-
 ext-net/example/nccl/net_v9.h               |   12 +-
 ext-net/example/plugin.c                    |   77 +-
 makefiles/common.mk                         |    5 +
 makefiles/version.mk                        |    4 +-
 src/Makefile                                |   11 +-
 src/bootstrap.cc                            |   16 +-
 src/channel.cc                              |   37 +-
 src/debug.cc                                |  159 ++-
 src/device/all_gather.h                     |   62 +-
 src/device/all_reduce.h                     |   10 +-
 src/device/broadcast.h                      |    2 +-
 src/device/common.h                         |   37 +-
 src/device/primitives.h                     |   16 +-
 src/device/prims_ll.h                       |   17 +-
 src/device/prims_ll128.h                    |   17 +-
 src/device/prims_simple.h                   |  399 ++++---
 src/device/reduce_scatter.h                 |   61 +-
 src/device/sendrecv.h                       |    2 +-
 src/enqueue.cc                              |  240 ++--
 src/graph/connect.cc                        |    2 +-
 src/graph/paths.cc                          |   82 +-
 src/graph/search.cc                         |   68 +-
 src/graph/topo.cc                           |  114 +-
 src/graph/topo.h                            |   38 +-
 src/graph/tuning.cc                         |    3 +-
 src/group.cc                                |   64 +-
 src/include/bitops.h                        |   53 +-
 src/include/collectives.h                   |  450 +++----
 src/include/comm.h                          |   10 +-
 src/include/device.h                        |    8 +-
 src/include/graph.h                         |   16 +-
 src/include/group.h                         |    6 +
 src/include/nccl_net.h                      |  604 ----------
 src/include/nccl_profiler.h                 |  235 ----
 src/include/nccl_tuner.h                    |  149 ---
 src/include/net.h                           |    1 -
 src/include/net_device.h                    |    3 +-
 src/include/nvtx.h                          |    3 +-
 src/include/plugin/nccl_net.h               |   54 +
 src/include/plugin/nccl_profiler.h          |   69 ++
 src/include/plugin/nccl_tuner.h             |   22 +
 src/include/plugin/net/net_v10.h            |  158 +++
 src/include/plugin/net/net_v6.h             |  113 ++
 src/include/plugin/net/net_v7.h             |  120 ++
 src/include/plugin/net/net_v8.h             |  134 +++
 src/include/plugin/net/net_v9.h             |  152 +++
 src/include/plugin/plugin.h                 |   18 +
 src/include/plugin/profiler/net_ib.h        |   13 +
 src/include/plugin/profiler/net_ib_v1.h     |   34 +
 src/include/plugin/profiler/net_socket.h    |   13 +
 src/include/plugin/profiler/net_socket_v1.h |   32 +
 src/include/plugin/profiler/profiler_v1.h   |  107 ++
 src/include/plugin/profiler/profiler_v2.h   |  104 ++
 src/include/plugin/profiler/profiler_v3.h   |  112 ++
 src/include/plugin/tuner/tuner_v2.h         |   53 +
 src/include/plugin/tuner/tuner_v3.h         |   55 +
 src/include/plugin/tuner/tuner_v4.h         |   56 +
 src/include/profiler.h                      |   20 +
 src/include/proxy.h                         |    7 +-
 src/include/ras.h                           |    2 +
 src/include/register.h                      |    2 +-
 src/include/shm.h                           |    5 +-
 src/include/socket.h                        |    2 +-
 src/include/strongstream.h                  |   98 +-
 src/include/transport.h                     |   10 +-
 src/init.cc                                 |  123 +-
 src/misc/ipcsocket.cc                       |    3 +-
 src/misc/param.cc                           |    1 +
 src/misc/socket.cc                          |   11 +-
 src/misc/strongstream.cc                    |  493 ++++----
 src/misc/tuner.cc                           |  267 -----
 src/nccl.h.in                               |    4 +-
 src/net.cc                                  | 1033 -----------------
 src/plugin/net.cc                           |  319 +++++
 src/plugin/net/net_v10.cc                   |   32 +
 src/plugin/net/net_v6.cc                    |  178 +++
 src/plugin/net/net_v7.cc                    |  174 +++
 src/plugin/net/net_v8.cc                    |  196 ++++
 src/plugin/net/net_v9.cc                    |  121 ++
 src/plugin/plugin_open.cc                   |  134 +++
 src/{misc => plugin}/profiler.cc            |  426 +++----
 src/plugin/profiler/profiler_v1.cc          |  133 +++
 src/plugin/profiler/profiler_v2.cc          |   45 +
 src/plugin/profiler/profiler_v3.cc          |   20 +
 src/plugin/tuner.cc                         |   99 ++
 src/plugin/tuner/tuner_v2.cc                |   66 ++
 src/plugin/tuner/tuner_v3.cc                |   38 +
 src/plugin/tuner/tuner_v4.cc                |   22 +
 src/proxy.cc                                |   69 +-
 src/ras/client_support.cc                   |  851 ++++++++------
 src/ras/collectives.cc                      |  720 ++++++++----
 src/ras/peers.cc                            |  192 ++-
 src/ras/ras.cc                              |  182 +--
 src/ras/ras_internal.h                      |  139 ++-
 src/ras/rasnet.cc                           | 1158 +++++++++++--------
 src/register/register.cc                    |    4 +-
 src/transport.cc                            |   18 +-
 src/transport/coll_net.cc                   |  104 +-
 src/transport/net.cc                        |   78 +-
 src/transport/net_ib.cc                     |  186 ++-
 src/transport/net_socket.cc                 |   73 +-
 src/transport/nvls.cc                       |  147 +--
 src/transport/p2p.cc                        |   23 +-
 src/transport/profiler.cc                   |   55 +
 src/transport/shm.cc                        |   24 +-
 116 files changed, 7522 insertions(+), 5278 deletions(-)
 create mode 100644 ext-net/example/nccl/net_v10.h
 delete mode 100644 src/include/nccl_net.h
 delete mode 100644 src/include/nccl_profiler.h
 delete mode 100644 src/include/nccl_tuner.h
 create mode 100644 src/include/plugin/nccl_net.h
 create mode 100644 src/include/plugin/nccl_profiler.h
 create mode 100644 src/include/plugin/nccl_tuner.h
 create mode 100644 src/include/plugin/net/net_v10.h
 create mode 100644 src/include/plugin/net/net_v6.h
 create mode 100644 src/include/plugin/net/net_v7.h
 create mode 100644 src/include/plugin/net/net_v8.h
 create mode 100644 src/include/plugin/net/net_v9.h
 create mode 100644 src/include/plugin/plugin.h
 create mode 100644 src/include/plugin/profiler/net_ib.h
 create mode 100644 src/include/plugin/profiler/net_ib_v1.h
 create mode 100644 src/include/plugin/profiler/net_socket.h
 create mode 100644 src/include/plugin/profiler/net_socket_v1.h
 create mode 100644 src/include/plugin/profiler/profiler_v1.h
 create mode 100644 src/include/plugin/profiler/profiler_v2.h
 create mode 100644 src/include/plugin/profiler/profiler_v3.h
 create mode 100644 src/include/plugin/tuner/tuner_v2.h
 create mode 100644 src/include/plugin/tuner/tuner_v3.h
 create mode 100644 src/include/plugin/tuner/tuner_v4.h
 delete mode 100644 src/misc/tuner.cc
 delete mode 100644 src/net.cc
 create mode 100644 src/plugin/net.cc
 create mode 100644 src/plugin/net/net_v10.cc
 create mode 100644 src/plugin/net/net_v6.cc
 create mode 100644 src/plugin/net/net_v7.cc
 create mode 100644 src/plugin/net/net_v8.cc
 create mode 100644 src/plugin/net/net_v9.cc
 create mode 100644 src/plugin/plugin_open.cc
 rename src/{misc => plugin}/profiler.cc (57%)
 create mode 100644 src/plugin/profiler/profiler_v1.cc
 create mode 100644 src/plugin/profiler/profiler_v2.cc
 create mode 100644 src/plugin/profiler/profiler_v3.cc
create mode 100644 src/plugin/tuner.cc create mode 100644 src/plugin/tuner/tuner_v2.cc create mode 100644 src/plugin/tuner/tuner_v3.cc create mode 100644 src/plugin/tuner/tuner_v4.cc create mode 100644 src/transport/profiler.cc diff --git a/ext-net/README.md b/ext-net/README.md index aa1a394..90fe89b 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,20 +60,20 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v9) +# API (v10) -Below is the main `ncclNet_v9` struct. Each function is explained in later sections. +Below is the main `ncclNet_v10` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,13 +83,13 @@ typedef struct { // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -98,10 +98,10 @@ typedef struct { ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -200,6 +200,9 @@ the plugin code adding the following definitions: #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) ``` +The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and +record its own events with the NCCL profiler plugin. + `devices` Once the plugin is initialized, NCCL will query the number of devices available. It should not @@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. +The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field. +This field can be used by the network plugin to specify the QoS level of the connection. By default, +`trafficClass` is set to -1 but can be configured by the application during communicator initialization +to select a plugin-supported QoS level. + `closeListen`/`closeSend`/`closeRecv` Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call @@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later. +The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin +to support network defined events. + `irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument @@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to completions on such irecvs (for example, complete the request immediately). The plugin is still expected to set a valid request pointer on return which NCCL can poll to check for completion. +The `pHandle` argument allows NCCL to pass an array of opaque handles that can be used by the +network plugin to support network defined events. + Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 112967a..85ea79e 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -2,14 +2,15 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ +#ifndef NET_H_ +#define NET_H_ #include #include #include "common.h" #include "err.h" +#include "net_device.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -22,6 +23,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + +#include "net_v10.h" #include "net_v9.h" #include "net_v8.h" #include "net_v7.h" @@ -31,4 +35,9 @@ #include "net_v3.h" #include "net_v2.h" +typedef ncclNet_v10_t ncclNet_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index 874fb59..d693101 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h new file mode 100644 index 0000000..809e7c0 --- /dev/null +++ b/ext-net/example/nccl/net_v10.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. 
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v2.h b/ext-net/example/nccl/net_v2.h index 0d9c906..dd9f39b 100644 --- a/ext-net/example/nccl/net_v2.h +++ b/ext-net/example/nccl/net_v2.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ -#ifndef NCCL_NET_V2_H_ -#define NCCL_NET_V2_H_ +#ifndef NET_V2_H_ +#define NET_V2_H_ typedef struct { // Name of the network (mainly for logs) diff --git a/ext-net/example/nccl/net_v3.h b/ext-net/example/nccl/net_v3.h index db1287b..9002165 100644 --- a/ext-net/example/nccl/net_v3.h +++ b/ext-net/example/nccl/net_v3.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V3_H_ -#define NCCL_NET_V3_H_ +#ifndef NET_V3_H_ +#define NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 diff --git a/ext-net/example/nccl/net_v4.h b/ext-net/example/nccl/net_v4.h index efe4824..41cef56 100644 --- a/ext-net/example/nccl/net_v4.h +++ b/ext-net/example/nccl/net_v4.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V4_H_ -#define NCCL_NET_V4_H_ +#ifndef NET_V4_H_ +#define NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 diff --git a/ext-net/example/nccl/net_v5.h b/ext-net/example/nccl/net_v5.h index b96b6fc..47f446c 100644 --- a/ext-net/example/nccl/net_v5.h +++ b/ext-net/example/nccl/net_v5.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V5_H_ -#define NCCL_NET_V5_H_ +#ifndef NET_V5_H_ +#define NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h index fffaf8c..de90f29 100644 --- a/ext-net/example/nccl/net_v6.h +++ b/ext-net/example/nccl/net_v6.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V6_H_ -#define NCCL_NET_V6_H_ - -#define NCCL_NET_MAX_REQUESTS_V6 8 +#ifndef NET_V6_H_ +#define NET_V6_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h index d607095..3802a3d 100644 --- a/ext-net/example/nccl/net_v7.h +++ b/ext-net/example/nccl/net_v7.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V7_H_ -#define NCCL_NET_V7_H_ - -#include "net_device.h" +#ifndef NET_V7_H_ +#define NET_V7_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 54a61f6..74eb72d 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V8_H_ -#define NCCL_NET_V8_H_ - -#include "net_device.h" +#ifndef NET_V8_H_ +#define NET_V8_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index 61035ec..ca60ad6 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -2,18 +2,14 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V9_H_ -#define NCCL_NET_V9_H_ - -#include "net_device.h" +#ifndef NET_V9_H_ +#define NET_V9_H_ #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 typedef struct { int ndevs; int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; } ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; typedef struct { char* name; // Used mostly for logging. 
@@ -35,8 +31,6 @@ typedef struct { size_t maxCollBytes; // Max transfer size for collective operations } ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; @@ -93,7 +87,7 @@ typedef struct { // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller // what index this new vNIC exists at - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); } ncclNet_v9_t; #endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 2852242..97a2987 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { #define PLUGIN_NAME "Plugin" -ncclNet_v9_t ncclNetPlugin_v9 = { +const ncclNet_v10_t 
ncclNetPlugin_v10 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = { .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { + return pluginInit(logFunction, NULL); +} + +__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { + return pluginGetProperties(dev, (ncclNetProperties_t*)props); +} + +__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ + return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); +} + +__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request); +} + +__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); +} + +__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; } + +const ncclNet_v9_t ncclNetPlugin_v9 = { + .name = PLUGIN_NAME, + .init = pluginInit_v9, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v9, + .listen = pluginListen, + .connect = pluginConnect_v9, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v9, + .irecv = pluginIrecv_v9, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v9, +}; + __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); @@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr } __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { - return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); + return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request); } __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; for (int i=0; i static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { diff --git a/makefiles/common.mk b/makefiles/common.mk index 1b1bb86..545203a 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -16,6 +16,7 @@ WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 +NET_PROFILER ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -137,3 +138,7 @@ endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif + +ifneq ($(NET_PROFILER), 0) +CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index b02cf90..df3ee5c 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 25 -NCCL_PATCH := 1 +NCCL_MINOR := 26 +NCCL_PATCH := 2 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b66ebef..65da630 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,11 +10,15 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ 
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ $(wildcard register/*.cc) \ + $(wildcard plugin/*.cc) \ + $(wildcard plugin/net/*.cc) \ + $(wildcard plugin/tuner/*.cc) \ + $(wildcard plugin/profiler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -49,6 +53,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl +INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest @@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 675bcfc..9e24faa 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -153,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -167,8 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - size_t size64 = size; - NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -484,7 +484,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { if (devOOB < 0) { pthread_mutex_lock(&bootstrapNetLock); if (devOOB < 0) { - char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME"); + const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv); bool searchNot = userIfEnv && userIfEnv[0] == '^'; @@ -540,7 +540,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -736,6 +736,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { rasRanks[rank].pid = getpid(); rasRanks[rank].cudaDev = comm->cudaDev; rasRanks[rank].nvmlDev = comm->nvmlDev; + rasRanks[rank].hostHash = getHostHash(); + 
rasRanks[rank].pidHash = getPidHash(); if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); // We should still participate in the ringAllInfo below as the peers will be waiting for us. @@ -967,7 +969,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s NCCLCHECK(socketAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size); NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail); - NCCLCHECK(ncclSocketClose(&sock)); + NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail); return ret; fail: (void)ncclSocketClose(&sock); @@ -1062,7 +1064,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ - int data[1]; + int data[1] = {0}; for (int mask = 1; mask < nranks; mask <<= 1) { int src = (rank - mask + nranks) % nranks; int dst = (rank + mask) % nranks; diff --git a/src/channel.cc b/src/channel.cc index b3a8f29..bc48986 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; - - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + cudaStream_t deviceStream; + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e. 
network) @@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { - NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ - NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream)); ncclCommPushCudaFree(comm, channel->devPeers); NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers)); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr; } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); /* guarantee addr has been copied into channel->devPeers */ + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); - return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; + cudaStream_t deviceStream; if (channel->nvlsPeers != NULL) return ncclSuccess; @@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); int nvlsRanks = comm->localRanks; @@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks)); - NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream)); for (int r 
= 0; r < nvlsRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } @@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; + cudaStream_t deviceStream; if (channel->collnetPeers != NULL) return ncclSuccess; @@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); - NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } diff --git a/src/debug.cc b/src/debug.cc index 2ea6eab..2eb8d77 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -6,6 +6,7 @@ #include "core.h" #include "nccl_net.h" +#include #include #include #include @@ -16,6 +17,11 @@ #include "param.h" int ncclDebugLevel = -1; +static uint32_t ncclDebugTimestampLevels = 
0; // bitmaps of levels that have timestamps turned on +static char ncclDebugTimestampFormat[256]; // with space for subseconds +static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts +static uint64_t ncclDebugTimestampMaxSubseconds; // Max number of subseconds plus 1, used in duration ratio +static int ncclDebugTimestampSubsecondDigits; // Number of digits to display static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; @@ -112,6 +118,84 @@ static void ncclDebugInit() { ncclWarnSetDebugInfo = value; } + // Determine which debug levels will have timestamps. + const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + if (timestamps == nullptr) { + ncclDebugTimestampLevels = (1< sizeof(ncclDebugTimestampFormat) - 1) { + // Won't fit; fall back on the default. + break; + } + ncclDebugTimestampSubsecondsStart = i; + ncclDebugTimestampMaxSubseconds = 1; + + memcpy(ncclDebugTimestampFormat, tsFormat, i); + for (int j=0; j>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", - hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); } + len = std::min(len, sizeof(buffer)-1); // prevent overflows + // Add the message as given by the call site. va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len >= sizeof(buffer)) len = sizeof(buffer)-1; - if (len) { - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + // Rewind len so that we can replace the final \0 by "\n" + len = std::min(len, sizeof(buffer)-1); // prevent overflows + + // Add a newline and write it to the debug file. No terminating null is + // necessary since we write bytes instead of the string. + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } NCCL_API(void, ncclResetDebugInit); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 5d79d73..854ebbf 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -67,7 +67,7 @@ namespace { offset = dataOffset + rankDest * count; // Final wait/copy. 
- prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else if (inputBuf != outputBuf + ringRanks[0] * count) { inputBuf = inputBuf + partOffset; @@ -111,25 +111,63 @@ struct RunWorkColl struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; ipatSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); - PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend); + if (tid == nworkers) { // Algo computation thread + PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatAg); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patCopy(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step 
+= nGroups; + } } +#endif } }; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 2161597..81da554 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -78,7 +78,7 @@ namespace { offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } @@ -132,7 +132,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -215,7 +215,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -710,7 +710,7 @@ struct RunWorkCollchannels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } diff --git a/src/device/primitives.h b/src/device/primitives.h index 73c10c2..3b9f169 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -12,7 +12,7 @@ #include "common_kernel.h" #include "common.h" -#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 +#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primtiives class instead of integral @@ -115,7 +115,7 @@ struct PrimitivesWithoutDirect { __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } - __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) { + __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { @@ -139,6 +139,18 @@ struct PrimitivesWithoutDirect { } }; +__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) { + if (abortCache & abortValue) return 1; + if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0; + spins = 0; + int abort = *ncclShmem.comm.abortFlag; + if (abort) { + ncclShmem.aborted = abort; + abortCache |= abortValue; + } + return abort; +} + #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 3e00f3b..2a0f556 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -51,23 +51,14 @@ class Primitives: } } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? 
stepLines*sizeof(union ncclLLFifoLine) : nbytes; @@ -102,7 +93,7 @@ class Primitives: int spins = 0; do { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; @@ -126,7 +117,7 @@ class Primitives: int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 617b7ac..6985e67 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -53,23 +53,14 @@ class Primitives: barrier_sync(15-group, nthreads); } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int i, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, wid, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes; @@ -201,7 +192,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, 0, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll @@ -248,7 +239,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, i, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 0051019..cf3ba9b 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -52,7 +52,7 @@ class Primitives< uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; - uint64_t accSize; // Accumulated size. 
Used by PAT operations + uint64_t accSize; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -70,6 +70,11 @@ class Primitives< } } + // PAT uses a single barrier across all groups + __device__ void patBarrier() { + barrier_sync(15, NCCL_PAT_NWORKERS); + } + __device__ bool barrierAny(int vote) { if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); @@ -87,18 +92,6 @@ class Primitives< } } - inline __device__ bool checkAbort(int &spins) { - spins++; - if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - flags |= Aborted; - ncclShmem.aborted = 1; - } - spins = 0; - } - return flags & Aborted; - } - inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { @@ -121,7 +114,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } @@ -338,13 +331,8 @@ public: peerPtr->recv[connIndex].step += steps; st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step); while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) { - if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - ncclShmem.aborted = 1; - break; - } - spins = 0; - } + int abort = 0; + if (checkAbort(abort, 1, spins)) break; } } @@ -359,7 +347,7 @@ public: int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; } void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts : ncclShmem.groups[group].srcs; @@ -601,13 +589,13 @@ private: tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { - // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); - int peer = -1; flags = 0; index = -1; if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers + // For send operations, we need an extra warp to overlap the threadfence and the copy + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); + int nrecv=0, nsend=0; // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] @@ -637,68 +625,84 @@ private: if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + + // Coverity thinks that index could be -1 here but that's not actually the case. + // coverity[negative_returns:FALSE] + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? 
p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + + // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); + + if (barrierAny(flags & NetDeviceUnpack)) { + flags |= AnyNetDeviceUnpack; + // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers + // have NetDeviceUnpack. + uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); + if (tid == 0) { + ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + } + } + + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can be dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n flags |= PatMode; - accSize = 0; + const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput }; + if (tid < 5) flags |= roles[tid]; + int nranks = ncclShmem.comm.nRanks; - int rank = ncclShmem.comm.rank; - // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer. - index = tid % 32; - uint32_t delta = 1 << index; - const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv}; - int block = tid / 32; - if (block < 4 && delta < nranks) { - int role = roles[block]; - if (mode == primsModePatRs) { - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks; - } else if (mode == primsModePatAg) { - if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks; - } - flags |= role; - } else if (tid == 128) { - flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation + if (tid < 32 && ((1UL<<tid) < nranks)) { + int rank = ncclShmem.comm.rank; + uint32_t delta = 1 << tid; + // Load recv peer + int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks; + struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid; + struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv; + peer->step = conn->step; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->tailPtr = conn->tail); + peer->headPtr = conn->head; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); + // Load send peer + int sendPeer = mode == primsModePatAg ?
(rank - delta + nranks) % nranks : (rank + delta) % nranks; + peer = ((struct ncclPatPeer*)sendPeers)+tid; + conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend; + peer->step = conn->step; + peer->connFifo = conn->connFifo; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->headPtr = conn->head); + peer->tailPtr = conn->tail; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); } - } - - // Coverity thinks that index could be -1 here but that's not actually the case. - // coverity[negative_returns:FALSE] - int sendIpcReg; - int recvIpcReg; - int sendNetReg; - int recvNetReg; - if (P2p) { - sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; - recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; - sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; - recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; - } else { - recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; - recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; - } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); - - if (barrierAny(flags & NetDeviceUnpack)) { - flags |= AnyNetDeviceUnpack; - // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers - // have NetDeviceUnpack. - uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); - if (tid == 0) { - ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + if (tid==0) { + ncclShmem.groups[group].userInput = (void*)inputBuf; + ncclShmem.groups[group].userOutput = (void*)outputBuf; + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } + patBarrier(); } - - // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case - // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); - // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { + if (flags&PatMode) return; // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) conn->step = step; if ((flags & NetRegMode) && (flags & RoleWaitSend)) { @@ -708,7 +712,7 @@ private: uint64_t prevStep = step - StepPerSlice; volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size); int spins = 0; - while (*ptr != -1) if (checkAbort(spins)) break; + while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break; } if (flags & NetDeviceUnpack) { @@ -726,7 +730,7 @@ private: int spins = 0; volatile uint64_t* tail = conn->tail; volatile uint64_t* head = conn->head; - while (*tail > *head) if (checkAbort(spins)) break; + while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break; } } @@ -749,7 +753,7 @@ private: if (slot) { T* exchgPtr; directBuff = (T*)outputBuf; - while (*slot != nullptr && !checkAbort(spins)); + while (*slot != nullptr && !checkAbort(flags, Aborted, spins)); if (P2p) { exchgPtr = (T*)outputBuf; } else { @@ -766,7 +770,7 @@ private: void* ptr; while 
(slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot) { @@ -785,7 +789,7 @@ private: // Wait for consumer to consume previous value before trampling it. if (slot && argSlot0 && argSlot1) { T* exchgPtr; - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins)); + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; @@ -815,7 +819,7 @@ private: void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot && argSlot0 && argSlot1) { @@ -826,7 +830,7 @@ private: while (true) { arg0 = *argSlot0; arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break; } ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } @@ -866,8 +870,8 @@ private: __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp); + __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); @@ -945,54 +949,65 @@ private: ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } - __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset; - int spins = 0; - while (connStepCache < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - if (postRecv) step += StepPerSlice; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { - int spins = 0; - while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset; - if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { - // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize; - if (flags & ConnFifoEnabled) - connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - } else { - // There is already data in there, accumulate instead of writing to it. - ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; - } - if (postSend) step += StepPerSlice; + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; } - if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer - ncclShmem.groups[group].dsts[0] = userOutput + outIx; - if (accSize < outIx + nelem) { + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; + int spins = 0; + while (peer->stepCache < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + } + if (send && (flags & RoleWaitSend)) { + int spins = 0; + while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = outIx + nelem; + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; } else { // There is already data in there, accumulate instead of writing to it. ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } } - barrier(); + long long int localAccSize = shmem->localAccSize; + if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer + ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx; + if (localAccSize < ps->outIx + nelem) { + // New data, add our own data to it. 
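/* [Editor's aside — illustrative sketch, not part of the patch] The accSize test
 * above is the heart of PAT's in-place accumulation: if the accumulated-size
 * watermark is below the slot's high-water mark (its offset plus nelem, plus
 * step*stepSize for FIFO slots), the slot holds fresh data and the user input is
 * fed in as the second reduce source; otherwise the slot already holds partial
 * sums and is re-read instead. A minimal standalone form, with hypothetical names: */
static inline const float* patSecondSrc(const float* userInput, const float* slot,
                                        long long accSize, long long watermark) {
  /* watermark = offset + nelem (+ step*stepSize) for the slot being written */
  return (accSize < watermark) ? userInput : slot;
}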
+ ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; + localAccSize = ps->outIx + nelem; + } else { + // There is already data in there, accumulate instead of writing to it. + ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; + } + } + patBarrier(); int nSrcs = 2; void** srcs = ncclShmem.groups[group].srcs; - if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source + if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1000,59 +1015,92 @@ private: (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } - __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset; - int spins = 0; - while (connStepCache < step + recvStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { - // New data, copy to our output buffer. 
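/* [Editor's aside — illustrative sketch, not part of the patch] In the new
 * patReduce/patCopy code of this hunk, each role tracks its progress in a local
 * localAccSize and publishes it between the two patBarrier() calls with
 * atomicMax; because the watermark only ever grows, max() is safe no matter
 * which worker group finishes its PAT step first. Sketch (64-bit atomicMax
 * requires compute capability 3.5+): */
__device__ void publishAccSize(long long* sharedAccSize, long long localAccSize) {
  // Largest watermark wins; concurrent publishers cannot move it backwards.
  atomicMax(sharedAccSize, localAccSize);
}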
- ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize; - } else { - ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done - } - if (postRecv) step += StepPerSlice; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { - int spins = 0; - while (connStepCache + NCCL_STEPS < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset; - if (postSend) { - if (flags & ConnFifoEnabled) - connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - step += StepPerSlice; - } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; } - if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer - ncclShmem.groups[group].srcs[0] = userInput + inpIx; - if (accSize < inpIx + nelem) { + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; + int spins = 0; + while (peer->stepCache < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, copy to our output buffer. - ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = inpIx + nelem; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; } else { ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done } } - barrier(); + if (send && (flags & RoleWaitSend)) { + int spins = 0; + while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + } + long long int localAccSize = shmem->localAccSize; + if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer + ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx; + if (localAccSize < ps->inpIx + nelem) { + // New data, copy to our output buffer. + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; + localAccSize = ps->inpIx + nelem; + } else { + // Already done + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; + } + } + patBarrier(); int nDsts = 2; void** dsts = ncclShmem.groups[group].dsts; - if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest + if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done. int workSize = ncclShmem.aborted ? 
0 : nelem; @@ -1061,9 +1109,32 @@ private: (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, 1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } }; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index 70538b1..5d8de28 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -80,29 +80,66 @@ struct RunWorkColl struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); - PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); - prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend); + if (tid == nworkers) { // Algo computation thread + PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patReduce(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step += nGroups; + } } +#endif } }; - template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index fe3b9ca..f36a511 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -41,7 +41,7 @@ struct RunWorkBatch (maxSharedMem-attr.sharedSizeBytes)) { - if (print++ == 0) - INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", - sharedMemSize, maxSharedMem-attr.sharedSizeBytes); - // Reduce requested MaxDynamicSharedMemorySize attribute - sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + return ncclSystemError; } CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), @@ -388,6 +385,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool struct ncclTaskColl* next = aggBeg->next; aggBeg->algorithm = agg.algorithm; aggBeg->protocol = agg.protocol; + if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4; aggBeg->nMaxChannels = agg.nMaxChannels; aggBeg->nWarps = agg.nWarps; aggBeg->devFuncId = agg.devFuncId; @@ -478,6 +476,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool return ncclSuccess; } +static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { + int tmp = op->pattern; + op->pattern = ncclPatternProfiler; + ncclResult_t ret = addProxyOpIfNeeded(comm, plan, op); + op->pattern = tmp; + return ret; +} + static ncclResult_t scheduleCollTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { @@ -550,11 +556,16 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp.opCount = proxyOpId; proxyOp.task.coll = task; proxyOp.rank = comm->rank; + proxyOp.eActivationMask = task->eActivationMask; + proxyOp.workCounter = ++comm->profiler.workCounter[c];
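/* [Editor's aside — illustrative sketch, not part of the patch] The "*= 4"
 * above implements the LL/Simple channel-balancing change from the release
 * notes: LL carries a 4-byte flag per 4 bytes of payload (2x the wire bytes)
 * and sustains roughly half the bus efficiency, so weighting LL work 4x when
 * balancing load across channels makes mixed LL/Simple batches finish at about
 * the same time. Hypothetical standalone form: */
static inline size_t schedulingWeight(size_t bytes, int isLL) {
  return isLL ? bytes * 4 : bytes;  /* LL counts 4x toward channel load */
}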
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4; size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); @@ -669,11 +680,14 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->ringAlgo->incRefCount(); } + proxyOp->eActivationMask = task->eActivationMask; + proxyOp->workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. // coverity[uninit_use_in_call:FALSE] NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp)); } } @@ -797,7 +811,8 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); for (int part = 0; part < nChannelsMax; part++) { @@ -888,6 +903,7 @@ static ncclResult_t addP2pToPlan( op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; + op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } @@ -898,7 +914,6 @@ static ncclResult_t addP2pToPlan( plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; @@ -935,9 +950,12 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } + comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1157,22 +1175,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; + cudaStream_t deviceStream; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); - // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the - // user's graph will be launched later, and it also acquires the deviceStream, - // it will observe this upload. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); + // Acquire deviceStream. 
Since the user's graph will be launched later and it also + // acquires the deviceStream, it will observe this upload. + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail); cudaEvent_t memcpyDone; CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail); NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; @@ -1180,7 +1199,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla cleanup->hostBuf = fifoBufHost; ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: @@ -1254,14 +1273,15 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { - comm->persistentRefs -= 1; + comm->sharedRes->persistentRefs -= 1; + comm->localPersistentRefs -= 1; if (plan->workStorageType == ncclDevWorkStorageTypePersistent) { cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -1317,6 +1337,28 @@ static void persistentDestructor(void* plans_) { } } +NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0); + +namespace { + enum ncclImplicitOrder { + ncclImplicitOrderNone, + ncclImplicitOrderSerial, + ncclImplicitOrderLaunch + }; +} + +static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { + if (ncclParamLaunchOrderImplicit()) { + // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs + if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } + if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + *mode = 12030 <= std::min(CUDART_VERSION, driver) ? 
ncclImplicitOrderLaunch : ncclImplicitOrderSerial; + return ncclSuccess; + } + *mode = ncclImplicitOrderNone; + return ncclSuccess; +} + ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1364,58 +1406,60 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (nPlans == 0) return ncclSuccess; - // Semantically we want these dependencies for the kernels launched: - // 1. Launch host task on hostStream. - // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} - // 3. {deviceStream, userStream[i]...} depend on kernel. - // We achieve this by: - // 1. userStream[0] waits on deviceStream - // 2. deviceStream waits on each of userStream[1...] - // 3. host task launch on hostStream - // 4. userStream[0] waits on hostStream - // 5. kernel launch on userStream[0] - // 6. deviceStream waits on userStream[0] - // 7. userStream[1...] each waits on deviceStream - // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires - // at least one of the two streams to be strong-stream. cudaStream_t launchStream = planner->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t deviceStream, launchOrder; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure); - // Create dependency for device stream on user streams. First from extra user - // streams to deviceStream. Then deviceStream to first user stream. + // userStream[0] waits on each userStream[i]... for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure); + CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); + // userStream[0] waits on deviceStream + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); + + if (implicitOrder != ncclImplicitOrderNone) { + // userStream[0] waits on per-device (context) launchOrder. Concurrent strong stream access is + // required if this is a graph capture, non-captured cannot be concurrent because that would violate + // deterministic program order of launches. + bool concurrent = capturing; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); + } + + if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. 
We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; + cudaStream_t hostStream; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure); } } if (persistent) { - comm->persistentRefs += nPlans; + comm->sharedRes->persistentRefs += nPlans; + comm->localPersistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - failure: return result; } @@ -1434,6 +1478,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; int nChannels = countOneBits(plan->channelMask); void* sym = plan->kernelFn; @@ -1447,18 +1492,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan CU_LAUNCH_PARAM_END }; - CUfunction fn; - CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); - - #if CUDART_VERSION >= 11080 int driverVersion; - NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); - if (driverVersion >= 11080) { + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return); + + CUfunction fn; + CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return); + + if (CUDART_VERSION >= 11080 && driverVersion >= 11080) { + #if CUDART_VERSION >= 11080 int compCap = comm->compCap; unsigned int clusterSize = (compCap >= 90) ? 
comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[3]; + CUlaunchAttribute launchAttrs[4] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1485,6 +1531,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif + #if CUDART_VERSION >= 12030 + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + if (implicitOrder == ncclImplicitOrderLaunch) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; + launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; + launchAttrs[attrs].value.launchCompletionEvent.flags = 0; + attrs++; + } + #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; launchConfig.gridDimZ = grid.z; @@ -1496,15 +1553,15 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); - CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); - return ncclSuccess; - } + CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif - // Standard kernel launch - CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); - //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); - return ncclSuccess; + } else { + // Standard kernel launch + CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return); + } + +do_return: + return ret; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { @@ -1524,34 +1581,39 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { - ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch - // Create dependency for deviceStream on launchStream. We know that deviceStream - // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), - // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); - resume1: - // Create dependency for other user streams (skip launch stream) on deviceStream. - // Again, the user streams haven't been touched since deviceStream waited on them - // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = planner->streams->next; - planner->streams = nullptr; // Reset comm->planner.streams to empty. 
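/* [Editor's aside — illustrative sketch, not part of the patch] The
 * CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT attribute added above is what
 * makes NCCL_LAUNCH_ORDER_IMPLICIT=1 cheap: the event fires once the kernel
 * has been *launched* (ordered on the device), not when it completes, so the
 * next communicator's kernel can be held back just long enough to avoid
 * scheduling deadlocks without serializing execution. Minimal sketch
 * (CUDA >= 12.3, hypothetical names): */
#include <cuda.h>
static CUresult launchOrdered(CUfunction fn, CUstream stream, CUevent launchEvent, void** extra) {
  CUlaunchAttribute attr = {};
  attr.id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT;
  attr.value.launchCompletionEvent.event = launchEvent; /* signals launch, not completion */
  attr.value.launchCompletionEvent.flags = 0;
  CUlaunchConfig cfg = {};
  cfg.gridDimX = cfg.gridDimY = cfg.gridDimZ = 1;
  cfg.blockDimX = cfg.blockDimY = cfg.blockDimZ = 1;
  cfg.hStream = stream;
  cfg.attrs = &attr;
  cfg.numAttrs = 1;
  /* A later cuStreamWaitEvent(otherStream, launchEvent, 0) then orders the
   * next communicator's launch after this one. */
  return cuLaunchKernelEx(&cfg, fn, /*kernelParams=*/NULL, extra);
}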
- while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); - resume2: - sl = sl->next; + cudaStream_t deviceStream, launchOrder; + CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + // deviceStream waits on userStream[0] + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); + CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // Each userStream[i] waits on userStream[0] + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); } - // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); - resume3:; + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECK(getImplicitOrder(&implicitOrder, capturing)); + if (implicitOrder != ncclImplicitOrderNone) { + // As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured. + bool concurrent = capturing; + // Incorporate launch event into per-device (context) launch order. + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); + // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + // Release launchOrder as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); + } + // Release deviceStream as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false)); } - return result; + return ncclSuccess; } /*****************************************************************************/ @@ -1655,11 +1717,11 @@ static ncclResult_t topoGetAlgoInfo( if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { char ncclAlgoEnvStr[1024] = ""; char ncclProtoEnvStr[1024] = ""; - char* algoEnv = getenv("NCCL_ALGO"); + const char* algoEnv = ncclGetEnv("NCCL_ALGO"); if (algoEnv) { snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - char* protoEnv = getenv("NCCL_PROTO"); + const char* protoEnv = ncclGetEnv("NCCL_PROTO"); if (protoEnv) { snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); } @@ -2007,7 +2069,7 @@ static ncclResult_t hostToDevRedOp( uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); bool datatype_signed = false; - + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2097,6 +2159,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { p2p->datatype = info->datatype; p2p->root = info->root; p2p->bytes = nBytes; + p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); ncclIntruQueueEnqueue( isSendNotRecv ? 
&planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); @@ -2105,6 +2168,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Mark channels that need pre-connect if (comm->rank != peer) { if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; int round = 0; while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank @@ -2115,12 +2179,17 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. + comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; comm->connectRecv[peer] |= (1UL<opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 64fc1c5..76b508c 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -390,7 +390,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { + if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 587a8b2..ace4476 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -376,9 +376,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; +const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { - *useGdr = 0; +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { + *gdrMode = ncclTopoGdrModeDisable; // Get GPU and NET int n, g; @@ -418,25 +421,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead - int proxyRank, g; + int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, 
gpu->gpu.rank, netId, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); - struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; - distance = proxyGpu->paths[NET][n].type; + gpu = system->nodes[GPU].nodes+g; + distance = gpu->paths[NET][n].type; } + + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + distance = PATH_C2C; + } + if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } - *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + // Force PCIe mapping if path goes through PCI on a C2C system + if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; + else *gdrMode = ncclTopoGdrModeDefault; + + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { int netNum = system->nodes[NET].count; - int useGdr = 0; + enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable; *avail = false; for (int n = 0; n < netNum; n++) { int64_t netId = system->nodes[NET].nodes[n].id; @@ -469,6 +484,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; + // On C2C platforms, data could go through a PCI switch while completions and + // flags would go through C2C. In that case, force a flush. + int c, n; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { + *flush = 1; + } return ncclSuccess; } @@ -538,7 +561,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (comm && ncclNetVersion(comm) == 4) { + if (comm && comm->ncclNetVer == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -561,9 +584,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; - int useGdr; + enum ncclTopoGdrMode useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); - if (useGdr == 0) continue; + if (useGdr == ncclTopoGdrModeDisable) continue; int found = 0; for (int r=0; rpaths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. - int gdr; + enum ncclTopoGdrMode gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU @@ -862,3 +885,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink *allNvLink = maxPath >= PATH_PIX ? 
0 : 1; return ncclSuccess; } + +// Check whether we are in a split NVLink situation, with two NVLink domains, not +// connected through NVLink (e.g. QPI). +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) { + ncclResult_t res = ncclSuccess; + int nvlDomains = 0; + int *nvlDomain = NULL, *nvlDomainCount = NULL; + // Compute NVLink domains + NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit); + for (int g=0; gnodes[GPU].count; g++) nvlDomain[g] = g; + for (int g=0; gnodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int domain = nvlDomain[g]; + for (int p=g+1; pnodes[GPU].count; p++) { + if (gpu->paths[GPU][p].type == PATH_NVL) { + nvlDomain[p] = domain; + } + } + } + // Compute number of GPUs per NVLink domain. + NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit); + for (int g=0; gnodes[GPU].count; g++) { + nvlDomainCount[nvlDomain[g]]++; + } + // Count the number of NVLink domains + for (int g=0; gnodes[GPU].count; g++) { + if (nvlDomainCount[g] > 1) nvlDomains++; + } + *splitNvLink = nvlDomains == 2 ? 1 : 0; + +exit: + if(nvlDomain) free(nvlDomain); + if(nvlDomainCount) free(nvlDomainCount); + return res; +} diff --git a/src/graph/search.cc b/src/graph/search.cc index 0185b3f..15a0124 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -446,12 +446,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. -ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { +ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; int localNetCount; - int* localNets; - NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS)); + int localNets[MAXCHANNELS]; // First add the preferred NICs for (int g=0; gnodes[GPU].count; g++) { @@ -460,8 +459,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; for (int c = 0; cgpu.rank, c, &netId, NULL), ret, fail); - NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; localNetCount++; } @@ -469,7 +468,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int i=0; iintra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; - int* nets = NULL; + int nets[NCCL_TOPO_MAX_NODES]; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { @@ -533,8 +527,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; int netCount; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; 
struct ncclTopoNode* net = system->nodes[NET].nodes+n; @@ -555,14 +548,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->bwInter /= 2; } - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } @@ -601,21 +594,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } -exit: - if (nets) free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; }
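The conversions above follow one pattern repeated across search.cc: a heap-allocated scratch array used to force NCCLCHECKGOTO plus an exit/fail cleanup ladder into every call site, while a fixed-size stack array (bounded by NCCL_TOPO_MAX_NODES) makes plain NCCLCHECK early returns safe again. A minimal before/after sketch, with a hypothetical doWork() standing in for the body:

  // Before: the scratch buffer owns a cleanup path.
  ncclResult_t selectNetsBefore(int count) {
    ncclResult_t ret = ncclSuccess;
    int* nets = NULL;
    NCCLCHECK(ncclCalloc(&nets, count));
    NCCLCHECKGOTO(doWork(nets, count), ret, fail); // must not leak nets
  exit:
    free(nets);
    return ret;
  fail:
    goto exit;
  }

  // After: bounded stack storage; early returns cannot leak.
  ncclResult_t selectNetsAfter(int count) {
    int nets[NCCL_TOPO_MAX_NODES];
    NCCLCHECK(doWork(nets, count));
    return ncclSuccess;
  }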
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - ncclResult_t ret = ncclSuccess; const int bw = graph->bwInter; - int* nets; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + int nets[NCCL_TOPO_MAX_NODES]; int netCount; int graphFound = 0; - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; i<netCount; i++) { if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; @@ -639,7 +626,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { int gpu; - NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail); + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -650,7 +637,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } if (!duplicate) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); graphFound = 1; } } @@ -659,14 +646,14 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels > 0) { // Try to replay the last channel int g; - NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail); - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } @@ -686,7 +673,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; i<system->nodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } @@ -700,11 +687,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } -exit: - free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } /* Search Patterns @@ -999,6 +982,15 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->minChannels = graph->maxChannels; } + int splitNvLink; + NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink)); + if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) { + // We have two sockets with NVLink and a slower link in between (typically QPI). + // Tree is likely going to work better but it needs at least 2 channels. + // Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels. + if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2; + }
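To make the new check concrete: the labeling pass in ncclTopoSplitNvLink propagates a domain id across direct NVLink edges, then counts islands holding more than one GPU. A standalone restatement over a plain adjacency matrix (illustrative data structures, not the topo graph's):

  // nvl[g*n+p] is true when GPUs g and p are directly NVLink-connected.
  static int countSplitNvLink(int n, const bool* nvl) {
    int domain[NCCL_TOPO_MAX_NODES];
    int count[NCCL_TOPO_MAX_NODES] = {0};
    for (int g = 0; g < n; g++) domain[g] = g;
    for (int g = 0; g < n; g++)
      for (int p = g + 1; p < n; p++)
        if (nvl[g * n + p]) domain[p] = domain[g];
    for (int g = 0; g < n; g++) count[domain[g]]++;
    int domains = 0;
    for (int g = 0; g < n; g++) if (count[g] > 1) domains++;
    return domains == 2 ? 1 : 0; // "split" = exactly two multi-GPU NVLink islands
  }

Two quad-GPU sockets, each NVLink-connected internally and joined only by QPI, yield two domains and therefore trip the two-channel minimum above.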
+ struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index ba82caf..9499f39 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -22,8 +22,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -45,7 +45,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) return ncclSuccess; } -static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { +static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) { *cpu = NULL; if (node->type == CPU) { *cpu = node; @@ -54,9 +54,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* for (int l=0; l<node->nlinks; l++) { // Go up the PCI tree to find the CPU. Follow only PCI switches. if (node->links[l].type == LINK_PCI + && node->links[l].remNode != from && (node->links[l].remNode->type == PCI || node->links[l].remNode->type == CPU)) { - NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node)); } if (*cpu != NULL) return ncclSuccess; } @@ -77,13 +78,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; + *bw = + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW : + BDW_QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { *bw = AMD_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_MODEL_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + cpu->cpu.model = + (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP : + (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP : + (familyId == 6 && modelId >= 0x55) ?
NCCL_TOPO_CPU_MODEL_INTEL_SKL : + NCCL_TOPO_CPU_MODEL_INTEL_BDW; } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG; + if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG; } } for (int s=0; s<xmlCpu->nSubs; s++) { @@ -565,7 +574,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId))); } else if (targetType == CPU) { // NVL connection to the local CPU - NCCLCHECK(findLocalCpu(gpu, &remote)); + NCCLCHECK(findLocalCpu(gpu, &remote, NULL)); } else { if (system->nodes[NVS].count == 0) { NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); @@ -642,10 +651,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys NCCLCHECK(xmlGetAttrInt(node, "bw", &bw)); double c2cBw = (bw*count)/1000.0; struct ncclTopoNode* cpu = NULL; - NCCLCHECK(findLocalCpu(gpu, &cpu)); + NCCLCHECK(findLocalCpu(gpu, &cpu, NULL)); if (cpu == NULL) return ncclSuccess; - NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw)); - NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw)); } else { if (strcmp(node->name, "cpu") == 0) { NCCLCHECK(ncclGetSystemId(system, node, &systemId)); @@ -961,26 +970,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (* // Trigger the merge, then get the new device's properties int vDevIndex = 0; ncclResult_t ret = makeVDevice(&vDevIndex, vProps); - if (ret == ncclInvalidUsage) { - WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC"); - NCCLCHECK(ret); + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", + vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); + return ret; } INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + ncclResult_t ret = ncclSuccess; INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + char* ncStr; + NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1)); + strcpy(ncStr, str); char* semi_token; - char* semi = strtok_r(str, ";", &semi_token); + char* semi = strtok_r(ncStr, ";", &semi_token); while (semi) { TRACE(NCCL_NET, "Fusing %s", semi); struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC]; int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC); if (nUserIfs == 0) { INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s.
Please provide a semicolon-delimited list of comma-delimited NIC groups.", - str, semi); + ncStr, semi); continue; } @@ -994,26 +1008,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, if (vProps.ndevs != nUserIfs) { WARN("TOPO/NET : Only matched %d devices, %d requested from %s", vProps.ndevs, nUserIfs, semi); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } struct ncclXmlNode* netNode; - NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice)); - - // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) - for (int i = 0; i < vProps.ndevs; i++) { - placedDevs[vProps.devs[i]] = 1; + ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + if (ret == ncclSuccess) { + // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) + for (int i = 0; i < vProps.ndevs; i++) { + placedDevs[vProps.devs[i]] = 1; + } + } else { + WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi); + ret = ncclInvalidUsage; + goto fail; } semi = strtok_r(NULL, ";", &semi_token);; } - return ncclSuccess; +exit: + free(ncStr); + return ret; +fail: + goto exit; }
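As the messages above spell out, NCCL_NET_FORCE_MERGE takes a semicolon-delimited list of comma-delimited NIC groups, e.g. NCCL_NET_FORCE_MERGE="mlx5_0,mlx5_1;mlx5_2,mlx5_3" to fuse two pairs of ports (device names are examples). A minimal sketch of the same two-level strtok_r parse, working on a mutable copy just as the new ncStr code does:

  #include <stdio.h>
  #include <string.h>

  int main(void) {
    char spec[] = "mlx5_0,mlx5_1;mlx5_2,mlx5_3"; // stand-in for the env value
    char *saveGroup, *saveDev;
    for (char* group = strtok_r(spec, ";", &saveGroup); group != NULL;
         group = strtok_r(NULL, ";", &saveGroup)) {
      printf("fused NIC:");
      for (char* dev = strtok_r(group, ",", &saveDev); dev != NULL;
           dev = strtok_r(NULL, ",", &saveDev))
        printf(" %s", dev);
      printf("\n");
    }
    return 0;
  }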
ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { @@ -1061,7 +1086,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe } struct ncclXmlNode* netNode; - NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out); + ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + + // Merging failed. + // Mark all as unplaced and increase their distance to disconnected (PATH_DIS) + // Set i to 0 to restart the automatic merging process and ensure all are placed + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search."); + placedDevs[i] = 0; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i); + for (int k = 1; k < vProps.ndevs; k++) { + int dev = vProps.devs[k]; + placedDevs[dev] = 0; + paths[i*nPhysDevs + dev] = PATH_DIS; + paths[dev*nPhysDevs + i] = PATH_DIS; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i); + } + i = 0; + } } } @@ -1125,16 +1167,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ // By default, don't merge any devices int mergeLevel; mergeLevel = PATH_PORT; - char* mergeLevelEnv; - mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge; - forceMerge = getenv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); + { // Avoids warnings related to jumping to "out" + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } } NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); diff --git a/src/graph/topo.h b/src/graph/topo.h index 2be029b..921a7f5 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,9 +18,11 @@ #define SM86_NVLINK_BW 12.0 #define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 -#define QPI_BW 6.0 #define AMD_BW 16.0 +#define BDW_QPI_BW 6.0 #define SKL_QPI_BW 10.0 +#define SRP_QPI_BW 22.0 +#define ERP_QPI_BW 40.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 @@ -44,12 +46,13 @@ extern const char* topoNodeTypeStr[]; #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB -#define LINK_PCI 3 -// Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PXN -// Skipping 6 for PATH_PHB -#define LINK_SYS 7 -#define LINK_NET 8 +#define LINK_C2C 3 +#define LINK_PCI 4 +// Skipping 5 for PATH_PXB +// Skipping 6 for PATH_PXN +// Skipping 7 for PATH_PHB +#define LINK_SYS 8 +#define LINK_NET 9 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -61,29 +64,32 @@ extern const char* topoLinkTypeStr[]; // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 +// Connection through C2C +#define PATH_C2C 3 + // Connection traversing at most a single PCIe bridge -#define PATH_PIX 3 +#define PATH_PIX 4 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) -#define PATH_PXB 4 +#define PATH_PXB 5 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
-#define PATH_PXN 5 +#define PATH_PXN 6 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 6 +#define PATH_PHB 7 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 7 +#define PATH_SYS 8 // Connection through the network -#define PATH_NET 8 +#define PATH_NET 9 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 9 +#define PATH_DIS 10 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -103,9 +109,6 @@ struct ncclTopoLinkList { int type; }; -#define NCCL_TOPO_CPU_INTEL_BDW 1 -#define NCCL_TOPO_CPU_INTEL_SKL 2 - #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff @@ -176,6 +179,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8da4aeb..68085b8 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -177,6 +177,7 @@ static const double perChMaxTreeBws[][3] = { NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); static int ncclPatEnable(struct ncclComm* comm) { int patEnable = ncclParamPatEnable(); + if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics if (patEnable != 2) return patEnable; if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload @@ -257,7 +258,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); - if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; + if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; if (a == NCCL_ALGO_PAT) busBw *= .75; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used diff --git a/src/group.cc b/src/group.cc index e387db7..c48c0de 100644 --- a/src/group.cc +++ b/src/group.cc @@ -193,7 +193,6 @@ fail: static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; - struct ncclComm* cliqueComm0 = head->intraComm0; struct ncclComm* cliqueHead = head; struct ncclComm* cliqueNextHead; bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup; @@ -209,7 +208,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); comm = comm->groupNext; - } while (comm != nullptr && comm->intraComm0 == cliqueComm0); + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); cliqueNextHead = comm; if (capturingYes && capturingNo) { @@ -424,38 +423,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf /* Connect channels at runtime if cumem is supported */ if (groupCommHeadMain != nullptr) { - struct ncclComm* comm = groupCommHeadMain; + struct ncclComm* cliqueHead = groupCommHeadMain; + struct ncclComm* comm = NULL; struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs; ncclIntruQueueConstruct(&asyncCollJobs); do { - bool needConnect = false; - bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; - memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + // We need to preconnect connections for collectives clique by clique to avoid + // race condition for split shared comms which can connect the same connections + // at the same time.
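A minimal sketch of the traversal this restructuring introduces (types and helpers are hypothetical; the real code queues ncclPreconnectJob entries and runs them via asyncJobLaunch). The inner loop walks one clique, i.e. consecutive comms sharing intraComm0, and each clique's jobs are drained before the next clique starts, so two cliques can no longer connect the same shared resources concurrently:

  struct Comm { struct Comm* groupNext; void* intraComm0; };

  void preconnectCliqueByClique(struct Comm* head) {
    struct Comm* cliqueHead = head;
    while (cliqueHead != NULL) {
      struct Comm* comm = cliqueHead;
      do {
        enqueuePreconnect(comm); // hypothetical: queue this comm's connect jobs
        comm = comm->groupNext;
      } while (comm != NULL && comm->intraComm0 == cliqueHead->intraComm0);
      launchAndDrain();          // hypothetical: run queued jobs to completion
      cliqueHead = comm;         // the next clique starts where this one ended
    }
  }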
+ comm = cliqueHead; + do { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); - if (comm->cuMemSupport && needConnect) { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclCollPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->base.abortFlagDev = comm->abortFlagDev; - job->comm = comm; - NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); - memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); - ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); + if (comm->cuMemSupport && needConnect) { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); + } + comm = comm->groupNext; + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); + // connect + NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); + while (!ncclIntruQueueEmpty(&asyncCollJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); + if (job->destructor) job->destructor((void*)job); } - comm = comm->groupNext; - } while (comm); - NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); - while (!ncclIntruQueueEmpty(&asyncCollJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); - if (job->destructor) job->destructor((void*)job); - } + cliqueHead = comm; + } while (cliqueHead != nullptr); // done with all buffer allocation, start registration and enqueue comm = groupCommHeadMain; diff --git a/src/include/bitops.h b/src/include/bitops.h index a650aa7..dcf0e2e 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -8,6 +8,7 @@ #define NCCL_BITOPS_H_ #include <stdint.h> +#include <string.h> #if !__NVCC__ #ifndef __host__ @@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) { return u32fpDecode(x, 3); } -inline __host__ __device__ uint64_t getHash(const char* string, int n) { - // Based on DJB2a, result = result * 33 ^ char - uint64_t result = 5381; - for (int c = 0; c < n; c++) { - result = ((result << 5) + result) ^ string[c]; +// The hash isn't just a function of the bytes but also where the bytes are split +// into different calls to eatHash(). +inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) { + char const* ptr = (char const*)bytes; + acc[0] ^= size; + while (size != 0) { + // Mix the accumulator bits. + acc[0] += acc[1]; + acc[1] ^= acc[0]; + acc[0] ^= acc[0] >> 31; + acc[0] *= 0x9de62bbc8cef3ce3; + acc[1] ^= acc[1] >> 32; + acc[1] *= 0x485cd6311b599e79; + // Read in a chunk of input. + size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t); + uint64_t x = 0; + memcpy(&x, ptr, chunkSize); + ptr += chunkSize; + size -= chunkSize; + // Add to accumulator. + acc[0] += x; } - return result; +} + +template<typename T> +inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { + eatHash(acc, (const void*)bytes, sizeof(T)); +} + +inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { + uint64_t h = acc[0]; + h ^= h >> 31; + h *= 0xbac3bd562846de6b; + h += acc[1]; + h ^= h >> 32; + h *= 0x995a187a14e7b445; + return h; +} + +inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { + uint64_t acc[2] = {1, 1}; + eatHash(acc, bytes, size); + return digestHash(acc); +} +template<typename T> +inline __host__ __device__ uint64_t getHash(const T* bytes) { + return getHash((const void*)bytes, sizeof(T)); } #endif
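A short usage note on the helpers above: the two-word accumulator lets a caller fold in an object piecewise and digest once at the end, and, per the comment, the chunking itself feeds the hash. Both properties are visible in a few lines (sketch; the {1, 1} seed matches what getHash uses):

  uint64_t accA[2] = {1, 1}, accB[2] = {1, 1};
  eatHash(accA, "abcd", 4); // four bytes in one call
  eatHash(accB, "ab", 2);   // same bytes, different split
  eatHash(accB, "cd", 2);
  // digestHash(accA) and digestHash(accB) will generally differ, by design.
  uint64_t h = getHash("abcd", 4); // one-shot form: seed, eat, digest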
diff --git a/src/include/collectives.h b/src/include/collectives.h index c82ebce..c68b041 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "device.h" + #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. // CHUNKSIZE must be a multiple of SLICESIZE @@ -382,6 +383,42 @@ public: ~RingBCAlgorithm() {} }; +#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#include <cuda/atomic> +#endif + +// Need a power of two to ensure it divides by parallelFactor (which is also a power of two) +#define NCCL_PAT_NWORKERS 512 + +static constexpr int PatUsed = 0x1, + PatSkipped = 0x2; + +struct ncclPatStep { + int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags; + size_t inpIx, outIx; +}; + +struct ncclPatPeer { + uint64_t step; + struct ncclConnInfo* conn; + struct ncclConnFifo* connFifo; + void* buff; + uint64_t *headPtr; + uint64_t *tailPtr; + uint64_t stepCache; + long long int accSize; + int connStepSize; +}; + +#define NCCL_SHMEM_PAT_STEPS 32 +struct ncclPatShmem { + struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS]; + int parallelFactor; + long long int localAccSize; + struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks + struct ncclPatPeer recvDims[32]; +}; + template<typename T> class PatRSAlgorithm{ size_t offset; @@ -394,18 +431,17 @@ class PatRSAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int sendSkipped; // number of skipped steps during aggregation - int recvSkipped; // number of skipped steps during aggregation - int phase2recv; // receive offset for phase 2 + int stepOffset; int aggDelta; int scale; int phase; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a<b) ? a : b; } [...] if (phase >= 2) lastA /= 2*scale; + if (phase == 4) lastA = 1; } __device__ __host__ void reset() { nelem = getNelem(); phase = 0; scale = 1; - phase2recv = 0; as = aggDelta - 1; resetA(); } @@ -465,8 +501,9 @@ class PatRSAlgorithm{ } public: - __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): + __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) { + parallelFactor = maxParallelFactor; aggDelta = nrPow2 = (1<<log2Up(nranks)); [...] while (d > 1 && aggFactor < nranks/2) { d /= 2; @@ -486,160
+524,151 @@ public: reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - last = 0; - nelemOut = nelem; - outIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->outIx = offset; + ps->stepOffset = stepOffset; int skip = 0; - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; int sendDataRank = (rank + s) % nranks; - inpIx = sendDataRank * count + offset; - recvDim = -1; - sendDim = 0; - outIx = 0; - recvOffset = -1; - sendOffset = ((a - sendSkipped)%postFreq) * nelem; - sendStepOffset = 0; - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postSend = 1; + ps->inpIx = sendDataRank * count + offset; + ps->recvDim = -1; + ps->sendDim = 0; + ps->outIx = 0; + ps->recvOffset = -1; + ps->sendOffset = (a%postFreq) * nelem; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postSend = 1; } else { - postSend = 0; + ps->postSend = 0; } - postRecv = 0; - if (skip) sendSkipped++; - if (++a == lastA) { - phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2 - resetA(); - } - if (skip == 0) return; + ps->postRecv = 0; } else if (phase == 1) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - sendOffset = ((a - sendSkipped)%postFreq)*nelem; - recvOffset = ((a - recvSkipped)%postFreq)*nelem; - postSend = 0; - if (recvDim == 0) { - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1; - sendStepOffset = 0; + ps->recvDim = firstBitSet(s, nrPow2); + ps->sendOffset = (a%postFreq)*nelem; + ps->recvOffset = (a%postFreq)*nelem; + ps->postSend = 0; + if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postRecv = 1; } else { - sendStepOffset = (a - sendSkipped)/postFreq; + ps->postRecv = 0; } - if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postRecv = 1; - } else { - postRecv = 0; - } - s -= (1<<recvDim); + s -= (1<<ps->recvDim); int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - if (sendDim == -1) { - sendOffset = -1; - sendStepOffset = 0; - } else if (as - (1<<recvDim) == 0) { - if (newPeer(a, aggFactor)) sendSkipped = a; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->sendDim == -1) { + ps->sendOffset = -1; + } else if (as - (1<<ps->recvDim) == 0) { + if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } int foffset = a - sendSkipped; - sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq; - sendOffset = (foffset%postFreq)*nelem; + ps->sendOffset = (foffset%postFreq)*nelem; } + int recvDim = ps->recvDim; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - as--; - phase = as % 2 == 1 ?
0 : 1; - resetA(); - } - if (skip == 0) return; + if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++; } else if (phase == 2) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1; - postRecv = 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - recvDim = 0; - postSend = a == lastA-1 ? 1 : 0; + ps->recvDim = 0; + ps->postSend = a == lastA-1 ? 1 : 0; s -= 1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; + ps->recvDim = -1; + ps->recvOffset = -1; skip = 0; } else if (!skip) { - int foffset = phase2recv; - phase2recv++; - postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - recvOffset = (foffset%postFreq) * nelem; + int foffset = a + aggFactor - aggFactor/scale; + ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; } int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - int foffset = a - sendSkipped; - postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - sendStepOffset = 0; - sendOffset = (foffset%postFreq) * nelem; - if (skip || sendDim == -1) sendSkipped++; - if (++a == lastA) { - phase = 3; - resetA(); - } - if (skip == 0) return; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + int foffset = a; + ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->sendOffset = (foffset%postFreq) * nelem; } else if (phase == 3) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta; - postRecv = a == lastA-1 ? 1 : 0; + ps->postRecv = a == lastA-1 ? 1 : 0; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - postSend = 0; - s -= (1<<recvDim); + ps->recvDim = firstBitSet(s, nrPow2); + ps->postSend = 0; + s -= (1<<ps->recvDim); + int foffset = a; + ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a; + if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } foffset = a - sendSkipped; - sendStepOffset = foffset / postFreq; // Accumulate on next steps - sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1; - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - scale *= 2; - phase = scale < aggFactor ? 2 : 4; + if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++; + ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1; + } else if (phase == 4) { + ps->recvDim = 0; + ps->sendDim = -1; + ps->inpIx = rank * count + offset; + ps->recvOffset = ((aggFactor-1)%postFreq) * nelem; + ps->sendOffset = -1; + ps->postRecv = 1; + ps->postSend = 0; + offset += chunkCount; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 1) as--; + if (p == 3) scale *= 2; + phase = + p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + p == 2 ? 3 : + p == 3 ? scale < aggFactor ? 2 : 4 : + 5; + if (p == 4) { + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; - } else if (phase == 4) { - recvDim = 0; - sendDim = -1; - inpIx = rank * count + offset; - recvOffset = (phase2recv%postFreq) * nelem; - sendStepOffset = 0; - sendOffset = -1; - postRecv = 1; - postSend = 0; - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; + } else if (phase == 4 && offset >= end) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } };
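The release store just above publishes a fully written ncclPatStep slot to the warps that execute it; a consumer pairs it with an acquire load on the same flags word. A hypothetical poll loop (not from the patch; it assumes the same cuda::atomic_ref scope as the producer side):

  #if __CUDA_ARCH__ >= 600
  // Spin until the computation warp publishes this slot, then tell the caller
  // whether the step carries real work or was only a placeholder (PatSkipped).
  __device__ int waitPatStep(struct ncclPatStep* ps) {
    cuda::atomic_ref<int, cuda::thread_scope_block> flags(ps->flags);
    int f;
    while (((f = flags.load(cuda::memory_order_acquire)) & PatUsed) == 0) { /* spin */ }
    return f & PatSkipped;
  }
  #endif

This handshake over the NCCL_SHMEM_PAT_STEPS ring of step descriptors is what lets several PAT steps be computed ahead of the warps executing them.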
@@ -655,14 +684,12 @@ class PatAGAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int aggDelta; - int scale; - int phase; // AS computation @@ -671,7 +698,7 @@ class PatAGAlgorithm{ int bitCount[32]; int bitZeroStep[32]; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a<b) ? a : b; } [...] while (d > 1 && aggFactor < nranks/2) { d /= 2; aggFactor *= 2; aggDelta /= 2; } - //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta); asDim = log2Up(aggDelta); reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - last = 0; - nelemOut = nelem; - inpIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->inpIx = offset; int skip = 0; - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0; int recvDataRank = (rank + s) % nranks; - outIx = recvDataRank * count + offset; - sendDim = -1; - recvDim = 0; - inpIx = 0; - sendOffset = -1; - recvOffset = (a % postFreq) * nelem; - recvStepOffset = 0; - postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postSend = 0; - a++; - if (nextSkip) { - as = nextAs(); - if (as == aggDelta/2) { - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; - } - phase = 1; - resetA(); - } - if (skip == 0) return; + ps->outIx = recvDataRank * count + offset; + ps->sendDim = -1; + ps->recvDim = 0; + ps->inpIx = 0; + ps->sendOffset = -1; + ps->recvOffset = (a % postFreq) * nelem; + ps->stepOffset = 0; + ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postSend = 0; } else if (phase == 1) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<<sendDim); + ps->sendDim = firstBitSet(s, nrPow2); + s -= (1<<ps->sendDim); int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - sendOffset = recvOffset = (a % postFreq) * nelem; - postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; - recvStepOffset = (sendDim == 0) ?
0 : a/postFreq; - if (recvDim == -1) { - recvOffset = -1; - postRecv = 0; - } else if (as - (1<<sendDim) == 0) { - int foffset = (a*aggDelta) >> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim >= nranks) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem; + ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq; + if (ps->recvDim == -1) { + ps->recvOffset = -1; + ps->postRecv = 0; + } else if (as - (1<<ps->sendDim) == 0) { + int foffset = (a*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<ps->recvDim) >= nranks) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq; } - if (s < nranks && sendDim == 0 && skip) { + if (s < nranks && ps->sendDim == 0 && skip) { // Don't forget to receive at least once even if we don't send afterwards - sendDim = -1; - sendOffset = -1; - postSend = 0; + ps->sendDim = -1; + ps->sendOffset = -1; + ps->postSend = 0; skip = 0; } - if (++a == lastA) { - if (as % 2 == 1) { - phase = 0; - } else { - as = nextAs(); - } - resetA(); - } - if (skip == 0) return; } else if (phase == 2) { int s = (2*a+1)*scale*aggDelta; - postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; - postRecv = 0; + ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<<sendDim); + ps->sendDim = firstBitSet(s, nrPow2); + s -= (1<<ps->sendDim); + ps->sendOffset = (a%postFreq) * nelem; + ps->stepOffset = a / postFreq; int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; if (ps->recvDim == -1) { - recvOffset = -1; + ps->recvOffset = -1; } else { - s -= (1<<recvDim); - int foffset = (a*2*scale*aggDelta) >> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - recvStepOffset = foffset / postFreq; + s -= (1<<ps->recvDim); + int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->stepOffset = foffset / postFreq; } - if (++a == lastA) { - scale /= 2; - phase = scale ? 2 : 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 2) scale /= 2; + phase = + p == 2 ? scale ? 2 : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + 1; + if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs(); + if (p == 0 && as == aggDelta/2) { + offset += chunkCount; + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; + } else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ?
PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; #endif diff --git a/src/include/comm.h b/src/include/comm.h index c3f4eb4..4095187 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -131,6 +131,9 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int persistentRefs; + cudaEvent_t launchEvent, scratchEvent; /* proxy related shared res */ struct ncclProxyState* proxyState; @@ -407,6 +410,7 @@ struct ncclComm { // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; + struct ncclCudaContext* context; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; @@ -419,6 +423,7 @@ struct ncclComm { int netPluginLoaded; ncclNet_t* ncclNet; + int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; @@ -426,6 +431,7 @@ struct ncclComm { uint64_t* connectSend; uint64_t* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported bool directMode; @@ -565,8 +571,7 @@ struct ncclComm { struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int localPersistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -603,6 +608,7 @@ struct ncclComm { // Profiler plugin void* profilerContext; uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; + struct ncclProfilerProxy profiler; // buffer registration cache struct ncclRegCache regCache; diff --git a/src/include/device.h b/src/include/device.h index 3f918ab..0763a57 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -133,6 +133,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; + int hasSeen; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -374,6 +375,7 @@ struct alignas(16) ncclDevChannel { struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed + uint64_t workCounter; }; struct ncclDevComm { @@ -396,6 +398,10 @@ struct ncclDevComm { // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; int* rankToLocalRank; + + // Profiler counters + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -468,7 +474,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; + return cudaArch >= 800 ? (cudaArch == 1200 ?
6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } diff --git a/src/include/graph.h b/src/include/graph.h index a22b62b..b779773 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -36,7 +36,13 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +enum ncclTopoGdrMode { + ncclTopoGdrModeDisable = 0, + ncclTopoGdrModeDefault = 1, + ncclTopoGdrModePci = 2, + ncclTopoGdrModeNum = 3 +}; +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); @@ -55,9 +61,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_VENDOR_MIXED 4 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2 +#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3 +#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4 +#define NCCL_TOPO_CPU_MODEL_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); diff --git a/src/include/group.h b/src/include/group.h index 91bc190..c06d1ef 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -112,6 +112,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; + + // didn't find its clique, we need to insert it with ascending order based on commHash + if (*pp == nullptr) { + pp = &ncclGroupCommHead; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + } comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h deleted file mode 100644 index f165aa1..0000000 --- a/src/include/nccl_net.h +++ /dev/null @@ -1,604 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include <stdint.h> - -#define NCCL_NET_HANDLE_MAXSIZE 128 -//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties -#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) -#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -// Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 - -typedef struct { - int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; -} ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int forceFlush; // Force a flush on receives - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - ncclNetVDeviceProps_v9_t vProps; - size_t maxP2pBytes; // Max transfer size for point-to-point operations - size_t maxCollBytes; // Max transfer size for collective operations -} ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL.
- // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclNet_v9_t; - -typedef ncclNet_v9_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 - -typedef struct { - void* mhandle; - void* address; - size_t size; -} ncclNetSGE_v9_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclCollNet_v9_t; - -typedef ncclCollNet_v9_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v8_t; - -typedef struct { - void* mhandle; - void* address; - uint32_t size; -} ncclNetSGE_v8_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. 
handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v8_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
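For illustration, a minimal sketch of how a caller might gate device offload on the properties above before using the device-handle variants of connect()/accept() below; 'net' and 'dev' are assumed to be set up elsewhere, and NCCL_NET_DEVICE_HOST comes from net_device.h.

ncclNetProperties_v7_t props;
if (net->getProperties(dev, &props) == ncclSuccess && props.netDeviceType != NCCL_NET_DEVICE_HOST) {
  // The plugin advertises device offload for this NIC: pass non-NULL
  // ncclNetDeviceHandle_v7_t** pointers to connect()/accept() to request it.
}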
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. 
rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
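A minimal sketch of the retry loop this nonblocking contract implies on the caller side, assuming 'net', 'dev', and 'handle' were set up elsewhere; production code would also bound the loop and handle errors.

void* sendComm = NULL;
ncclResult_t res;
do {
  // Returning ncclSuccess with sendComm == NULL means "not ready yet", not failure.
  res = net->connect(dev, handle, &sendComm);
} while (res == ncclSuccess && sendComm == NULL);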
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
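For illustration, a sketch of driving one asynchronous send to completion against this contract; 'net', 'sendComm', 'buf', 'nbytes', and 'mh' are assumed, and a real caller would interleave other work instead of spinning.

void* req = NULL;
while (req == NULL) {
  // isend() may "succeed" with req == NULL when it would block; retry.
  if (net->isend(sendComm, buf, nbytes, /*tag*/0, mh, &req) != ncclSuccess) break;
}
int done = 0, sent = 0;
while (req && !done) net->test(req, &done, &sent); // on completion, sent = bytes transferred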
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h deleted file mode 100644 index a8164d0..0000000 --- a/src/include/nccl_profiler.h +++ /dev/null @@ -1,235 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include - -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - const char* func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - const char* datatype; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - const char* algo; - const char* proto; - } coll; - - struct { - const char* name; - uint64_t commHash; - const char* func; - void* buff; - const char* datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v2_t; - -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - 
// - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; - -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - uint8_t func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - uint8_t datatype; - uint32_t op; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; - } coll; - - struct { - const char* name; - uint64_t commHash; - uint8_t func; - void* buff; - uint8_t datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v1_t; - -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - // - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v1_t; - -#endif diff --git a/src/include/nccl_tuner.h 
b/src/include/nccl_tuner.h deleted file mode 100644 index 6e61118..0000000 --- a/src/include/nccl_tuner.h +++ /dev/null @@ -1,149 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - regBuff: can register user buffer - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int regBuff, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; - -typedef ncclTuner_v4_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
- // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this time - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the give collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int* algorithm, int* protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. 
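To make the cost-table contract of the v3/v4 interfaces above concrete, here is a hypothetical getCollInfo() that biases large allreduces toward one algorithm/protocol pair; NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, and NCCL_NUM_PROTOCOLS are assumed to come from nccl_common.h.

static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
    int numPipeOps, float** collCostTable, int numAlgo, int numProto, int* nChannels) {
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  if (collType == ncclFuncAllReduce && nBytes > (1 << 20) &&
      table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; // cheapest entry wins selection
  }
  return ncclSuccess; // unset outputs (e.g. *nChannels) are filled in by NCCL
}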
- // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; - -#endif diff --git a/src/include/net.h b/src/include/net.h index d1926cc..afc2d16 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index 5fae9b5..c3a79e3 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 5d00f07..2c18b36 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,9 +31,10 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h new file mode 100644 index 0000000..d57aad5 --- /dev/null +++ b/src/include/plugin/nccl_net.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 + +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
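As a sketch of what the power-of-two cap above implies, a transport could split an arbitrarily large transfer into bounded posts; 'total', 'buf', and the per-post bookkeeping are hypothetical.

for (size_t off = 0; off < total; off += (size_t)MAX_NET_SIZE) {
  size_t chunk = total - off < (size_t)MAX_NET_SIZE ? total - off : (size_t)MAX_NET_SIZE;
  // post one isend()/irecv() of 'chunk' bytes at buf + off, tracking one request per post
}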
+#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried + +#define NCCL_PTR_HOST 0x1 +#define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 + +// Maximum number of requests per comm object +#define NCCL_NET_MAX_REQUESTS 32 + +// Max number of ncclNet objects which can live in the same process +#define NCCL_NET_MAX_PLUGINS 3 + +// NCCL core profiler callback for network defined events instrumentation +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); + +#include "net/net_v10.h" +#include "net/net_v9.h" +#include "net/net_v8.h" +#include "net/net_v7.h" +#include "net/net_v6.h" + +typedef ncclNet_v10_t ncclNet_t; +typedef ncclCollNet_v10_t ncclCollNet_t; +typedef ncclNetSGE_v10_t ncclNetSGE_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 + +#endif // end include guard diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h new file mode 100644 index 0000000..34cf9a9 --- /dev/null +++ b/src/include/plugin/nccl_profiler.h @@ -0,0 +1,69 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; + +#include +#include "profiler/profiler_v3.h" +#include "profiler/profiler_v2.h" +#include "profiler/profiler_v1.h" + +typedef ncclProfiler_v3_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define 
NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h new file mode 100644 index 0000000..f240189 --- /dev/null +++ b/src/include/plugin/nccl_tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +#include "tuner/tuner_v4.h" +#include "tuner/tuner_v3.h" +#include "tuner/tuner_v2.h" + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h new file mode 100644 index 0000000..ada6d48 --- /dev/null +++ b/src/include/plugin/net/net_v10.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
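For illustration, how an application-selected traffic class flows into the v10 connect() declared just below; 'net', 'dev', and 'handle' are assumed, and the value's meaning is plugin-specific.

ncclNetCommConfig_v10_t cfg = { .trafficClass = NCCL_NET_TRAFFIC_CLASS_UNDEF }; // leave QoS at the plugin default
void* sendComm = NULL;
ncclNetDeviceHandle_v10_t* sendDevComm = NULL;
// Called repeatedly until sendComm != NULL, per the nonblocking contract below:
net->connect(dev, &cfg, handle, &sendComm, &sendDevComm);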
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v10_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
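To illustrate the virtual-NIC API above: a hypothetical fusion of two physical ports into one vNIC, after which the returned index behaves like any other device; the port indices are illustrative.

ncclNetVDeviceProps_v10_t vprops = { .ndevs = 2, .devs = { 0, 1 } }; // hypothetical port indices
int vdev = -1;
if (net->makeVDevice(&vdev, &vprops) == ncclSuccess) {
  // vdev can now be passed to getProperties(), listen(), etc.
}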
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclCollNet_v10_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v6.h b/src/include/plugin/net/net_v6.h new file mode 100644 index 0000000..99445ce --- /dev/null +++ b/src/include/plugin/net/net_v6.h @@ -0,0 +1,113 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V6_H_ +#define NET_V6_H_ + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. +} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +#endif diff --git a/src/include/plugin/net/net_v7.h b/src/include/plugin/net/net_v7.h new file mode 100644 index 0000000..e9b19de --- /dev/null +++ b/src/include/plugin/net/net_v7.h @@ -0,0 +1,120 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V7_H_ +#define NET_V7_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +#endif diff --git a/src/include/plugin/net/net_v8.h b/src/include/plugin/net/net_v8.h new file mode 100644 index 0000000..a178132 --- /dev/null +++ b/src/include/plugin/net/net_v8.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V8_H_ +#define NET_V8_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v8_t; + +typedef struct { + void* mhandle; + void* address; + uint32_t size; +} ncclNetSGE_v8_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. 
+ // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v8_t; + +#endif diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h new file mode 100644 index 0000000..ce9d917 --- /dev/null +++ b/src/include/plugin/net/net_v9.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V9_H_ +#define NET_V9_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclNet_v9_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v9_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. 
If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclCollNet_v9_t; + +#endif // end include guard diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h new file mode 100644 index 0000000..7336c34 --- /dev/null +++ b/src/include/plugin/plugin.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PLUGIN_H_ +#define NCCL_PLUGIN_H_ + +#include "nccl.h" + +void* ncclOpenNetPluginLib(const char* name); +void* ncclOpenTunerPluginLib(const char* name); +void* ncclOpenProfilerPluginLib(const char* name); +void* ncclGetNetPluginLib(void); +ncclResult_t ncclClosePluginLib(void* handle); + +#endif diff --git a/src/include/plugin/profiler/net_ib.h b/src/include/plugin/profiler/net_ib.h new file mode 100644 index 0000000..2ac6d5c --- /dev/null +++ b/src/include/plugin/profiler/net_ib.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_H_ +#define NET_IB_H_ + +#include "nccl_profiler.h" +#include "net_ib_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_ib_v1.h b/src/include/plugin/profiler/net_ib_v1.h new file mode 100644 index 0000000..f142de5 --- /dev/null +++ b/src/include/plugin/profiler/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/net_socket.h b/src/include/plugin/profiler/net_socket.h new file mode 100644 index 0000000..9f57496 --- /dev/null +++ b/src/include/plugin/profiler/net_socket.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_H_ +#define NET_SOCKET_H_ + +#include "nccl_profiler.h" +#include "net_socket_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_socket_v1.h b/src/include/plugin/profiler/net_socket_v1.h new file mode 100644 index 0000000..0cb664f --- /dev/null +++ b/src/include/plugin/profiler/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v1.h b/src/include/plugin/profiler/profiler_v1.h new file mode 100644 index 0000000..3b67102 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v1.h @@ -0,0 +1,107 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
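// (Editor's aside - illustration only, not part of this patch.) The
// descriptor declared here is what NCCL passes to the plugin's startEvent();
// a minimal plugin dispatches on `type` and can chain events through
// `parentObj`. `struct myEvent` and its fields are hypothetical:
//
//   static ncclResult_t myStartEvent(void* ctx, void** eHandle,
//                                    ncclProfilerEventDescr_v1_t* eDescr) {
//     struct myEvent* e = (struct myEvent*)calloc(1, sizeof(struct myEvent));
//     e->type = eDescr->type;
//     e->parent = eDescr->parentObj;   // e.g. a proxy op's parent collective
//     if (eDescr->type == ncclProfileColl) e->seq = eDescr->coll.seqNumber;
//     *eHandle = e;
//     return ncclSuccess;
//   }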
+  void* parentObj;          // pointer to the profiler parent object (for coll is the group)
+  int rank;                 // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;             // pid of the originating process
+      uint8_t channelId;     // channel id for this proxy operation
+      int peer;              // remote rank for send/recv
+      int nSteps;            // number of steps for this proxy operation
+      int chunkSize;         // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
diff --git a/src/include/plugin/profiler/profiler_v2.h b/src/include/plugin/profiler/profiler_v2.h
new file mode 100644
index 0000000..146152a
--- /dev/null
+++ b/src/include/plugin/profiler/profiler_v2.h
@@ -0,0 +1,104 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+typedef struct {
+  uint8_t type;             // event type descriptor: ncclProfileColl, ...
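// (Editor's aside - illustration only, not part of this patch.) The
// ncclProfilerEventStateArgs_* unions above travel with recordEventState();
// for a proxy-op event, a plugin might accumulate progress like this
// (`struct myEvent` is hypothetical, and assumed to wrap a proxy op):
//
//   static ncclResult_t myRecordEventState(void* eHandle,
//       ncclProfilerEventState_v2_t eState,
//       ncclProfilerEventStateArgs_v2_t* eStateArgs) {
//     struct myEvent* e = (struct myEvent*)eHandle;
//     e->lastState = eState;
//     if (eStateArgs) {
//       e->steps     = eStateArgs->proxyOp.steps;
//       e->transSize = eStateArgs->proxyOp.transSize;  // cumulative bytes
//     }
//     return ncclSuccess;
//   }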
+  void* parentObj;          // pointer to the profiler parent object (for coll is the group)
+  int rank;                 // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;             // pid of the originating process
+      uint8_t channelId;     // channel id for this proxy operation
+      int peer;              // remote rank for send/recv
+      int nSteps;            // number of steps for this proxy operation
+      int chunkSize;         // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
diff --git a/src/include/plugin/profiler/profiler_v3.h b/src/include/plugin/profiler/profiler_v3.h
new file mode 100644
index 0000000..10c5059
--- /dev/null
+++ b/src/include/plugin/profiler/profiler_v3.h
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+typedef struct {
+  uint8_t type;             // event type descriptor: ncclProfileColl, ...
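// (Editor's aside - illustration only, not part of this patch.) v3 adds the
// kernelCh and netPlugin legs to the descriptor union declared here. For a
// netPlugin event, `netPlugin.id` identifies the source plugin and payload
// version, and `netPlugin.data` points at a versioned struct such as
// ncclProfilerNetIbDescr_v1_t from net_ib_v1.h. A profiler might handle it
// roughly as below; the isIbV1() id check and recordQp() helper are
// hypothetical:
//
//   case ncclProfileNetPlugin: {
//     if (isIbV1(eDescr->netPlugin.id)) {   // id encodes plugin type+version
//       ncclProfilerNetIbDescr_v1_t* d =
//           (ncclProfilerNetIbDescr_v1_t*)eDescr->netPlugin.data;
//       if (d->type == ncclProfileQp)
//         recordQp(d->qp.device, d->qp.qpNum, d->qp.length);
//     }
//   } break;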
+  void* parentObj;          // pointer to the profiler parent object (for coll is the group)
+  int rank;                 // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;             // pid of the originating process
+      uint8_t channelId;     // channel id for this proxy operation
+      int peer;              // remote rank for send/recv
+      int nSteps;            // number of steps for this proxy operation
+      int chunkSize;         // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v2.h b/src/include/plugin/tuner/tuner_v2.h
new file mode 100644
index 0000000..ec96f60
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v2.h
@@ -0,0 +1,53 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V2_H_
+#define TUNER_V2_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - collNetSupport: whether collnet supports this type
+  //   - nvlsSupport: whether NVLink SHARP supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set both the
+  // algorithm and the protocol; it must not set only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int* algorithm, int* protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v2_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v3.h b/src/include/plugin/tuner/tuner_v3.h
new file mode 100644
index 0000000..4fa10e8
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v3.h
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V3_H_
+#define TUNER_V3_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
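// (Editor's aside - illustration only, not part of this patch.) Since
// collCostTable is InOut, a v3 tuner steers NCCL by editing the table in
// place, e.g. making one applicable algo/proto entry free so it wins the
// cost comparison. The row/column indices below are illustrative:
//
//   static ncclResult_t myGetCollInfo(void* ctx, ncclFunc_t collType,
//       size_t nBytes, int numPipeOps, float** collCostTable,
//       int numAlgo, int numProto, int* nChannels) {
//     float (*table)[numProto] = (float (*)[numProto])collCostTable;
//     if (table[0][0] != -1.0f)   // -1.0 == NCCL_ALGO_PROTO_IGNORE, skip it
//       table[0][0] = 0.0f;       // zero cost: NCCL will prefer this entry
//     return ncclSuccess;         // leave nChannels unset for NCCL's default
//   }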
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set both the
+  // algorithm and the protocol; it must not set only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v4.h b/src/include/plugin/tuner/tuner_v4.h
new file mode 100644
index 0000000..a4b38a0
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v4.h
@@ -0,0 +1,56 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V4_H_
+#define TUNER_V4_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: whether the user buffer can be registered
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set both the
+  // algorithm and the protocol; it must not set only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2b7efe0..8d41079 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -17,6 +17,18 @@ struct ncclTaskP2p; struct ncclInfo; struct ncclComm; struct ncclProxyOp; +struct ncclProxyConnector; + +struct ncclProfilerProxy { + bool initialized; + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; + uint64_t workCounter[MAXCHANNELS]; // host work counter + struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; + struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; +}; + +extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); @@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); +// Kernel Channel Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); + // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); @@ -51,5 +67,9 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); + +// Profiler callback for network plugin +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c97a4d7..225acb2 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,7 +32,8 @@ typedef enum : uint8_t { ncclPatternPatUp, ncclPatternPatDown, ncclPatternSend, - ncclPatternRecv + ncclPatternRecv, + ncclPatternProfiler, } ncclPattern_t; enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; @@ -93,6 +94,7 @@ struct ncclProxyOp { int peer; pid_t pid; void* profilerContext; + uint64_t workCounter; struct ncclProxyOp *enqNext; }; @@ -129,12 +131,15 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; + uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; + void* kernelEventHandle; void* stepEventHandles[NCCL_STEPS]; size_t transSize; + uint64_t workCounter; void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; diff --git a/src/include/ras.h b/src/include/ras.h index 7909b3d..d27a543 100644 --- a/src/include/ras.h +++ b/src/include/ras.h @@ -15,6 +15,8 @@ struct rasRankInit { pid_t pid; int cudaDev; int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; }; ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); diff --git a/src/include/register.h b/src/include/register.h index 740a645..143f41b 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -42,7 +42,7 @@ struct ncclReg { uintptr_t baseAddr; size_t baseSize; CUdeviceptr regAddr; - size_t regSize; + size_t regUCSize, regMCSize; int dev; 
   CUmemGenericAllocationHandle mcHandle;
   uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
diff --git a/src/include/shm.h b/src/include/shm.h
index b519e5d..223d873 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -14,7 +14,6 @@ struct shmCuIpc {
     CUmemFabricHandle handle;
     CUmemGenericAllocationHandle data;
   };
-  int tpProxyRank;
   void *ptr;
   size_t size;
 };
@@ -30,8 +29,8 @@ struct shmIpcDesc {
 
 typedef struct shmIpcDesc ncclShmIpcDesc_t;
 
-ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
-ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
+ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
 ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
 
 #endif
diff --git a/src/include/socket.h b/src/include/socket.h
index f0a3237..ffa1480 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
 ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
-ncclResult_t ncclSocketClose(struct ncclSocket* sock);
+ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
 #endif
diff --git a/src/include/strongstream.h b/src/include/strongstream.h
index 0984dfe..c56d5ac 100644
--- a/src/include/strongstream.h
+++ b/src/include/strongstream.h
@@ -10,13 +10,24 @@
 #include "nccl.h"
 #include "checks.h"
 
+#include <cuda.h>
+#include <pthread.h>
 #include <cuda_runtime.h>
 
+// ncclCudaContext: wraps a CUDA context with per-context state.
+struct ncclCudaContext;
+
+// Get a ncclCudaContext to track the currently active CUDA context.
+ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out);
+// Drop reference.
+void ncclCudaContextDrop(struct ncclCudaContext* cxt);
+
 /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
  * easily.
  */
 struct ncclCudaGraph {
 #if CUDART_VERSION >= 11030
+  cudaStream_t origin;
   cudaGraph_t graph;
   unsigned long long graphId;
 #endif
@@ -25,6 +36,7 @@ struct ncclCudaGraph {
 inline struct ncclCudaGraph ncclCudaGraphNone() {
   struct ncclCudaGraph tmp;
 #if CUDART_VERSION >= 11030
+  tmp.origin = nullptr;
   tmp.graph = nullptr;
   tmp.graphId = ULLONG_MAX;
 #endif
@@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() {
 
 inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
 #if CUDART_VERSION >= 11030
-  return graph.graph != nullptr;
+  return graph.graphId != ULLONG_MAX;
 #else
   return false;
 #endif
@@ -57,60 +69,37 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
  * streams unfit for the use of serializing access to a persistent resource.
  * Strong streams have been introduced to address this need.
  *
- * - All updates to a strong stream must be enclosed by a Acquire/Release pair.
+ * All updates to a strong stream must be enclosed by an Acquire/Release pair.
  *
- * - The Acquire, Release, and all updates take a ncclCudaGraph parameter
- *   indicating the currently capturing graph (or none). This parameter must be
- *   the same for the entire sequence of {Acquire; ...; Release}.
+ * Acquire retrieves a "work" stream (cudaStream_t) which may be used to add
+ * work.
  *
- * - An {Acquire; ...; Release} sequence must not be concurrent with any
- *   other operations against the strong stream including graph launches which
- *   reference this stream.
+ * Release publishes the work stream's work into the strong stream. The Release
+ * must be issued by the same thread that did the Acquire.
  */
 struct ncclStrongStream;
 
 ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
 ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
 
-// Acquire-fence the strong stream.
+// Acquire the strong stream. Upon return `*workStream` will be usable to add work.
+// `concurrent` indicates if other threads may be using the strong stream.
 ncclResult_t ncclStrongStreamAcquire(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );
 
-// Acquire-fence the strong stream assuming no graph is capturing. This permits
-// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA
-// calls. Strong stream still must be released via:
-//   ncclStrongStreamRelease(ncclCudaGraphNone(), ss);
-ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
-
-// Release-fence of the strong stream.
-ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
-
-// Add a host launch to the stream.
-ncclResult_t ncclStrongStreamLaunchHost(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  cudaHostFn_t fn, void* arg
-);
-// Add a kernel launch to the stream.
-ncclResult_t ncclStrongStreamLaunchKernel(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
+// Get the workStream for an already acquired strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamAcquiredWorkStream(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );
 
-// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
-// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
-// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
-// implementation to induce few graph dependencies.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
-);
-// `b` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
-);
-// `a` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
+// Release of the strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent); + +ncclResult_t ncclStreamWaitStream( + cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent ); // Synchronization does not need the strong stream to be acquired. @@ -118,23 +107,28 @@ ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); //////////////////////////////////////////////////////////////////////////////// -struct ncclStrongStreamGraph; // internal to ncclStrongStream +struct ncclStrongStreamCapture; // internal to ncclStrongStream struct ncclStrongStream { - // Used when not graph capturing. - cudaStream_t cudaStream; + // The stream to use for non-captured work. + cudaStream_t liveStream; + void* liveAcquiredBy; #if CUDART_VERSION >= 11030 + // This stream ever appeared in a graph capture. + bool everCaptured; + pthread_mutex_t lock; + struct ncclStrongStreamCapture* captureHead; // The event used to establish order between graphs and streams. During acquire // this event is waited on, during release it is recorded to. cudaEvent_t serialEvent; - // This stream ever appeared in a graph capture. - bool everCaptured; - // Tracks whether serialEvent needs to be recorded to upon Release(). - bool serialEventNeedsRecord; - struct ncclStrongStreamGraph* graphHead; -#else - cudaEvent_t scratchEvent; #endif }; +struct ncclCudaContext { + struct ncclCudaContext* next; + CUcontext hcontext; + int refCount; + struct ncclStrongStream launchOrder; +}; + #endif diff --git a/src/include/transport.h b/src/include/transport.h index 37187f6..c563fbb 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -18,6 +18,7 @@ #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 +#define TRANSPORT_PROFILER 4 #include "proxy.h" #include "comm.h" @@ -26,6 +27,7 @@ extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; +extern struct ncclTransport profilerTransport; extern struct ncclTransport* ncclTransports[]; // Forward declarations @@ -65,8 +67,10 @@ struct ncclNvlsSharedRes { CUmulticastObjectProp signalProp; CUmemAccessDesc accessDesc; int dev; - size_t buffSize; - size_t creditSize; + size_t creditUCSize; + size_t creditMCSize; + size_t buffUCSize; + size_t buffMCSize; CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer char* mcBuff; // Multicast NVLS buffer address @@ -123,7 +127,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize); ncclResult_t ncclNvlsFree(struct ncclComm* comm);
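
For orientation, a minimal usage sketch of the Acquire/Release contract declared in strongstream.h above; it assumes NCCL's internal "strongstream.h" and "checks.h" headers, a single thread, and no active graph capture. The helper name and buffer arguments are hypothetical, not part of the patch:

// Hypothetical helper: all updates to the strong stream happen between
// Acquire and Release, on the stream returned through workStream.
static ncclResult_t zeroOnStrongStream(struct ncclStrongStream* ss, void* devBuf, size_t bytes) {
  cudaStream_t workStream;
  // Not capturing: pass ncclCudaGraphNone(); concurrent=false since only one thread uses ss.
  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), ss, /*concurrent=*/false, &workStream));
  // Work enqueued here is serialized against prior strong-stream work.
  CUDACHECK(cudaMemsetAsync(devBuf, 0, bytes, workStream));
  // Release publishes the work stream's work; it must come from the acquiring thread.
  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), ss, /*concurrent=*/false));
  return ncclSuccess;
}

When a graph is being captured, the same pair instead hands back a per-graph capture stream, and the serialEvent record/wait logic in src/misc/strongstream.cc further down keeps captured and live work serialized.
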
enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/init.cc b/src/init.cc index 3e218ab..46b02e6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -51,17 +51,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); -static uint64_t hashUniqueId(ncclUniqueId const &id) { - char const *bytes = (char const*)&id; - uint64_t h = 0xdeadbeef; - for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -111,7 +100,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { memset(out, 0, sizeof(*out)); // copy to avoid alignment mismatch memcpy(out, &handle, sizeof(handle)); - TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES)); return ncclSuccess; } @@ -232,6 +221,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } @@ -268,6 +259,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); + + ncclCudaContextDrop(comm->context); + free(comm); return ncclSuccess; @@ -309,17 +303,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); - if (ret != ncclSuccess) { - /* if ret is not ncclInProgress, we just keep it. */ + if (ret == ncclInProgress) { WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); - if (ret == ncclInProgress) ret = ncclInvalidArgument; + ret = ncclInvalidArgument; goto exit; } - /* if there is linked group job, we should complete it. */ - if (comm->groupJob) { - NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); - comm->groupJob = NULL; - } + /* if ret is not ncclInProgress, we just keep it. */ } exit: @@ -357,6 +346,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // the device we're on (failure cause #1) , better know it early. 
CUDACHECK(cudaGetDevice(&comm->cudaDev)); + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; @@ -396,6 +387,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { @@ -437,13 +430,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; + cudaStream_t deviceStream; - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; @@ -494,10 +488,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoConsumedLeast = 0; tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + // Alloc profiler counters for the kernel + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail); + tmpCommAndChans.comm.workStarted = comm->profiler.workStarted; + tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted; + ncclCommPushCudaHostFree(comm, comm->profiler.workStarted); + ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted); + if (comm->collNetDenseToUserRank != nullptr) { - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail); } for (int 
c=0; c < MAXCHANNELS; c++) { @@ -510,14 +512,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; if (comm->channels[c].ring.userRanks != nullptr) { - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail); } } - NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail); exit: + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; @@ -1000,6 +1002,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic); } + comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; @@ -1376,12 +1379,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), - // add unique split counter and the color - ncclUniqueId tmpId; - memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); - comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); + // child hash obtained from (parent hash, split count, color) + uint64_t hacc[2] = {1, 1}; + eatHash(hacc, &job->parent->commHash); + eatHash(hacc, &job->splitCount); + eatHash(hacc, &job->color); + comm->commHash = digestHash(hacc); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1394,8 +1397,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // obtain a unique hash using the first commId - comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); - commIdHash = hashUniqueId(job->commId[0]); + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, 
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1610,6 +1612,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1618,6 +1621,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; + comm->config.trafficClass = internalConfigPtr->trafficClass; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); @@ -1642,6 +1646,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId const char* commIdEnv = NULL; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob* job = NULL; + bool launchedJob = false; // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); @@ -1695,12 +1700,13 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // start the bootstrap root before bootstrapping, use only the first handle NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail); } + launchedJob = true; NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: - if (job) ncclCommInitJobFree(job); + if (job && !launchedJob) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1896,7 +1902,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. - while (comm->persistentRefs != 0) { + while (comm->localPersistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) { @@ -1964,7 +1970,6 @@ exit: } return ret; fail: - free(job); if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -2215,6 +2220,11 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); + /* if there is linked group job, we should complete it. 
*/ + if (*asyncError == ncclSuccess && comm->groupJob) { + NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); + comm->groupJob = NULL; + } return ncclSuccess; } @@ -2265,16 +2275,13 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { #if CUDART_VERSION >= 12010 size_t memGran = 0; - size_t mcGran = 0; CUdevice currentDev; CUmemAllocationProp memprop = {}; - CUmulticastObjectProp mcprop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag; int dcnt; - int mcSupport = 0; if (ptr == NULL || size == 0) goto fallback; @@ -2284,6 +2291,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGet(&currentDev, cudaDev)); if (ncclCuMemEnable()) { + size_t handleSize = size; int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; @@ -2299,40 +2307,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CUDACHECK(cudaGetDeviceCount(&dcnt)); - - if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { - /* mc property */ - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); - } else { - ALIGN_SIZE(size, memGran); - } + ALIGN_SIZE(handleSize, memGran); if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } } else { /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ for (int i = 0; i < dcnt; ++i) { int p2p = 0; @@ -2340,7 +2333,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = i; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); } if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } diff --git
a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 23746b3..3e9dfcd 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, } control_un; struct cmsghdr *cmptr; - char dummy_buffer[1]; + char dummy_buffer[1] = {'\0'}; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to @@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); if (sendFd != -1) { + memset(&control_un, '\0', sizeof(control_un)); msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); diff --git a/src/misc/param.cc b/src/misc/param.cc index eb50cfe..d7c324f 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) { size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { + if (line[0] == '#') continue; if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index dfb4e68..731dbce 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -171,6 +171,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memset(addrs+found, '\0', sizeof(*addrs)); memcpy(addrs+found, interface->ifa_addr, salen); found++; } @@ -905,9 +906,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { if (sock != NULL) { if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { + if (wait) { + char data; + int closed = 0; + do { + int offset = 0; + if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break; + } while (closed == 0); + } /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 61b0e4b..e6cce98 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,28 +9,61 @@ #include "checks.h" #include "param.h" -// Tracks the chain of graph nodes for a given graph captured identified by -// its graph id. This state has to live for as long as captured work is being -// submitted. CUDA doesn't have mechanism to inform us when the user ends capture -// so the best we can do is get notified when the graph is destroyed. -struct ncclStrongStreamGraph { - struct ncclStrongStreamGraph* next; - // Atomically exchanged to false by both the main thread or the graph destructor - // callback. The last to arrive deletes the node. - bool alive; +// Tracks the work captured by a given graph capture, identified by its graph id. +struct ncclStrongStreamCapture { + struct ncclStrongStreamCapture* next; + cudaGraph_t graph; unsigned long long graphId; - // For each graph we track the "tip" of the chain of graph nodes.
A linear - // chain would always have just one node at its tip, but since we have to merge - // in chains from other streams (via ncclStrongStreamWaitStream) some spots - // in the chain can be wider than a single node and thus need a list, so we - // maintain a dynamically sized array of tip nodes. - int tipCount, tipCapacity; - cudaGraphNode_t* tipNodes; + cudaStream_t captureStream; + cudaGraphNode_t lastRecord; + void* acquiredBy; }; -static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { - free(g->tipNodes); - free(g); +//////////////////////////////////////////////////////////////////////////////// + +static ncclCudaContext* cxtListHead = nullptr; +static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { + ncclResult_t result = ncclSuccess; + CUcontext hcontext; + CUCHECK(cuCtxGetCurrent(&hcontext)); + + pthread_mutex_lock(&cxtListLock); + struct ncclCudaContext* p = cxtListHead; + while (1) { + if (p == nullptr) { + p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext)); + p->refCount = 1; + p->hcontext = hcontext; + p->next = cxtListHead; + cxtListHead = p; + NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave); + break; + } + if (p->hcontext == hcontext) { + p->refCount += 1; + break; + } + p = p->next; + } +leave: + pthread_mutex_unlock(&cxtListLock); + *out = p; + return ncclSuccess; +} + +void ncclCudaContextDrop(struct ncclCudaContext* cxt) { + pthread_mutex_lock(&cxtListLock); + if (0 == --cxt->refCount) { + struct ncclCudaContext** pp = &cxtListHead; + while (*pp != cxt) pp = &(*pp)->next; + *pp = cxt->next; // remove from list + // Destroy resources held in cxt + ncclStrongStreamDestruct(&cxt->launchOrder); + free(cxt); + } + pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// @@ -43,9 +76,9 @@ ncclResult_t ncclCudaGetCapturingGraph( NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, nullptr)); #if CUDART_VERSION >= 11030 + graph->origin = nullptr; graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif @@ -56,13 +89,14 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { + graph->origin = nullptr; graph->graph = nullptr; - gid = ULLONG_MAX; + graph->graphId = ULLONG_MAX; + } else { + graph->origin = stream; } - graph->graphId = gid; #endif } #endif @@ -86,315 +120,218 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; - ss->serialEventNeedsRecord = false; - 
ss->graphHead = nullptr; - #else - CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); + ss->captureHead = nullptr; + pthread_mutex_init(&ss->lock, nullptr); + CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } -static void graphDestructor(void* arg) { - struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } -} - ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamDestroy(ss->cudaStream)); + CUDACHECK(cudaStreamDestroy(ss->liveStream)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventDestroy(ss->serialEvent)); - // Delete list of per-graph chains. - struct ncclStrongStreamGraph* g = ss->graphHead; - while (g != nullptr) { - struct ncclStrongStreamGraph* next = g->next; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } - g = next; + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap) { + struct ncclStrongStreamCapture* next = cap->next; + CUDACHECK(cudaStreamDestroy(cap->captureStream)); + free(cap); + cap = next; } - #else - CUDACHECK(cudaEventDestroy(ss->scratchEvent)); + CUDACHECK(cudaEventDestroy(ss->serialEvent)); + pthread_mutex_destroy(&ss->lock); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) +NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1); +constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device."; -static void ensureTips(struct ncclStrongStreamGraph* g, int n) { - if (g->tipCapacity < n) { - g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); - g->tipCapacity = n; - } -} +static __thread char threadIdMarker; +static void* localThreadId() { return &threadIdMarker; } ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (graph.graph == nullptr) { - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + ss->liveAcquiredBy = localThreadId(); + if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); } } else { - ss->everCaptured = true; - // Find the current graph in our list of graphs if it exists. - struct ncclStrongStreamGraph** pg = &ss->graphHead; - struct ncclStrongStreamGraph* g; - while (*pg != nullptr) { - g = *pg; - if (g->graphId == graph.graphId) { - // Move to front of list so that operations after acquire don't have to search the list. - *pg = g->next; - g->next = ss->graphHead; - ss->graphHead = g; + bool firstCapture = !ss->everCaptured; + __atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED); + + ncclResult_t ret = ncclSuccess; + if (concurrent) pthread_mutex_lock(&ss->lock); + + // Look for capture in our list of active captures. 
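+ // (The list is keyed by graph id: a matching entry is reused as-is, entries
+ // whose capture has ended are pruned during the walk, and at most one pruned
+ // entry is kept as a spare for the new capture below.)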
+ struct ncclStrongStreamCapture** pcap = &ss->captureHead; + struct ncclStrongStreamCapture* cap; + struct ncclStrongStreamCapture* spare = nullptr; + while (*pcap != nullptr) { + cap = *pcap; + if (cap->graphId == graph.graphId) { // Capture node already exists. + *workStream = cap->captureStream; + cap->acquiredBy = localThreadId(); + if (concurrent) pthread_mutex_unlock(&ss->lock); return ncclSuccess; - } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { - // Unrelated graph that has been destroyed. Remove and delete. - *pg = g->next; - ncclStrongStreamGraphDelete(g); } else { - pg = &g->next; + cudaStreamCaptureStatus status; + CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock); + if (status == cudaStreamCaptureStatusActive) { + pcap = &cap->next; // Active capture doesn't match, on to next. + } else { // Capture no longer active + *pcap = cap->next; // Remove from current list + if (spare == nullptr) { // Keep one spare to reuse below. + spare = cap; + } else { + cudaStreamDestroy(cap->captureStream); + free(cap); + } + } } } - - // This is a new graph so add to the list. - g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); - g->graphId = graph.graphId; - g->tipNodes = nullptr; - g->tipCapacity = 0; - g->tipCount = 0; - g->next = ss->graphHead; - ss->graphHead = g; - g->alive = true; - NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); - - if (mixing && ss->serialEventNeedsRecord) { - // Can only be here if previous release was for uncaptured work that - // elided updating the event because no capture had yet occurred. - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); + // No matching capture, need a new entry. + cap = spare; + if (cap == nullptr) { + cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } - ss->serialEventNeedsRecord = false; + cap->graphId = graph.graphId; + cap->lastRecord = nullptr; + cap->acquiredBy = localThreadId(); + // Push to capturing list. + cap->next = ss->captureHead; + ss->captureHead = cap; - // First node in the chain must be a wait on the serialEvent. + do_unlock: + if (concurrent) pthread_mutex_unlock(&ss->lock); + if (ret != ncclSuccess) return ret; + + *workStream = cap->captureStream; + + // Bring captureStream into the graph but without any dependencies. 
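+ // (Recording a scratch event on the capturing origin stream and waiting on it
+ // from captureStream joins captureStream into the same capture; resetting the
+ // capture dependency set to empty then drops that artificial edge, so captured
+ // work does not order after the origin stream's prior work.)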
+ cudaEvent_t scratch; + CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming)); + CUDACHECK(cudaEventRecord(scratch, graph.origin)); + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); + CUDACHECK(cudaEventDestroy(scratch)); + CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); + + if (mixing && firstCapture) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } if (mixing) { - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); - g->tipCount = 1; - } else { - g->tipCount = 0; + // First dependency is to wait on serialEvent + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, cudaEventWaitExternal)); } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream + ) { #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + } else { + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + *workStream = cap->captureStream; + if (concurrent) pthread_mutex_unlock(&ss->lock); } - ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. + #else + *workStream = ss->liveStream; + #endif return ncclSuccess; } -static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { - if (g == nullptr || g->graphId != id) { - WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); - return ncclInternalError; - } - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { +ncclResult_t ncclStrongStreamRelease( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent + ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->serialEventNeedsRecord) { - if (graph.graph == nullptr) { - if (ss->everCaptured) { - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); - ss->serialEventNeedsRecord = false; + if (mixing) { + if (graph.graphId == ULLONG_MAX) { + if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } } else { - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); - g->tipCount = 1; - ss->serialEventNeedsRecord = false; + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + if (concurrent) pthread_mutex_unlock(&ss->lock); + + // Add event record node with dependencies added further down.
+ cudaGraphNode_t recordNode; + CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); + + // Make this record order after previous record on this stream. + if (cap->lastRecord != nullptr) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + } + cap->lastRecord = recordNode; + + // Get current nodes from work stream so we can add them as dependencies. + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. + cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count)); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1)); + } + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + } + } + + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); - } else { - cudaHostNodeParams p; - p.fn = fn; - p.userData = arg; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - } else { - cudaKernelNodeParams p; - p.func = fn; - p.gridDim = grid; - p.blockDim = block; - p.kernelParams = args; - p.sharedMemBytes = sharedMemBytes; - p.extra = nullptr; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - #endif - return ncclSuccess; -} - -// Merge node list `b` into list `a` but don't add duplicates. 
-static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { - int an = a->tipCount; - ensureTips(a, an + bn); - for (int bi=0; bi < bn; bi++) { - for (int ai=0; ai < an; ai++) { - if (a->tipNodes[ai] == bNodes[bi]) goto next_b; - } - a->tipNodes[a->tipCount++] = bNodes[bi]; - next_b:; - } -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bg->tipNodes, bg->tipCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - // It is ok to use a->serialEvent to record b since we'll be setting - // a->serialEventNeedsRecord so the event won't be considered accurate - // until re-recorded. - CUDACHECK(cudaEventRecord(a->serialEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); - } else { - cudaStreamCaptureStatus status; - unsigned long long bGraphId; - cudaGraphNode_t const* bNodes; - size_t bCount = 0; - CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); - if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { - WARN("Stream is not being captured by the expected graph."); - return ncclInvalidUsage; - } - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bNodes, bCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(a->scratchEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, - b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies - )); - } - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); - #endif +ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent) { + CUDACHECK(cudaEventRecord(scratchEvent, b)); + CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0)); return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); #endif - CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); + CUDACHECK(cudaStreamSynchronize(ss->liveStream)); return ncclSuccess; } diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc deleted file mode 100644 index 267e12a..0000000 --- a/src/misc/tuner.cc +++ /dev/null @@ -1,267 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include -#include - -#include "checks.h" -#include "debug.h" -#include "tuner.h" - -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int tunerPluginRefCount; -static void* tunerPluginLib = nullptr; -static ncclTuner_v4_t* tunerSymbol = nullptr; -static ncclTuner_v3_t* ncclTuner_v3 = nullptr; -static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v4_t ncclTuner_v2_as_v4; -static ncclTuner_v4_t ncclTuner_v3_as_v4; - -static int hasNvlsSupport(float** collCostTable) { - // Requirements for support of different algorithms: - // - // - NVLS intra-node: nvlsSupport - // - NVLS intra+inter-node: collNetSupport - // - NVLSTree intra-node: always disabled - // - NVLSTree inter-node: nvlsSupport - // - Collnet* inter-node: collNetSupport - // - // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; -} - -static int hasCollNetSupport(float** collCostTable) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; -} - -static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { - NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; - ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { - int algorithm = NCCL_ALGO_UNDEF; - int protocol = NCCL_PROTO_UNDEF; - int nvlsSupport = hasNvlsSupport(collCostTable); - int collNetSupport = hasCollNetSupport(collCostTable); - NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); - // set time to 0 below to make sure this algorithm/protocol is selected later on - if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; - } - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; - ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; - return ncclSuccess; -} - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(const char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openTunerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char tunerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); - if (envTunerPluginName && strlen(envTunerPluginName)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); - snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - // Users are allowed to pack tuner into the net plugin - snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - tunerPluginLibName[0] = '\0'; - return nullptr; -} - -enum { - tunerPluginLoadFailed = -1, - tunerPluginLoadReady = 0, - tunerPluginLoadSuccess = 1, -}; - -#define MAX_PLUGIN_LOAD 4 - -static int status = tunerPluginLoadReady; - -ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { - // Initialize to nullptr by default 
if plugin tuner cannot be loaded. - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - comm->tuner = nullptr; - if (tunerPluginLoadFailed == status) { - return ncclSuccess; - } - - pthread_mutex_lock(&tunerPluginLock); - if (tunerPluginLoadFailed == status) { - goto exit; - } - - if (tunerPluginLoadSuccess == status) { - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - goto exit; - } - - tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (nullptr == tunerPluginLib) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); - } - goto fail; - } - - tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); - if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); - ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); - if (ncclTuner_v3 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; - } else { - ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v4; - } - } else { - ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - tunerSymbol = &ncclTuner_v3_as_v4; - } - } - - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - status = tunerPluginLoadSuccess; - comm->tunerPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -fail: - tunerPluginLib = nullptr; - status = tunerPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); - if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - dlclose(tunerPluginLib); - tunerPluginLib = nullptr; - tunerSymbol = nullptr; - comm->tuner = nullptr; - status = tunerPluginLoadReady; - comm->tunerPluginLoaded = 0; - } - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 8a6f94e..f3ab534 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -66,6 +66,7 @@ typedef struct ncclConfig_v21700 { int maxCTAs; const char *netName; int splitShare; + int trafficClass; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -79,7 +80,8 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. 
*/ diff --git a/src/net.cc deleted file mode 100644 index 13e8c2b..0000000 --- a/src/net.cc +++ /dev/null @@ -1,1033 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "net.h" -#include "bootstrap.h" -#include "checks.h" - -#include -#include -#include -//#include -//#include -//#include - -static ncclNet_v9_t ncclNet_v5_as_v9; -static ncclNet_v9_t ncclNet_v6_as_v9; -static ncclNet_v9_t ncclNet_v7_as_v9; -static ncclNet_v9_t ncclNet_v8_as_v9; -static ncclNet_v5_t *ncclNet_v5; -static ncclNet_v6_t *ncclNet_v6; -static ncclNet_v7_t *ncclNet_v7; -static ncclNet_v8_t *ncclNet_v8; -static ncclCollNet_v9_t ncclCollNet_v5_as_v9; -static ncclCollNet_v9_t ncclCollNet_v6_as_v9; -static ncclCollNet_v9_t ncclCollNet_v7_as_v9; -static ncclCollNet_v9_t ncclCollNet_v8_as_v9; -static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclCollNet_v6_t *ncclCollNet_v6; -static ncclCollNet_v7_t *ncclCollNet_v7; -static ncclCollNet_v8_t *ncclCollNet_v8; - -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. -#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried - -static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = p8.netDeviceType; - props->netDeviceVersion = p8.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v8->init(logfn)); - ncclNet_v8_as_v9.name = ncclNet_v8->name; - ncclNet_v8_as_v9.devices = ncclNet_v8->devices; - ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; - ncclNet_v8_as_v9.listen = ncclNet_v8->listen; - ncclNet_v8_as_v9.connect = ncclNet_v8->connect; - ncclNet_v8_as_v9.accept = ncclNet_v8->accept; - ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; -
ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; - ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; - ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; - ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; - ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; - ncclNet_v8_as_v9.test = ncclNet_v8->test; - ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; - ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; - ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; - ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; - ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; - ncclNet_v8_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = p7.netDeviceType; - props->netDeviceVersion = p7.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v9.name = ncclNet_v7->name; - ncclNet_v7_as_v9.devices = ncclNet_v7->devices; - ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v9.listen = ncclNet_v7->listen; - ncclNet_v7_as_v9.connect = ncclNet_v7->connect; - ncclNet_v7_as_v9.accept = ncclNet_v7->accept; - ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; - ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; - ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; - ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v9.test = ncclNet_v7->test; - ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; - 
ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; - ncclNet_v7_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v6->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v6->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v9.name = ncclNet_v6->name; - ncclNet_v6_as_v9.devices = ncclNet_v6->devices; - ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; - ncclNet_v6_as_v9.listen = ncclNet_v6->listen; - ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; - ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; - ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; - ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; - ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; - ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v9.test = ncclNet_v6->test; - ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v9.getDeviceMr = NULL; - ncclNet_v6_as_v9.irecvConsumed = NULL; - ncclNet_v6_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t*
props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v5->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v5->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand.
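Every one of the v5-v8 shims above guards the same size_t-to-int narrowing before entering the old plugin ABI, rejecting oversized transfers instead of silently truncating them. Condensed, the idiom is (narrowSize is a hypothetical helper name used only for illustration, not part of this file):

  static inline ncclResult_t narrowSize(size_t size, int* sizeInt) {
    // Legacy entry points take int sizes; anything above MAX_NET_SIZE (1 GiB) is refused outright.
    if (size > MAX_NET_SIZE) return ncclInternalError;
    *sizeInt = (int)size;
    return ncclSuccess;
  }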
-static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v9.name = ncclNet_v5->name; - ncclNet_v5_as_v9.devices = ncclNet_v5->devices; - ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; - ncclNet_v5_as_v9.listen = ncclNet_v5->listen; - ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; - ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; - ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; - ncclNet_v5_as_v9.regMrDmaBuf = NULL; - ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; - ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; - ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v9.test = ncclNet_v5->test; - ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v9.getDeviceMr = NULL; - ncclNet_v5_as_v9.irecvConsumed = NULL; - ncclNet_v5_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; - ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; - ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; - ncclCollNet_v5_as_v9.iallgather = nullptr; - ncclCollNet_v5_as_v9.ireducescatter = nullptr; - ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v6 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; - ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; - ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; - ncclCollNet_v6_as_v9.iallgather = nullptr; - ncclCollNet_v6_as_v9.ireducescatter = nullptr; - ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v7 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; - ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; - ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; - ncclCollNet_v7_as_v9.iallgather = nullptr; - ncclCollNet_v7_as_v9.ireducescatter = nullptr; - ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request) { - ncclNetSGE_v8_t recvPartsInt; - if (nRecvParts > 1) return ncclInternalError; - if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - recvPartsInt.mhandle = recvParts->mhandle; - recvPartsInt.address = recvParts->address; - recvPartsInt.size = (int)recvParts->size; - ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, - bytesPerRank, windowOffset, windowBytes, - sendMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request) { - ncclNetSGE_v8_t sendPartsInt; - if (nSendParts > 1) return ncclInternalError; - if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - sendPartsInt.mhandle = 
sendParts->mhandle; - sendPartsInt.address = sendParts->address; - sendPartsInt.size = (int)sendParts->size; - ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, - recvData, bytesPerRank, windowOffset, windowBytes, - dataType, redOp, - recvMhandle, request); - return ans; -} - -// We use a wrapper around the v8 init to copy over the struct contents -// post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v8->init(logfn)); - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; - ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; - ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; - ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; - ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; - ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; - ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; - ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; - ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; - ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; - ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; - ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; - ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; - ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; - ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; - return ncclSuccess; -} - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); - return nameList; -} - -static void* openNetPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char netPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - - snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } else { - snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } - return nullptr; -} - -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int netPluginRefCount; -static void* netPluginLib; - -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; - -#define MAX_PLUGIN_LOAD 2 - -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; - } - - netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (netPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. 
Using internal network plugin.", couldNotFindNames); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); - } - goto fail; - } - - ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); - if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); - ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); - if (ncclNet_v8 == nullptr) { - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; - } else { - ncclNets[0] = &ncclNet_v5_as_v9; - ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v9.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v6_as_v9; - ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v9.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v7_as_v9; - ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v7_as_v9.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v8_as_v9; - ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v8_as_v9.name = ncclNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); - } - - // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); - if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); - ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); - if (ncclCollNet_v8 == nullptr) { - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); - } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v9; - ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v9; - ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v9; - ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v8_as_v9; - ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); - } - - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -fail: - if (netPluginLib) dlclose(netPluginLib); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - dlclose(netPluginLib); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; - } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -} - -ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { - ncclNetProperties_t props; - - NCCLCHECK(net->getProperties(dev, &props)); - ncclNetDeviceType type = props.netDeviceType; - if (type) switch (type) { - case NCCL_NET_DEVICE_UNPACK: - if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { - INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", - props.netDeviceVersion); - return ncclSuccess; - } else { - WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", - props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); - return ncclInternalError; - } - default: - WARN("Unknown device code index %d \n", type); - return ncclInternalError; - } - - return ncclSuccess; -} - -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; - } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -static ncclResult_t 
collNetGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclCollNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; - else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; - else ncclCollNetStates[i] = ncclNetStateEnabled; - } - *state = ncclCollNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -ncclResult_t ncclNetInit(struct ncclComm* comm) { - // Initialize main communication network - const char* netName; - bool ok = false; - - netName = comm->config.netName; - for (int i=0; i<3; i++) { - if (ncclNets[i] == nullptr) continue; - enum ncclNetState state; - NCCLCHECK(netGetState(i, &state)); - if (state != ncclNetStateEnabled) continue; - if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; - if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { - // Mismatched device plugin version - continue; - } - - comm->ncclNet = ncclNets[i]; - ok = true; - - if (ncclCollNets[i]) { - NCCLCHECK(collNetGetState(i, &state)); - if (state == ncclNetStateEnabled) { - comm->ncclCollNet = ncclCollNets[i]; - } - } - break; - } - - if (!ok) { - WARN("Error: network %s not found.", netName ? netName : ""); - return ncclInvalidUsage; - } - return ncclSuccess; -} - -ncclResult_t ncclNetFinalize(struct ncclComm* comm) { - comm->ncclNet = nullptr; - comm->ncclCollNet = nullptr; - return ncclSuccess; -} - -ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { - constexpr int GPU_BUF_SIZE = 2*1024*1024; -#if CUDART_VERSION >= 11030 - // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute - int driverVersion; - CUDACHECK(cudaDriverGetVersion(&driverVersion)); - if (driverVersion >= 11030) { - int cudaDev, attr = 0; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); - *gdrSupport = attr; - return ncclSuccess; - } -#endif - static int gdrSupportMatrix[32] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; - if (gdrSupportMatrix[comm->cudaDev] == -1) { - int netDevs; - NCCLCHECK(comm->ncclNet->devices(&netDevs)); - gdrSupportMatrix[comm->cudaDev] = 0; - for (int dev=0; dev<netDevs; dev++) { - // Find a net device which is GDR-capable - ncclNetProperties_t props; - NCCLCHECK(comm->ncclNet->getProperties(dev, &props)); - if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; - - // Allocate memory on the GPU and try to register it on the NIC.
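What follows is the probe itself: for each CUDA-capable NIC it opens a loopback listen/connect/accept triple, then attempts to register a small CUDA buffer (GPU_BUF_SIZE, 2 MB) on both endpoints with regMr; only if both registrations succeed is the GPU recorded as GDR-capable in gdrSupportMatrix.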
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - char* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); - - bool connected; - connected = false; - while (!connected) { - - // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { - goto cleanup2; - } - - if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); - - if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); - - connected = (rComm != NULL) && (sComm != NULL); - } - - NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); - NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); - gdrSupportMatrix[comm->cudaDev] = 1; - } - ncclDebugNoWarn = 0; - NCCLCHECK(ncclCudaFree(gpuPtr)); -cleanup2: - if (rComm != NULL) - NCCLCHECK(comm->ncclNet->closeRecv(rComm)); - if (sComm != NULL) - NCCLCHECK(comm->ncclNet->closeSend(sComm)); - NCCLCHECK(comm->ncclNet->closeListen(lComm)); -cleanup1: - break; - } - } - *gdrSupport = gdrSupportMatrix[comm->cudaDev]; - return ncclSuccess; -} - -int ncclNetVersion(struct ncclComm* comm) { - return - (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : - (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 : - 9; -} diff --git a/src/plugin/net.cc b/src/plugin/net.cc new file mode 100644 index 0000000..9257d77 --- /dev/null +++ b/src/plugin/net.cc @@ -0,0 +1,319 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "net.h" +#include "bootstrap.h" +#include "checks.h" +#include "plugin.h" + +#include <string.h> +#include <errno.h> +//#include <sys/types.h> +//#include <sys/stat.h> +//#include <unistd.h> + +extern ncclNet_t* getNcclNet_v6(void* netPluginLib); +extern ncclNet_t* getNcclNet_v7(void* netPluginLib); +extern ncclNet_t* getNcclNet_v8(void* netPluginLib); +extern ncclNet_t* getNcclNet_v9(void* netPluginLib); +extern ncclNet_t* getNcclNet_v10(void* netPluginLib); + +extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib); + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; +static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 }; +ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; +enum ncclNetState { + ncclNetStateInit = 0, + ncclNetStateEnabled = 1, + ncclNetStateDisabled = 2 +}; +enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; + +NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); +static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; +static void* netPluginLib; + +static int netPluginRefCount; +static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();} + +enum { + netPluginLoadFailed = -1, + netPluginLoadReady = 0, + netPluginLoadSuccess = 1, +}; + +static int netPluginStatus = netPluginLoadReady; + +ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { + static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT; + pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce); + + pthread_mutex_lock(&netPluginLock); + if (netPluginLoadFailed == netPluginStatus) { + goto exit; + } + if (netPluginLoadSuccess == netPluginStatus) { + ++netPluginRefCount; + goto exit; + } + + netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN")); + if (netPluginLib == nullptr) { + goto fail; + } + + ncclNets[0] = getNcclNet_v10(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 10; + if (ncclNets[0] == nullptr) { + // Try v9 plugin + ncclNets[0] = getNcclNet_v9(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 9; + } + if (ncclNets[0] == nullptr) { + // Try v8 plugin + ncclNets[0] = getNcclNet_v8(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 8; + } + if (ncclNets[0] == nullptr) { + // Try v7 plugin + ncclNets[0] = getNcclNet_v7(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 7; + } + if (ncclNets[0] == nullptr) { + // Try v6 plugin + ncclNets[0] = getNcclNet_v6(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 6; + } + if (ncclNets[0] == nullptr) { + goto fail; + } + + // Check for CollNet + ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v9(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v8(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v7(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] =
getNcclCollNet_v6(netPluginLib); + } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + } + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +} + +ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { + ncclNetProperties_t props; + + NCCLCHECK(net->getProperties(dev, &props)); + ncclNetDeviceType type = props.netDeviceType; + if (type) switch (type) { + case NCCL_NET_DEVICE_UNPACK: + if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { + INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; + } else { + WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", + props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); + return ncclInternalError; + } + default: + WARN("Unknown device code index %d \n", type); + return ncclInternalError; + } + + return ncclSuccess; +} + +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + // Initialize main communication network + const char* netName; + bool ok = false; + + netName = comm->config.netName; + for (int i=0; i<3; i++) { + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { + // Mismatched device plugin version + continue; + } + + comm->ncclNet = ncclNets[i]; + comm->ncclNetVer = 
ncclNetsVer[i]; + ok = true; + + if (ncclCollNets[i]) { + NCCLCHECK(collNetGetState(i, &state)); + if (state == ncclNetStateEnabled) { + comm->ncclCollNet = ncclCollNets[i]; + } + } + break; + } + + if (!ok) { + WARN("Error: network %s not found.", netName ? netName : ""); + return ncclInvalidUsage; + } + return ncclSuccess; +} + +ncclResult_t ncclNetFinalize(struct ncclComm* comm) { + comm->ncclNet = nullptr; + comm->ncclCollNet = nullptr; + return ncclSuccess; +} + +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { + constexpr int GPU_BUF_SIZE = 2*1024*1024; +#if CUDART_VERSION >= 11030 + // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute + int driverVersion; + CUDACHECK(cudaDriverGetVersion(&driverVersion)); + if (driverVersion >= 11030) { + int cudaDev, attr = 0; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); + *gdrSupport = attr; + return ncclSuccess; + } +#endif + static int gdrSupportMatrix[32] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; + if (gdrSupportMatrix[comm->cudaDev] == -1) { + int netDevs; + NCCLCHECK(comm->ncclNet->devices(&netDevs)); + gdrSupportMatrix[comm->cudaDev] = 0; + for (int dev=0; dev<netDevs; dev++) { + // Find a net device which is GDR-capable + ncclNetProperties_t props; + NCCLCHECK(comm->ncclNet->getProperties(dev, &props)); + if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; + + // Allocate memory on the GPU and try to register it on the NIC. + void *lComm = NULL, *sComm = NULL, *rComm = NULL; + ncclNetHandle_t handle; + char* gpuPtr = NULL; + void* mHandle = NULL; + ncclResult_t ret; + ncclDebugNoWarn = NCCL_NET; + NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); + + bool connected; + connected = false; + while (!connected) { + + // If we're aborting now, skip to cleanup + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { + goto cleanup2; + } + + if (sComm == NULL) + NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2); + + if (rComm == NULL) + NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); + + connected = (rComm != NULL) && (sComm != NULL); + } + + NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); + if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); + NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); + gdrSupportMatrix[comm->cudaDev] = 1; + } + ncclDebugNoWarn = 0; + NCCLCHECK(ncclCudaFree(gpuPtr)); +cleanup2: + if (rComm != NULL) + NCCLCHECK(comm->ncclNet->closeRecv(rComm)); + if (sComm != NULL) + NCCLCHECK(comm->ncclNet->closeSend(sComm)); + NCCLCHECK(comm->ncclNet->closeListen(lComm)); +cleanup1: + break; + } + } + *gdrSupport = gdrSupportMatrix[comm->cudaDev]; + return ncclSuccess; +} diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc new file mode 100644 index 0000000..682f239 --- /dev/null +++ b/src/plugin/net/net_v10.cc @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" + +static ncclNet_v10_t* ncclNet_v10; +static ncclCollNet_v10_t* ncclCollNet_v10; + +ncclNet_t* getNcclNet_v10(void* lib) { + ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); + if (ncclNet_v10) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); + return ncclNet_v10; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); + return nullptr; +} + +ncclCollNet_t* getNcclCollNet_v10(void* lib) { + ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); + if (ncclCollNet_v10) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name); + return ncclCollNet_v10; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc new file mode 100644 index 0000000..baff679 --- /dev/null +++ b/src/plugin/net/net_v6.cc @@ -0,0 +1,178 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v6_t* ncclNet_v6; +static ncclCollNet_v6_t* ncclCollNet_v6; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); + if (ans != ncclSuccess) return ans; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { + return ncclNet_v6->connect(dev, handle, sendComm); +} + +static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { + return ncclNet_v6->accept(listenComm, recvComm); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void**
request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to nullptr if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); + if (ans != ncclSuccess) return ans; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v6->init(logfn)); + ncclNet.devices = ncclNet_v6->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v6->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_accept; + ncclNet.regMr = ncclNet_regMr; + ncclNet.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v6->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v6->iflush; + ncclNet.test = ncclNet_v6->test; + ncclNet.closeSend = ncclNet_v6->closeSend; + ncclNet.closeRecv = ncclNet_v6->closeRecv; + ncclNet.closeListen = ncclNet_v6->closeListen; + ncclNet.getDeviceMr = NULL; + ncclNet.irecvConsumed = NULL; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v6(void* lib) { + ncclNet_v6 = (ncclNet_v6_t*)dlsym(lib, "ncclNetPlugin_v6"); + if (ncclNet_v6) { + ncclNet.name = ncclNet_v6->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v6->init(logfn)); + ncclCollNet.devices = ncclCollNet_v6->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v6->listen; + ncclCollNet.connect = ncclCollNet_v6->connect; + ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport; + ncclCollNet.regMr =
ncclCollNet_regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v6->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + ncclCollNet.iallgather = nullptr; + ncclCollNet.ireducescatter = nullptr; + ncclCollNet.iflush = ncclCollNet_v6->iflush; + ncclCollNet.test = ncclCollNet_v6->test; + ncclCollNet.closeColl = ncclCollNet_v6->closeColl; + ncclCollNet.closeListen = ncclCollNet_v6->closeListen; + return ncclSuccess; +} + +ncclCollNet_t* getNcclCollNet_v6(void* lib) { + ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(lib, "ncclCollNetPlugin_v6"); + if (ncclCollNet_v6) { + ncclCollNet.name = ncclCollNet_v6->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name); + return &ncclCollNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc new file mode 100644 index 0000000..4bad5ec --- /dev/null +++ b/src/plugin/net/net_v7.cc @@ -0,0 +1,174 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v7_t* ncclNet_v7; +static ncclCollNet_v7_t* ncclCollNet_v7; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v7_t p7; + ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); + if (ans != ncclSuccess) return ans; + props->name = p7.name; + props->pciPath = p7.pciPath; + props->guid = p7.guid; + props->ptrSupport = p7.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p7.speed; + props->port = p7.port; + props->maxComms = p7.maxComms; + props->maxRecvs = p7.maxRecvs; + props->latency = p7.latency; + props->netDeviceType = p7.netDeviceType; + props->netDeviceVersion = p7.netDeviceVersion; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to nullptr if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + 
diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc
new file mode 100644
index 0000000..4bad5ec
--- /dev/null
+++ b/src/plugin/net/net_v7.cc
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v7_t* ncclNet_v7;
+static ncclCollNet_v7_t* ncclCollNet_v7;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v7_t p7;
+  ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
+  if (ans != ncclSuccess) return ans;
+  props->name = p7.name;
+  props->pciPath = p7.pciPath;
+  props->guid = p7.guid;
+  props->ptrSupport = p7.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p7.speed;
+  props->port = p7.port;
+  props->maxComms = p7.maxComms;
+  props->maxRecvs = p7.maxRecvs;
+  props->latency = p7.latency;
+  props->netDeviceType = p7.netDeviceType;
+  props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v7_t p7;
+  ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7);
+  if (ans != ncclSuccess) return ans;
+  props->name = p7.name;
+  props->pciPath = p7.pciPath;
+  props->guid = p7.guid;
+  props->ptrSupport = p7.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p7.speed;
+  props->port = p7.port;
+  props->maxComms = p7.maxComms;
+  props->maxRecvs = p7.maxRecvs;
+  props->latency = p7.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+      sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v7->init(logfn));
+  ncclNet.devices = ncclNet_v7->devices;
+  ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties;
+  ncclNet.listen = ncclNet_v7->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_v7->accept;
+  ncclNet.regMr = ncclNet_regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v7->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v7->iflush;
+  ncclNet.test = ncclNet_v7->test;
+  ncclNet.closeSend = ncclNet_v7->closeSend;
+  ncclNet.closeRecv = ncclNet_v7->closeRecv;
+  ncclNet.closeListen = ncclNet_v7->closeListen;
+  ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr;
+  ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v7(void* lib) {
+  ncclNet_v7 = (ncclNet_v7_t*)dlsym(lib, "ncclNetPlugin_v7");
+  if (ncclNet_v7) {
+    ncclNet.name = ncclNet_v7->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v7->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v7->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v7->listen;
+  ncclCollNet.connect = ncclCollNet_v7->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v7->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet.test = ncclCollNet_v7->test;
+  ncclCollNet.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v7(void* lib) {
+  ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(lib, "ncclCollNetPlugin_v7");
+  if (ncclCollNet_v7) {
+    ncclCollNet.name = ncclCollNet_v7->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc
new file mode 100644
index 0000000..b43bb89
--- /dev/null
+++ b/src/plugin/net/net_v8.cc
@@ -0,0 +1,196 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v8_t* ncclNet_v8;
+static ncclCollNet_v8_t* ncclCollNet_v8;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + ncclNetSGE_v8_t recvPartsInt; + if (nRecvParts > 1) return ncclInternalError; + if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + recvPartsInt.mhandle = recvParts->mhandle; + recvPartsInt.address = recvParts->address; + recvPartsInt.size = (int)recvParts->size; + ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, + bytesPerRank, windowOffset, windowBytes, + sendMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + ncclNetSGE_v8_t sendPartsInt; + if (nSendParts > 1) return ncclInternalError; + if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + sendPartsInt.mhandle = sendParts->mhandle; + sendPartsInt.address = sendParts->address; + sendPartsInt.size = (int)sendParts->size; + ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, + recvData, bytesPerRank, windowOffset, windowBytes, + dataType, redOp, + recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v8->init(logfn)); + ncclNet.devices = ncclNet_v8->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v8->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v8->accept; + ncclNet.regMr = ncclNet_v8->regMr; + ncclNet.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v8->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v8->iflush; + ncclNet.test = ncclNet_v8->test; + ncclNet.closeSend = ncclNet_v8->closeSend; + ncclNet.closeRecv = ncclNet_v8->closeRecv; + ncclNet.closeListen = ncclNet_v8->closeListen; + ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v8(void* lib) { + ncclNet_v8 = (ncclNet_v8_t*)dlsym(lib, "ncclNetPlugin_v8"); + if (ncclNet_v8) { + ncclNet.name = ncclNet_v8->name; + ncclNet.init 
= ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v8->init(logfn)); + ncclCollNet.devices = ncclCollNet_v8->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v8->listen; + ncclCollNet.connect = ncclCollNet_v8->connect; + ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v8->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v8->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v8->iflush; + ncclCollNet.test = ncclCollNet_v8->test; + ncclCollNet.closeColl = ncclCollNet_v8->closeColl; + ncclCollNet.closeListen = ncclCollNet_v8->closeListen; + return ncclSuccess; +} + +ncclCollNet_t* getNcclCollNet_v8(void* lib) { + ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(lib, "ncclCollNetPlugin_v8"); + if (ncclCollNet_v8) { + ncclCollNet.name = ncclCollNet_v8->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name); + return &ncclCollNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc new file mode 100644 index 0000000..34e0393 --- /dev/null +++ b/src/plugin/net/net_v9.cc @@ -0,0 +1,121 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v9_t* ncclNet_v9; +static ncclCollNet_v9_t* ncclCollNet_v9; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request); +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { + return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request); +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props); +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); +} + +static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + return ncclCollNet_v9->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v9_t*)recvParts, bytesPerRank, + windowOffset, windowBytes, sendMhandle, request); +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank, + windowOffset, windowBytes, dataType, redOp, recvMhandle, request); +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v9->init(logfn)); + ncclNet.devices = ncclNet_v9->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v9->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v9->accept; + ncclNet.regMr = ncclNet_v9->regMr; + ncclNet.regMrDmaBuf = ncclNet_v9->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v9->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v9->iflush; + ncclNet.test = ncclNet_v9->test; + ncclNet.closeSend = ncclNet_v9->closeSend; + ncclNet.closeRecv = ncclNet_v9->closeRecv; + ncclNet.closeListen = ncclNet_v9->closeListen; + ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed; + ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? 
ncclNet_makeVDevice : nullptr;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v9(void* lib) {
+  ncclNet_v9 = (ncclNet_v9_t*)dlsym(lib, "ncclNetPlugin_v9");
+  if (ncclNet_v9) {
+    ncclNet.name = ncclNet_v9->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v9->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v9->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v9->listen;
+  ncclCollNet.connect = ncclCollNet_v9->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_v9->regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v9->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v9->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_v9->iallreduce;
+  ncclCollNet.iallgather = ncclCollNet_iallgather;
+  ncclCollNet.ireducescatter = ncclCollNet_ireducescatter;
+  ncclCollNet.iflush = ncclCollNet_v9->iflush;
+  ncclCollNet.test = ncclCollNet_v9->test;
+  ncclCollNet.closeColl = ncclCollNet_v9->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v9->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v9(void* lib) {
+  ncclCollNet_v9 = (ncclCollNet_v9_t*)dlsym(lib, "ncclCollNetPlugin_v9");
+  if (ncclCollNet_v9) {
+    ncclCollNet.name = ncclCollNet_v9->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol.");
+  return nullptr;
+}
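Taken together, getNcclNet_v6 through getNcclNet_v9 give the loader one probe per supported ABI. The code that chains them is not part of this excerpt, so the loop below is only a plausible sketch of that dispatch, mirroring the newest-first fallback that the profiler (getNcclProfiler_v3/v2/v1) and tuner (getNcclTuner_v4/v3/v2) loaders later in this patch use:

    // Plausible sketch only (the real chaining code lives outside this excerpt):
    // probe the newest interface first, fall back one version at a time.
    static ncclNet_t* probeNetPlugin(void* lib) {
      ncclNet_t* net = getNcclNet_v9(lib);
      if (net == nullptr) net = getNcclNet_v8(lib);
      if (net == nullptr) net = getNcclNet_v7(lib);
      if (net == nullptr) net = getNcclNet_v6(lib);
      return net;  // nullptr: no compatible ncclNetPlugin_vN symbol found
    }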
diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc
new file mode 100644
index 0000000..a43df28
--- /dev/null
+++ b/src/plugin/plugin_open.cc
@@ -0,0 +1,134 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "debug.h"
+
+#define MAX_STR_LEN 255
+
+enum ncclPluginType {
+  ncclPluginTypeNet,
+  ncclPluginTypeTuner,
+  ncclPluginTypeProfiler,
+};
+
+#define NUM_LIBS 3
+static void *libHandles[NUM_LIBS];
+static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
+static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" };
+static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
+static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
+
+static void* tryOpenLib(char* name, int* err, char* errStr) {
+  *err = 0;
+  if (nullptr == name || strlen(name) == 0) {
+    return nullptr;
+  }
+
+  if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
+    name = nullptr;
+  }
+
+  void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
+  if (nullptr == handle) {
+    strncpy(errStr, dlerror(), MAX_STR_LEN);
+    errStr[MAX_STR_LEN] = '\0';
+    // "handle" and "name" won't be NULL at the same time.
+    // coverity[var_deref_model]
+    if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
+      *err = ENOENT;
+    }
+  }
+  return handle;
+}
+
+static void appendNameToList(char* nameList, int *nameListLen, char* name) {
+  snprintf(nameList, *nameListLen, " %s", name);
+  nameList += strlen(name) + 1;
+  *nameListLen -= strlen(name) + 1;
+}
+
+static void* openPluginLib(enum ncclPluginType type, const char* libName) {
+  int openErr, len = PATH_MAX;
+  char libName_[MAX_STR_LEN] = { 0 };
+  char openErrStr[MAX_STR_LEN + 1] = { 0 };
+  char eNoEntNameList[PATH_MAX] = { 0 };
+
+  if (libName && strlen(libName)) {
+    snprintf(libName_, MAX_STR_LEN, "%s", libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+
+    snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+  } else {
+    snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+  }
+
+  if (strlen(eNoEntNameList)) {
+    INFO(subsys[type], "%s/Plugin: Could not find:%s. %s", pluginNames[type], eNoEntNameList, pluginFallback[type]);
+  } else if (strlen(pluginFallback[type])) {
+    INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]);
+  }
+  return nullptr;
+}
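openPluginLib()'s name resolution is worth spelling out. The snippet below is illustrative usage only (the plugin name "mynet" is hypothetical), following exactly the attempts the function above makes:

    // Illustrative only. With NCCL_NET_PLUGIN=mynet, the code above tries
    // dlopen("mynet") verbatim, then dlopen("libnccl-net-mynet.so"); with the
    // variable unset it tries the bare "libnccl-net.so". Tuner and profiler
    // plugins follow the same scheme via pluginPrefix[].
    static void exampleUsage(void) {
      void* handle = ncclOpenNetPluginLib("mynet");  // hypothetical plugin name
      if (handle == nullptr) {
        // NCCL falls back to its internal transport (see pluginFallback[])
      }
    }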
%s", pluginNames[type], eNoEntNameList, pluginFallback[type]); + } else if (strlen(pluginFallback[type])) { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]); + } + return nullptr; +} + +void* ncclOpenNetPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeNet, name); +} + +void* ncclOpenTunerPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeTuner, name); +} + +void* ncclOpenProfilerPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeProfiler, name); +} + +void* ncclGetNetPluginLib(void) { + return libHandles[ncclPluginTypeNet]; +} + +ncclResult_t ncclClosePluginLib(void* handle) { + for (int l=0; ltype; - eDescr_v1.parentObj = eDescr->parentObj; - eDescr_v1.rank = eDescr->rank; - switch(eDescr->type) { - case ncclProfileGroup: break; - case ncclProfileColl: { - eDescr_v1.coll.name = eDescr->coll.name; - eDescr_v1.coll.commHash = eDescr->coll.commHash; - eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; - eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); - eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; - eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; - eDescr_v1.coll.count = eDescr->coll.count; - eDescr_v1.coll.root = eDescr->coll.root; - eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); - eDescr_v1.coll.op = 0; // removed in v2 - eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes; - eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; - eDescr_v1.coll.nWarps = eDescr->coll.nWarps; - eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); - eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); - } break; - case ncclProfileP2p: { - eDescr_v1.p2p.name = eDescr->p2p.name; - eDescr_v1.p2p.commHash = eDescr->p2p.commHash; - eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); - eDescr_v1.p2p.buff = eDescr->p2p.buff; - eDescr_v1.p2p.count = eDescr->p2p.count; - eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); - eDescr_v1.p2p.peer = eDescr->p2p.peer; - } break; - case ncclProfileProxyOp: { - eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; - eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; - eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; - eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; - eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; - eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; - } break; - case ncclProfileProxyStep: { - eDescr_v1.proxyStep.step = eDescr->proxyStep.step; - } break; - case ncclProfileProxyCtrl: break; - default:; - } - return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); -} - -static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { - ncclProfiler_v1->init(context, eActivationMask); - ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; - ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; - ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState; - ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize; - return ncclSuccess; -} #define MAX_STR_LEN 256 -static void* tryOpenLib(char* name, int *err, char* errStr) { - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = 0; - if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = 
ENOENT; - } - } - - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openProfilerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char profilerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - - const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN"); - if (envProfilerPluginName && strlen(envProfilerPluginName)) { - snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); - return pluginLib; - } - - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - } else { - snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so"); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - } - - return nullptr; -} - enum { profilerPluginLoadFailed = -1, profilerPluginLoadReady = 0, @@ -195,43 +33,31 @@ enum { static int profilerPluginStatus = profilerPluginLoadReady; static pid_t pid; -#define MAX_PLUGIN_LOAD 2 - static ncclResult_t ncclProfilerPluginLoad(void) { if (profilerPluginLoadFailed == profilerPluginStatus) { return ncclSuccess; } - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; pthread_mutex_lock(&profilerLock); if (profilerPluginLoadSuccess == profilerPluginStatus) { ++profilerPluginRefCount; goto exit; } - profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); + profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN")); if (profilerPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames); - } goto fail; } - ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2"); + ncclProfiler = getNcclProfiler_v3(profilerPluginLib); if (ncclProfiler == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2."); - ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1"); - if (ncclProfiler_v1 == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); - goto fail; - } else { - ncclProfiler = &ncclProfiler_v1_as_v2; - ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name; - ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1."); - } - } else { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2."); + ncclProfiler = getNcclProfiler_v2(profilerPluginLib); + } + if (ncclProfiler == NULL) { + ncclProfiler = 
getNcclProfiler_v1(profilerPluginLib); + } + if (ncclProfiler == NULL) { + goto fail; } ++profilerPluginRefCount; @@ -247,7 +73,7 @@ exit: pthread_mutex_unlock(&profilerLock); return ncclSuccess; fail: - if (profilerPluginLib) dlclose(profilerPluginLib); + if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); profilerPluginStatus = profilerPluginLoadFailed; goto exit; } @@ -256,7 +82,7 @@ static ncclResult_t ncclProfilerPluginUnload(void) { pthread_mutex_lock(&profilerLock); if (0 == (--profilerPluginRefCount)) { INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); - dlclose(profilerPluginLib); + NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; @@ -269,6 +95,11 @@ static ncclResult_t ncclProfilerPluginUnload(void) { #include "timer.h" #if ENABLE_TIMER +// These counters are used to measure profiler overheads for different part of the code +// These counters are only useful/meaningful in controlled test environments where there +// is only one thread updating each set of counters, i.e., every communicator has its +// own proxy thread and the network uses only one thread to make progress (this is true +// for net_ib plugin but might not be true for net_socket plugin). static int64_t elapsedCount; static int64_t initCount, finalizeCount; static int64_t groupStartCount, groupStopCount; @@ -324,15 +155,14 @@ static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2]; #endif -static int eActivationMask; // Set by profiler -static int eActivationMaskGroup; // Cached for current group +int ncclProfilerEventMask; // Set by profiler ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { TIME_START_EVENT(elapsed); TIME_START_EVENT(init); ncclProfilerPluginLoad(); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask); + int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask); if (err) { WARN("Profiler init failed with error (%d). 
Continue without profiler.", err); ncclProfiler = NULL; @@ -356,9 +186,29 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { TIME_START_EVENT(groupStart); - eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) { + // Check if any collective in the plan has a set event activation mask + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + int eActivationMask_ = 0; + while (ct) { + if (ct->eActivationMask) { + eActivationMask_ = ct->eActivationMask; + goto startGroup; + } + ct = ct->next; + } + // Check if any pt2pt in the plan has a set event activation mask + while (pt) { + if (pt->eActivationMask) { + eActivationMask_ = pt->eActivationMask; + goto startGroup; + } + pt = pt->next; + } + + startGroup: + if (eActivationMask_ & (ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileGroup; ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr); @@ -379,52 +229,63 @@ ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStart); - if (__builtin_expect(ncclProfiler != NULL, 0)) { - int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); - if (plan->groupEventHandle && enable) { - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileColl; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.coll.name = plan->comm->commName; - eDescr.coll.commHash = plan->comm->commHash; - eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++; - eDescr.coll.func = ncclFuncToString(ct->func); - eDescr.coll.sendBuff = ct->sendbuff; - eDescr.coll.recvBuff = ct->recvbuff; - eDescr.coll.count = ct->count; - eDescr.coll.root = ct->root; - eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.trafficBytes = ct->trafficBytes; - eDescr.coll.nMaxChannels = ct->nMaxChannels; - eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ncclAlgoToString(ct->algorithm); - eDescr.coll.proto = ncclProtoToString(ct->protocol); - ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); - - // update collective task with group event activation mask - ct->eActivationMask = eActivationMaskGroup; - ct = ct->next; + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (plan->groupEventHandle) { + int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileColl; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.coll.name = plan->comm->commName; + eDescr.coll.commHash = plan->comm->commHash; + eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; + eDescr.coll.func = 
ncclFuncToString(ct->func); + eDescr.coll.sendBuff = ct->sendbuff; + eDescr.coll.recvBuff = ct->recvbuff; + eDescr.coll.count = ct->count; + eDescr.coll.root = ct->root; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); + eDescr.coll.nMaxChannels = ct->nMaxChannels; + eDescr.coll.nWarps = ct->nWarps; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); + ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); + } } + } + // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well. + // The test for "persistent" is a workaround for graph-captured collectives. In their case this function may not be + // consistently invoked on all the ranks, which would lead to mismatched counter values and thus false-positive + // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is + // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which + // gives the consistency. + if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && + (ct->eActivationMask & ncclProfileKernelCh))) + plan->comm->seqNumber[ct->func]++; + ct = ct->next; + } + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (plan->groupEventHandle) { struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); while (pt) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileP2p; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.p2p.name = plan->comm->commName; - eDescr.p2p.commHash = plan->comm->commHash; - eDescr.p2p.func = ncclFuncToString(pt->func); - eDescr.p2p.buff = pt->buff; - eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); - eDescr.p2p.peer = pt->root; - ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); - - // update collective task with group event activation mask - pt->eActivationMask = eActivationMaskGroup; + int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2p; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.p2p.name = plan->comm->commName; + eDescr.p2p.commHash = plan->comm->commHash; + eDescr.p2p.func = ncclFuncToString(pt->func); + eDescr.p2p.buff = pt->buff; + eDescr.p2p.count = pt->count; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); + eDescr.p2p.peer = pt->root; + ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); + } pt = pt->next; } } @@ -436,16 +297,15 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStop); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); - if (plan->groupEventHandle && enable) { + if (plan->groupEventHandle) { struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct) { - ncclProfiler->stopEvent(ct->eventHandle); + if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); ct = ct->next; } struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); while (pt) { - ncclProfiler->stopEvent(pt->eventHandle); 
+ if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); pt = pt->next; } } @@ -463,7 +323,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; @@ -485,7 +345,7 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; @@ -518,7 +378,7 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { int step_ = DIVUP(stepId, args->sliceSteps); ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; @@ -536,7 +396,7 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { int step_ = DIVUP(stepId, args->sliceSteps); ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; @@ -568,7 +428,7 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand TIME_START_EVENT(proxyCtrlStart); if (__builtin_expect(ncclProfiler != NULL, 0)) { // for proxy control events we allow profiling mode to change on a per event basis - int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); + int eActivationMaskProxy = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); if (eActivationMaskProxy & ncclProfileProxyCtrl) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyCtrl; @@ -591,6 +451,30 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) { return ncclSuccess; } +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = &args->subs[s]; + if (sub->eActivationMask & ncclProfileKernelCh) { + ncclProfilerEventDescr_t eDescr = { }; + eDescr.type = ncclProfileKernelCh; + eDescr.parentObj = sub->taskEventHandle; + eDescr.kernelCh.channelId = sub->channelId; + ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr); + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = 
&args->subs[s]; + if (sub->kernelEventHandle) { + ncclProfiler->stopEvent(sub->kernelEventHandle); + } + } + return ncclSuccess; +} + ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; @@ -619,7 +503,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyCtrlRecord); - if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { + if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { ncclProfilerEventStateArgs_t args = { }; args.proxyCtrl.appendedProxyOps = appended; ncclProfiler->recordEventState(eHandle, eState, &args); @@ -632,3 +516,47 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) { op->pid = pid; return ncclSuccess; } + +static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER; + +static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) { + ncclResult_t ret = ncclSuccess; + pthread_mutex_lock(&proxyProfilerConnectLock); + if (comm->profiler.initialized) goto exit; + for (int c = 0; c < MAXCHANNELS; c++) { + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.sendProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit); + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.recvProxyConn[c]), ret, exit); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.recvProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit); + } + comm->profiler.initialized = true; +exit: + pthread_mutex_unlock(&proxyProfilerConnectLock); + return ret; +} + +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) { + bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh)); + if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op); + return enabled; +} + +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; + if (type == 0) { // start + if (sub->eActivationMask & ncclProfileNetPlugin) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileNetPlugin; + eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS]; + eDescr.rank = sub->rank; + eDescr.netPlugin.id = pluginId; + eDescr.netPlugin.data = extData; + ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr); + } + } else { // stop + ncclProfiler->stopEvent(*eHandle); + } + } + return ncclSuccess; +} diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc new file mode 100644 index 0000000..1397429 --- /dev/null +++ b/src/plugin/profiler/profiler_v1.cc @@ -0,0 +1,133 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc
new file mode 100644
index 0000000..1397429
--- /dev/null
+++ b/src/plugin/profiler/profiler_v1.cc
@@ -0,0 +1,133 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+#include "checks.h"
+
+static ncclProfiler_t ncclProfiler;
+static ncclProfiler_v1_t* ncclProfiler_v1;
+
+static uint8_t ncclStringToFunc(const char* func) {
+  if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather;
+  if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce;
+  if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast;
+  if (0 == strcmp(func, "Recv")) return ncclFuncRecv;
+  if (0 == strcmp(func, "Reduce")) return ncclFuncReduce;
+  if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter;
+  if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv;
+  return ncclFuncSend;
+}
+
+static uint8_t ncclStringToAlgo(const char* algo) {
+  if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE;
+  if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING;
+  if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT;
+  if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN;
+  if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS;
+  if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE;
+  return NCCL_ALGO_PAT;
+}
+
+static uint8_t ncclStringToProto(const char* proto) {
+  if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL;
+  if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128;
+  return NCCL_PROTO_SIMPLE;
+}
+
+static uint8_t ncclStringToDatatype(const char* dt) {
+  if (0 == strcmp(dt, "ncclInt8")) return ncclInt8;
+  if (0 == strcmp(dt, "ncclInt32")) return ncclInt32;
+  if (0 == strcmp(dt, "ncclUint32")) return ncclUint32;
+  if (0 == strcmp(dt, "ncclInt64")) return ncclInt64;
+  if (0 == strcmp(dt, "ncclUint64")) return ncclUint64;
+  if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16;
+  if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16;
+#endif
+  return ncclFloat64;
+}
+
+static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
+  eDescr_v1.type = eDescr->type;
+  eDescr_v1.parentObj = eDescr->parentObj;
+  eDescr_v1.rank = eDescr->rank;
+  switch(eDescr->type) {
+    case ncclProfileGroup: break;
+    case ncclProfileColl: {
+      eDescr_v1.coll.name = eDescr->coll.name;
+      eDescr_v1.coll.commHash = eDescr->coll.commHash;
+      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
+      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
+      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
+      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
+      eDescr_v1.coll.count = eDescr->coll.count;
+      eDescr_v1.coll.root = eDescr->coll.root;
+      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
+      eDescr_v1.coll.op = 0; // removed in v2
+      eDescr_v1.coll.trafficBytes = 0; // removed in v3
+      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
+      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
+      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
+      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
+    } break;
+    case ncclProfileP2p: {
+      eDescr_v1.p2p.name = eDescr->p2p.name;
+      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
+      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
+      eDescr_v1.p2p.buff = eDescr->p2p.buff;
+      eDescr_v1.p2p.count = eDescr->p2p.count;
+      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype);
+      eDescr_v1.p2p.peer
= eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v1.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: + case ncclProfileNetPlugin: { + *eHandle = NULL; + return ncclSuccess; + } + default:; + } + return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { + NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v1->finalize; + return ncclSuccess; +} + +ncclProfiler_t* getNcclProfiler_v1(void* lib) { + ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(lib, "ncclProfiler_v1"); + if (ncclProfiler_v1) { + ncclProfiler.name = ncclProfiler_v1->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name); + return &ncclProfiler; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); + return NULL; +} diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc new file mode 100644 index 0000000..3d00008 --- /dev/null +++ b/src/plugin/profiler/profiler_v2.cc @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+#include "checks.h"
+
+static ncclProfiler_t ncclProfiler;
+static ncclProfiler_v2_t* ncclProfiler_v2;
+
+static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) {
+    *eHandle = NULL;
+    return ncclSuccess;
+  }
+  return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr);
+}
+
+static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
+  return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs);
+}
+
+static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
+  NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask));
+  ncclProfiler.startEvent = ncclProfiler_startEvent;
+  ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent;
+  ncclProfiler.recordEventState = ncclProfiler_recordEventState;
+  ncclProfiler.finalize = ncclProfiler_v2->finalize;
+  return ncclSuccess;
+}
+
+ncclProfiler_t* getNcclProfiler_v2(void* lib) {
+  ncclProfiler_v2 = (ncclProfiler_v2_t*)dlsym(lib, "ncclProfiler_v2");
+  if (ncclProfiler_v2) {
+    ncclProfiler.name = ncclProfiler_v2->name;
+    ncclProfiler.init = ncclProfiler_init;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name);
+    return &ncclProfiler;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2");
+  return NULL;
+}
diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc
new file mode 100644
index 0000000..322bea5
--- /dev/null
+++ b/src/plugin/profiler/profiler_v3.cc
@@ -0,0 +1,20 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+
+static ncclProfiler_v3_t* ncclProfiler_v3;
+
+ncclProfiler_t* getNcclProfiler_v3(void* lib) {
+  ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3");
+  if (ncclProfiler_v3) {
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name);
+    return ncclProfiler_v3;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3");
+  return NULL;
+}
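One consequence of the loader design worth noting here: ncclTunerPluginLoad() in the file that follows falls back to ncclGetNetPluginLib(), so a tuner does not need its own .so. A single library can carry several plugin roles by exporting each well-known symbol side by side. The declarations below are an illustrative sketch only (contents elided), but the symbol names and types are exactly the ones dlsym()ed in this patch:

    // Illustrative sketch, not part of the patch: one shared library
    // providing net, tuner, and profiler entry points together.
    extern "C" {
      ncclNet_v9_t      ncclNetPlugin_v9;      // found by getNcclNet_v9()
      ncclTuner_v4_t    ncclTunerPlugin_v4;    // found by getNcclTuner_v4()
      ncclProfiler_v3_t ncclProfiler_v3;       // found by getNcclProfiler_v3()
    }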
diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc
new file mode 100644
index 0000000..443bf78
--- /dev/null
+++ b/src/plugin/tuner.cc
@@ -0,0 +1,99 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <pthread.h>
+
+#include "checks.h"
+#include "debug.h"
+#include "tuner.h"
+#include "plugin.h"
+
+extern ncclTuner_t* getNcclTuner_v2(void* lib);
+extern ncclTuner_t* getNcclTuner_v3(void* lib);
+extern ncclTuner_t* getNcclTuner_v4(void* lib);
+
+pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
+static int tunerPluginRefCount;
+static void* tunerPluginLib = nullptr;
+static ncclTuner_t* tunerSymbol = nullptr;
+
+enum {
+  tunerPluginLoadFailed = -1,
+  tunerPluginLoadReady = 0,
+  tunerPluginLoadSuccess = 1,
+};
+
+#define MAX_PLUGIN_LOAD 4
+
+static int status = tunerPluginLoadReady;
+
+ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
+  // Initialize to nullptr by default if plugin tuner cannot be loaded.
+  comm->tuner = nullptr;
+  if (tunerPluginLoadFailed == status) {
+    return ncclSuccess;
+  }
+
+  pthread_mutex_lock(&tunerPluginLock);
+  if (tunerPluginLoadFailed == status) {
+    goto exit;
+  }
+
+  if (tunerPluginLoadSuccess == status) {
+    comm->tuner = tunerSymbol;
+    ++tunerPluginRefCount;
+    goto exit;
+  }
+
+  tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN"));
+  if (nullptr == tunerPluginLib) {
+    tunerPluginLib = ncclGetNetPluginLib();
+    if (nullptr == tunerPluginLib) {
+      goto fail;
+    }
+  }
+
+  tunerSymbol = getNcclTuner_v4(tunerPluginLib);
+  if (tunerSymbol == NULL) {
+    tunerSymbol = getNcclTuner_v3(tunerPluginLib);
+  }
+  if (tunerSymbol == NULL) {
+    tunerSymbol = getNcclTuner_v2(tunerPluginLib);
+  }
+  if (tunerSymbol == NULL) {
+    goto fail;
+  }
+
+  comm->tuner = tunerSymbol;
+  ++tunerPluginRefCount;
+  status = tunerPluginLoadSuccess;
+  comm->tunerPluginLoaded = 1;
+
+exit:
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+fail:
+  tunerPluginLib = nullptr;
+  status = tunerPluginLoadFailed;
+  goto exit;
+}
+
+ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) {
+  pthread_mutex_lock(&tunerPluginLock);
+  if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) {
+    INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
+    NCCLCHECK(ncclClosePluginLib(tunerPluginLib));
+    tunerPluginLib = nullptr;
+    tunerSymbol = nullptr;
+    comm->tuner = nullptr;
+    status = tunerPluginLoadReady;
+    comm->tunerPluginLoaded = 0;
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+}
diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc
new file mode 100644
index 0000000..005638f
--- /dev/null
+++ b/src/plugin/tuner/tuner_v2.cc
@@ -0,0 +1,66 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include "debug.h"
+#include "checks.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v2_t* ncclTuner_v2;
+static ncclTuner_t ncclTuner;
+
+static int hasNvlsSupport(float** collCostTable) {
+  // Requirements for support of different algorithms:
+  //
+  // - NVLS intra-node: nvlsSupport
+  // - NVLS intra+inter-node: collNetSupport
+  // - NVLSTree intra-node: always disabled
+  // - NVLSTree inter-node: nvlsSupport
+  // - Collnet* inter-node: collNetSupport
+  //
+  // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0;
+}
+
+static int hasCollNetSupport(float** collCostTable) {
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
+}
+
+static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
+  int algorithm = NCCL_ALGO_UNDEF;
+  int protocol = NCCL_PROTO_UNDEF;
+  int nvlsSupport = hasNvlsSupport(collCostTable);
+  int collNetSupport = hasCollNetSupport(collCostTable);
+  NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels));
+  // set time to 0 below to make sure this algorithm/protocol is selected later on
+  if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) {
+    float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+    if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
+  NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context));
+  ncclTuner.getCollInfo = ncclTuner_getCollInfo;
+  ncclTuner.destroy = ncclTuner_v2->destroy;
+  return ncclSuccess;
+}
+
+ncclTuner_t* getNcclTuner_v2(void* lib) {
+  ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2");
+  if (ncclTuner_v2) {
+    ncclTuner.name = ncclTuner_v2->name;
+    ncclTuner.init = ncclTuner_init;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name);
+    return &ncclTuner;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
+  return NULL;
+}
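The v2 adapter above shows the cost-table contract: rows are algorithms, columns are protocols, NCCL_ALGO_PROTO_IGNORE marks unusable entries, and writing a low cost steers selection. A hypothetical current-interface tuner could express a preference like this (thresholds and choices are illustrative only):

    // Illustrative sketch, not part of the patch. The signature follows the
    // ncclTuner_getCollInfo wrapper above (the current interface keeps the
    // regBuff argument that the v3 adapter below drops).
    static ncclResult_t exGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
        int numPipeOps, float** collCostTable, int numAlgo, int numProto,
        int regBuff, int* nChannels) {
      float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
      if (collType == ncclFuncAllReduce && nBytes < 65536 &&
          table[NCCL_ALGO_RING][NCCL_PROTO_LL128] != NCCL_ALGO_PROTO_IGNORE) {
        table[NCCL_ALGO_RING][NCCL_PROTO_LL128] = 0.0;  // lowest cost wins
      }
      return ncclSuccess;  // leaving the table untouched keeps NCCL's defaults
    }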
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include <dlfcn.h> +#include "debug.h" +#include "checks.h" +#include "nccl_tuner.h" + +static ncclTuner_v3_t* ncclTuner_v3; +static ncclTuner_t ncclTuner; + +static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { + NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); + return ncclSuccess; +} + +static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { + NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_getCollInfo; + ncclTuner.destroy = ncclTuner_v3->destroy; + return ncclSuccess; +} + +ncclTuner_t* getNcclTuner_v3(void* lib) { + ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3"); + if (ncclTuner_v3) { + ncclTuner.name = ncclTuner_v3->name; + ncclTuner.init = ncclTuner_init; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name); + return &ncclTuner; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + return NULL; +} diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc new file mode 100644 index 0000000..4bfd116 --- /dev/null +++ b/src/plugin/tuner/tuner_v4.cc @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include <dlfcn.h> +#include "debug.h" +#include "nccl_tuner.h" + +static ncclTuner_v4_t* ncclTuner_v4; + +ncclTuner_t* getNcclTuner_v4(void* lib) { + ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4"); + if (ncclTuner_v4) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name); + return ncclTuner_v4; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index 5a83ef3..7e8021e 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -383,6 +383,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->pid = op->pid; sub->profilerContext = op->profilerContext; sub->ringAlgo = op->ringAlgo; + sub->workCounter = op->workCounter; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || @@ -532,6 +533,19 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon return ncclSuccess; } +static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { + struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ?
&comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId]; + if (justInquire) *justInquire = true; + else { + op->sendbuff = (uint8_t *)comm->profiler.workStarted; + op->recvbuff = (uint8_t *)comm->profiler.workCompleted; + NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); + // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter + op->workCounter += comm->profiler.workCounter[op->channelId]; + } + return ncclSuccess; +} + static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; @@ -612,20 +626,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool // Run full algorithm to count the number of steps for each peer. ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - int last = 0; - int *nstepsSend = NULL, *nstepsRecv = NULL; const int rank = comm->rank, nranks = comm->nRanks; - PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int *nstepsSend = NULL, *nstepsRecv = NULL; + PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); - while (last == 0) { - int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); - if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; - if (sendDim != -1 && postSend) nstepsSend[sendDim]++; - } + struct ncclPatStep ps; + do { + algo.getNextOp(&ps); + if (ps.flags & PatSkipped) continue; + if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++; + if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++; + } while (ps.last != 2); for (int i=0; inbytes/comm->nRanks; - int last = 0; - int *nstepsSend = NULL, *nstepsRecv = NULL; const int rank = comm->rank, nranks = comm->nRanks; - PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int *nstepsSend = NULL, *nstepsRecv = NULL; + PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); - while (last == 0) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; - if (sendDim != -1 && postSend) nstepsSend[sendDim]++; - } + struct ncclPatStep ps; + do { + algo.getNextOp(&ps); + if (ps.flags & PatSkipped) continue; + if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++; + if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++; + } while (ps.last != 2); for (int i=0; iroot == comm->rank) return ncclSuccess; NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; + case ncclPatternProfiler: { + if (ncclProfilerNeedsProxy(comm, op)) { + NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); + } + } break; } return ncclSuccess; } @@ -725,10 +742,10 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); - NCCLCHECK(op->progress(proxyState, op)); + ncclResult_t ret = op->progress(proxyState, op); if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; - if (op->state == ncclProxyOpNone) { + if (op->state == ncclProxyOpNone || ret != ncclSuccess) { TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); @@ -910,7 +927,7 @@ void* ncclProxyProgress(void *proxyState_) { if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); - continue; + break; } void* eHandle; ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); @@ -932,7 +949,7 @@ void* ncclProxyProgress(void *proxyState_) { } } lastIdle = idle; - } while (state->stop == 0 || (state->stop == 1 && state->active)); + } while ((state->stop == 0 || (state->stop == 1 && state->active)) && __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0); return NULL; } @@ -1140,6 +1157,7 @@ ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyCon } ncclIpcHdr hdr; + memset(&hdr, '\0', sizeof(hdr)); hdr.type = type; hdr.rank = rank; hdr.reqSize = reqSize; @@ -1323,9 +1341,12 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { pthread_mutexattr_init(&mutexAttr); pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); pthread_mutex_init(&pool->mutex, &mutexAttr); + pthread_mutexattr_destroy(&mutexAttr); pthread_condattr_t condAttr; + pthread_condattr_init(&condAttr); pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&pool->cond, &condAttr); + pthread_condattr_destroy(&condAttr); state->opsPool = pool; memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 3e4e9a5..3eafe1b 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -4,8 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! -#include #include #include @@ -26,26 +24,26 @@ #define STR2(v) #v #define STR(v) STR2(v) -// The RAS client listening socket of this RAS thread (normally port 28028). -int rasClientListeningSocket = -1; - -// Auxiliary structure used when processing the results. Helps with statistics gathering and sorting. +// Generic auxiliary structure used when processing the results. Helps with statistics gathering and sorting, +// e.g., for the calculation of the distribution of the number of peers per node, of the number of GPUs per peer, +// of the communicator sizes, or of the counts of collective operations. struct rasValCount { uint64_t value; // The observed value. int count; // The number of occurences of this value in the results. int firstIdx; // The index of the first occurence of this value in the results. }; -// Used in rasAuxComm below. The values are bitmasks so that they can be combined. 
+// Communicator status, used in rasAuxComm below. The values are bitmasks so that they can be combined. typedef enum { - RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator. + RAS_ACS_NOCOMM = 1, // Set if the peer claims not to be a member of a given communicator. RAS_ACS_INIT = 2, RAS_ACS_RUNNING = 4, RAS_ACS_FINALIZE = 8, RAS_ACS_ABORT = 16 } rasACStatus; -// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK). +// Communicator errors, used in rasAuxComm below. The values are bitmasks so that they can be combined (with the +// exception of RAS_ACE_OK). typedef enum { RAS_ACE_OK = 0, RAS_ACE_MISMATCH = 1, @@ -53,22 +51,45 @@ typedef enum { RAS_ACE_INCOMPLETE = 4 } rasACError; -// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics -// on the number of peers and nodes for a communicator. +// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query. For each communicator, caches +// statistics extracted from the results, such as the number of peers and nodes or the communicator status. Includes +// a pointer to the communicator data in the results, making it easy to sort the communicators by a different key +// without altering the results buffer, or just to iterate over the communicators, given that the communicator data +// in the results is of variable length. struct rasAuxComm { - struct rasCollComms::comm* comm; + struct rasCollComms::comm* comm; // Points to the results buffer. int nPeers; int nNodes; int ranksPerNodeMin; int ranksPerNodeMax; unsigned int status; // Bitmask of rasACStatus values. unsigned int errors; // Bitmask of rasACError values. - uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against. + uint64_t firstCollOpCounts[NCCL_NUM_FUNCTIONS]; // collOpCounts of the first rank, to compare against. + int nIncompleteRanks; // Number of ranks that we didn't get any response from. }; +// Auxiliary structure used when processing the rasPeerInfo data stored in the global rasPeers array. Makes it possible +// to extract a subset of peers (e.g., the dead ones) and to sort by a different key without altering the original array, +// and it also has room for extracted temporary data such as the number of peers per node or the number of GPUs per peer. +struct rasAuxPeerInfo { + struct rasPeerInfo* peer; // Points to an element in rasPeers. + int value; +}; + +// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query, specifically when iterating over +// each communicator's ranks. Makes it possible to sort by a different key without altering the original array, and +// it also has room for extracted temporary data such as the rank's status or a count of collective operations. +struct rasAuxCommRank { + struct rasCollComms::comm::rank* rank; // Points to the results buffer. + uint64_t value; +}; + +// The RAS client listening socket of this RAS thread (normally port 28028). +int rasClientListeningSocket = -1; + // Connected RAS clients. -struct rasClient* rasClients; -int nRasClients; +struct rasClient* rasClientsHead; +struct rasClient* rasClientsTail; // Minimum byte count to increment the output buffer size by if it's too small.
#define RAS_OUT_INCREMENT 4096 @@ -85,6 +106,7 @@ static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS // Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). + static ncclResult_t getNewClientEntry(struct rasClient** pClient); static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen); static void rasClientTerminate(struct rasClient* client); @@ -101,15 +123,13 @@ static void rasOutExtract(char* buffer); static int rasOutLength(); static void rasOutReset(); -static int rasPeersNGpuCompare(const void* e1, const void* e2); -static int rasPeersNProcsCompare(const void* e1, const void* e2); -static int rasPeersHostPidCompare(const void* e1, const void* e2); +static int rasAuxPeersValueCompare(const void* e1, const void* e2); static int ncclSocketsHostCompare(const void* p1, const void* p2); static int rasValCountsCompareRev(const void* p1, const void* p2); static int rasAuxCommsCompareRev(const void* p1, const void* p2); -static int rasCommRanksPeerCompare(const void* p1, const void* p2); -static int rasCommRanksCollOpCompare(const void* p1, const void* p2); +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2); +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size); static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size); static const char* ncclErrorToString(ncclResult_t err); static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size); @@ -181,21 +201,20 @@ fail: // Returns the index of the first available entry in the rasClients array, enlarging the array if necessary. static ncclResult_t getNewClientEntry(struct rasClient** pClient) { struct rasClient* client; - int i; - for (i = 0; i < nRasClients; i++) - if (rasClients[i].status == RAS_CLIENT_CLOSED) - break; - if (i == nRasClients) { - NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT)); - nRasClients += RAS_INCREMENT; - } - client = rasClients+i; - memset(client, '\0', sizeof(*client)); + NCCLCHECK(ncclCalloc(&client, 1)); + client->sock = client->pfd = -1; ncclIntruQueueConstruct(&client->sendQ); client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT; - client->collIdx = -1; + + if (rasClientsHead) { + rasClientsTail->next = client; + client->prev = rasClientsTail; + rasClientsTail = client; + } else { + rasClientsHead = rasClientsTail = client; + } *pClient = client; return ncclSuccess; @@ -219,22 +238,32 @@ static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgL struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); meta->offset = 0; meta->length = msgLen; - ncclIntruQueueEnqueue(&client->sendQ, meta); - assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED); - rasPfds[client->pfd].events |= POLLOUT; + if (client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED) { + ncclIntruQueueEnqueue(&client->sendQ, meta); + rasPfds[client->pfd].events |= POLLOUT; + } else { + INFO(NCCL_RAS, "RAS invalid client status %d -- internal error?", client->status); + } } // Terminates a connection with a RAS client. 
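+// Unlinks the client from the rasClients list and frees it, along with any messages still queued in its sendQ.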
static void rasClientTerminate(struct rasClient* client) { (void)close(client->sock); - client->sock = -1; - client->status = RAS_CLIENT_CLOSED; rasPfds[client->pfd].fd = -1; rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; - client->pfd = -1; while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { free(meta); } + + if (client == rasClientsHead) + rasClientsHead = rasClientsHead->next; + if (client == rasClientsTail) + rasClientsTail = rasClientsTail->prev; + if (client->prev) + client->prev->next = client->next; + if (client->next) + client->next->prev = client->prev; + free(client); } @@ -245,16 +274,12 @@ static void rasClientTerminate(struct rasClient* client) { // Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and // reinvokes rasClientRun. ncclResult_t rasClientResume(struct rasCollective* coll) { - int collIdx = coll-rasCollectives; - int i; - struct rasClient* client = nullptr; - for (i = 0; i < nRasClients; i++) { - client = rasClients+i; - if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + struct rasClient* client; + + for (client = rasClientsHead; client; client = client->next) + if (client->coll == coll) break; - } - } - if (i == nRasClients) { + if (client == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching client!"); rasCollFree(coll); goto exit; @@ -266,8 +291,7 @@ exit: } // Handles a ready client FD from the main event loop. -void rasClientEventLoop(int clientIdx, int pollIdx) { - struct rasClient* client = rasClients+clientIdx; +void rasClientEventLoop(struct rasClient* client, int pollIdx) { bool closed = false; if (client->status == RAS_CLIENT_CONNECTED) { @@ -431,7 +455,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_CONNS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); #endif client->status = RAS_CLIENT_COMMS; @@ -440,7 +463,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_COMMS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); client->status = RAS_CLIENT_FINISHED; break; @@ -459,7 +481,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasPeerInfo* peersReSorted = nullptr; + struct rasAuxPeerInfo* auxRasPeers = nullptr; int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; int firstIdx, nPeers; @@ -467,6 +489,8 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { int nValCounts; static int cudaDriver = -1, cudaRuntime = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting"); + rasOutReset(); rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); @@ -481,7 +505,6 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; - rasOutReset(); totalGpus = totalNodes = 0; firstNGpusNode = 0; // #GPUs on the first peer of a node. firstNGpusGlobal = 0; // #GPUs on peerIdx 0. @@ -489,7 +512,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. 
consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. nPeers = 0; // #peers on a node. - firstNPeersGlobal = 0; + firstNPeersGlobal = 0; // #peers on the first node. for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); totalGpus += nGpus; @@ -522,6 +545,11 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } } // for (peerIdx) + TRACE(NCCL_RAS, "RAS: totalNodes %d, nRasPeers %d, totalGpus %d", totalNodes, nRasPeers, totalGpus); + TRACE(NCCL_RAS, "RAS: consistentNPeersGlobal %d, consistentNGpusGlobal %d, consistentNGpusNode %d", + consistentNPeersGlobal, consistentNGpusGlobal, consistentNGpusNode); + TRACE(NCCL_RAS, "RAS: firstNPeersGlobal %d, firstNGpusGlobal %d", firstNPeersGlobal, firstNGpusGlobal); + rasOutAppend("Job summary\n" "===========\n\n"); @@ -532,22 +560,24 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); } else { // Gather the stats on the number of processes per node. However, that number is not a property of a peer, - // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively - // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // but of a group of peers, so calculating it is more involved. We store the value in a temporary auxRasPeers + // array. + NCCLCHECKGOTO(ncclCalloc(&auxRasPeers, nRasPeers), ret, fail); firstIdx = 0; nPeers = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].peer = rasPeers+peerIdx; if (peerIdx == 0) { nPeers = 1; firstIdx = 0; } else { // peerIdx > 0 - if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + if (!ncclSocketsSameNode(&auxRasPeers[peerIdx].peer->addr, &auxRasPeers[peerIdx-1].peer->addr)) { + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < peerIdx; i++) { // Go back and update the number of processes of all the elements of that node. - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } nPeers = 1; firstIdx = peerIdx; @@ -557,21 +587,23 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } // peerIdx > 0 if (peerIdx == nRasPeers-1) { // Last iteration of the loop. + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < nRasPeers; i++) { - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } } } // for (peerIdx) - // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // Re-sort it now using the number of processes on the node (value) as the primary key, host IP as the // secondary, and process id as the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of peers per node. 
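+      // After this sort, peers from nodes with the same peer count form adjacent runs, so one pass can count them.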
nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers;) { - if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { - valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; @@ -579,14 +611,15 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { valCounts[nValCounts-1].count++; } // Advance peerIdx to the next node. - peerIdx += peersReSorted[peerIdx].cudaDevs; - } + peerIdx += auxRasPeers[peerIdx].value; + } // for (peerIdx) // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent // number of peers first). qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); // Print it out, the most frequent peer counts first. if (consistentNGpusNode && consistentNGpusGlobal) { + // consistentNPeersGlobal must be false rasOutAppend(" Nodes Processes GPUs\n" " per node per process\n"); for (int i = 0; i < nValCounts; i++) { @@ -594,7 +627,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend("%7d %9ld %11d\n", vc->count, vc->value, firstNGpusGlobal); } - } else { + } else { // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend(" Nodes Processes\n" " per node\n"); for (int i = 0; i < nValCounts; i++) { @@ -606,24 +639,29 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). - // Sort peers by the GPU count, to simplify data extraction. - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // Sort peers by the GPU count, to simplify data extraction. Not sure how fast __builtin_popcountll is so we + // may just as well cache it... + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].value = __builtin_popcountll(auxRasPeers[peerIdx].peer->cudaDevs); + TRACE(NCCL_RAS, "RAS: node %s pid %d: nGpus %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), + auxRasPeers[peerIdx].peer->pid, auxRasPeers[peerIdx].value); + } // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of GPUs per peer. nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { - if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != - __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { - valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; } else { valCounts[nValCounts-1].count++; } - } + } // for (peerIdx) // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent // GPU counts first). 
qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); @@ -637,7 +675,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend(" %9d %11ld\n", vc->count, vc->value); } - } + } // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend("\n" " Nodes Processes GPUs\n" "(total) (total) (total)\n" @@ -652,16 +690,16 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // provided that they meet our definition of an outlier. if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); - // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // auxRasPeers is sorted by the node IP address (not port!) as the secondary key and the pid as // the tertiary, which comes in handy when printing... for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { lineBuf[0] = '\0'; for (int j = 0; j < vc->value; j++) { snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (j > 0 ? "," : ""), peersReSorted[j].pid); + (j > 0 ? "," : ""), auxRasPeers[j].peer->pid); } rasOutAppend(" Node %s running process%s %s\n", - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), (vc->value > 1 ? "es" : ""), lineBuf); } // for (peerIdx) } // if (rasCountIsOutlier(vc->count)) @@ -678,13 +716,12 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_CONNS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; // We need to wait for async. responses. 
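+    // Once the responses arrive, rasClientResume() re-invokes rasClientRun() to continue processing.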
} @@ -696,18 +733,18 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_COMMS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; } + TRACE(NCCL_RAS, "RAS: rasClientRunInit: scheduling RAS_COLL_COMMS and finishing"); exit: - free(peersReSorted); + free(auxRasPeers); return ret; fail: goto exit; @@ -721,13 +758,16 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollective* coll = client->coll; struct rasCollConns* connsData = (struct rasCollConns*)coll->data; int expected; struct rasPeerInfo* peersBuf = nullptr; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9); @@ -822,13 +862,12 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_COMMS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; } @@ -847,10 +886,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollective* coll = client->coll; struct rasCollComms* commsData = (struct rasCollComms*)coll->data; struct rasCollComms::comm* comm; - struct rasCollComms::comm::rank* ranksReSorted = nullptr; + struct rasAuxCommRank* auxCommRanks = nullptr; struct rasValCount* valCounts = nullptr; int nValCounts; struct rasValCount* collOpCounts = nullptr; @@ -860,7 +899,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { int vcIdx; int nPeersMissing; uint64_t* peerNvmlDevs = nullptr; - const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" }; + const char*const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" }; const char*const errorStr[] = { // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer. 
"OK", @@ -873,14 +912,22 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { "INCOMPLETE,ERROR,MISMATCH" }; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunComms: starting"); + TRACE(NCCL_RAS, "RAS: coll nLegTimeouts %d, nPeers %d, nData %d; commsData nComms %d", + coll->nLegTimeouts, coll->nPeers, coll->nData, commsData->nComms); + + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); // Calculate the number of missing peers early as we rely on it for other things. nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + TRACE(NCCL_RAS, "RAS: nRasPeers %d, nRasDeadPeers %d, nPeersMissing %d", nRasPeers, nRasDeadPeers, nPeersMissing); // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort @@ -896,12 +943,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComms[commIdx].comm = comm; comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); } - NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxCommRanks, maxCommSize), ret, fail); + TRACE(NCCL_RAS, "RAS: maxCommSize %d", maxCommSize); // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); - for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) { peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + TRACE(NCCL_RAS, "RAS: coll peers[%d] -> rasPeers[%d]", peerIdx, peerIdxConv[peerIdx]); + } // Sort coll->peers to match the ordering of rasPeers -- we may need it later... qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); @@ -910,42 +960,75 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { struct rasAuxComm* auxComm = auxComms+commIdx; int nRanks = 0; comm = auxComm->comm; + TRACE(NCCL_RAS, "RAS: coll comms[%d]: commId (0x%lx, 0x%lx, 0x%lx), commNRanks %d, nRanks %d, nMissingRanks %d", + commIdx, comm->commId.commHash, comm->commId.hostHash, comm->commId.pidHash, + comm->commNRanks, comm->nRanks, comm->nMissingRanks); - if (comm->commNRanks > comm->nRanks) { + if (comm->nMissingRanks > 0) { // There are two possibilities here. Either we are missing the data on some ranks because the processes are // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which - // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we - // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. - // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this - // as an INCOMPLETE error; otherwise as a MISMATCH warning. - if (nPeersMissing > 0 || nRasDeadPeers > 0) - auxComm->errors |= RAS_ACE_INCOMPLETE; - else { + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). 
+ if (nPeersMissing == 0 && nRasDeadPeers == 0) { + // We received data from _all_ processes. That's an easy case. auxComm->errors |= RAS_ACE_MISMATCH; - auxComm->status |= RAS_ACS_UNKNOWN; - } - } + auxComm->status |= RAS_ACS_NOCOMM; + } else { + // We failed to receive data from some processes but we don't know if that's why we don't have the info about + // some ranks of this communicator. We need to check all the missing ranks one-by-one as different ranks may + // have different reasons. struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted - // by process _and_ node, which makes counting easy. - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) - ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + void* found; + if ((found = bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) != nullptr) { + // We did receive the data from that process, but not about this communicator. + auxComm->errors |= RAS_ACE_MISMATCH; + auxComm->status |= RAS_ACS_NOCOMM; + } else { + // We failed to receive data from that process. + auxComm->errors |= RAS_ACE_INCOMPLETE; + auxComm->nIncompleteRanks++; + } + TRACE(NCCL_RAS, "RAS: comm missingRank[%d] commRank %d, addr %td (-> %d), cudaDev %d, nvmlDev %d", + rankIdx, missingRank->commRank, (found ? ((union ncclSocketAddress*)found) - coll->peers: -1), + rasPeerFind(&missingRank->addr), missingRank->cudaDev, missingRank->nvmlDev); + } // for (rankIdx) + } // nPeersMissing > 0 || nRasDeadPeers > 0 + } // if (comm->nMissingRanks > 0) + + // Initialize auxCommRanks from comm->ranks, converting peerIdx to rasPeers, then sort by it -- that way we will + // have the ranks sorted by node and process, which makes counting easy. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + auxCommRanks[rankIdx].rank = rank; + auxCommRanks[rankIdx].value = peerIdxConv[rank->peerIdx]; + TRACE(NCCL_RAS, "RAS: comm rank[%d] commRank %d, peerIdx %d (-> %d), cudaDev %d, nvmlDev %d", + rankIdx, rank->commRank, rank->peerIdx, peerIdxConv[rank->peerIdx], rank->cudaDev, rank->nvmlDev); + TRACE(NCCL_RAS, "RAS: comm rank[%d] collOpCounts (%ld, %ld, %ld, %ld, %ld)", + rankIdx, rank->collOpCounts[0], rank->collOpCounts[1], rank->collOpCounts[2], rank->collOpCounts[3], + rank->collOpCounts[4]); + TRACE(NCCL_RAS, "RAS: comm rank[%d] status initState %d, asyncError %d, finalizeCalled %d, destroyFlag %d, " + "abortFlag %d", rankIdx, rank->status.initState, rank->status.asyncError, rank->status.finalizeCalled, + rank->status.destroyFlag, rank->status.abortFlag); + } + // This also sorts by the commRank, which we don't care about here, but it won't hurt. + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Count the peers and nodes, get the status/error indicators.
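+    // This relies on the sort order established above: a change between adjacent entries marks a new process or node.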
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; if (rankIdx == 0) { auxComm->nPeers = auxComm->nNodes = 1; auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; auxComm->ranksPerNodeMax = 0; - auxComm->firstCollOpCount = rank->collOpCount; + memcpy(auxComm->firstCollOpCounts, auxRank->rank->collOpCounts, sizeof(auxComm->firstCollOpCounts)); nRanks = 1; } else { // rankIdx > 0 - if (rank->peerIdx != rank[-1].peerIdx) { + if (auxRank->value != auxRank[-1].value) { auxComm->nPeers++; - if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + if (!ncclSocketsSameNode(&rasPeers[auxRank->value].addr, &rasPeers[auxRank[-1].value].addr)) { auxComm->nNodes++; if (auxComm->ranksPerNodeMin > nRanks) auxComm->ranksPerNodeMin = nRanks; @@ -953,7 +1036,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; nRanks = 0; } - } // if (rank->peerIdx != rank[-1].peerIdx) + } // if (auxRank->value != auxRank[-1].value) nRanks++; } // rankIdx > 0 if (rankIdx == comm->nRanks-1) { @@ -964,25 +1047,27 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; } - if (rank->status.abortFlag) + if (auxRank->rank->status.abortFlag) auxComm->status |= RAS_ACS_ABORT; - else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + else if (auxRank->rank->status.finalizeCalled || auxRank->rank->status.destroyFlag) { // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly // as a finalize state indicator (and ignore it in case of ncclCommAbort). auxComm->status |= RAS_ACS_FINALIZE; } - else if (rank->status.initState == ncclSuccess) + else if (auxRank->rank->status.initState == ncclSuccess) auxComm->status |= RAS_ACS_RUNNING; - else // rank->initState != ncclSuccess + else // auxRank->rank->initState != ncclSuccess auxComm->status |= RAS_ACS_INIT; - if (rank->collOpCount != auxComm->firstCollOpCount) - auxComm->errors |= RAS_ACE_MISMATCH; - if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS && !(auxComm->errors & RAS_ACE_MISMATCH); collIdx++) { + if (auxRank->rank->collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) + auxComm->errors |= RAS_ACE_MISMATCH; + } + if (auxRank->rank->status.initState != ncclSuccess && auxRank->rank->status.initState != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; - if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + if (auxRank->rank->status.asyncError != ncclSuccess && auxRank->rank->status.asyncError != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; } // for (rankIdx) @@ -990,9 +1075,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // We've got a status mismatch between ranks. 
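+      // (More than one RAS_ACS_* bit set in status means the ranks disagree about the communicator's state.)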
auxComm->errors |= RAS_ACE_MISMATCH; } + TRACE(NCCL_RAS, "RAS: auxComm nPeers %d, nNodes %d, nIncompleteRanks %d", + auxComm->nPeers, auxComm->nNodes, auxComm->nIncompleteRanks); + TRACE(NCCL_RAS, "RAS: auxComm ranksPerNodeMin %d, ranksPerNodeMax %d, status 0x%x, errors 0x%x", + auxComm->ranksPerNodeMin, auxComm->ranksPerNodeMax, auxComm->status, auxComm->errors); } // for (commIdx) // Sort it by size/nNodes/status/errors/missing ranks. - qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + if (auxComms) + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); // Calculate the distribution of different communicator sizes. NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); @@ -1014,10 +1104,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } } - rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" - " # in group per comm per node per comm in group\n"); - if (commsData->nComms == 0) + TRACE(NCCL_RAS, "RAS: rasClientRunComms: done with initial data processing"); + + if (commsData->nComms > 0) { + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + } else { rasOutAppend("No communicator data collected!\n"); + } // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); @@ -1058,6 +1152,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // status (which is a bitmask) into an array index. statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); } + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("\nErrors\n" "======\n\n"); @@ -1068,12 +1167,12 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (rasCountIsOutlier(nPeersMissing, client->verbose)) { // Extract a list of missing peers. We don't want to print it right away because it would be sorted // by address (including port, which isn't meaningful to end users). - struct rasPeerInfo* peersBuf = nullptr; + struct rasAuxPeerInfo* auxPeersBuf = nullptr; int nPeersBuf; // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing // them much easier. - NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nPeersMissing), ret, fail); nPeersBuf = 0; for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { int cmp; @@ -1088,30 +1187,42 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } else if (cmp < 0) { // Process missing from coll->peers. Don't report dead ones though, as they are not included // in nPeersMissing and are reported separately below. 
- if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { - assert(nPeersBuf < nPeersMissing); - memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + bool dead; + if (!(dead = rasPeerIsDead(&rasPeers[rasPeerIdx].addr))) { + if (nPeersBuf < nPeersMissing) { + auxPeersBuf[nPeersBuf++].peer = rasPeers+rasPeerIdx; + } else { + INFO(NCCL_RAS, "RAS overflow of auxPeersBuf: nPeersBuf %d, rasPeerIdx %d (%s), collPeerIdx %d -- " + "internal error?", + nPeersBuf, rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), collPeerIdx); + } } + TRACE(NCCL_RAS, "RAS rasPeerIdx %d (%s) is missing from coll->peers; dead %d", + rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), dead); rasPeerIdx++; } else { // cmp > 0 // Process not found in rasPeers -- shouldn't happen, unless during a race? + INFO(NCCL_RAS, "RAS failed to find coll->peer[%d] (%s) in rasPeers -- internal error?", + collPeerIdx, ncclSocketToString(coll->peers+collPeerIdx, rasLine)); collPeerIdx++; } // cmp > 0 } // for (rasPeerIdx, collPeerIdx) - // Sort the output by host and pid. - qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + // Sort the output by host and pid. rasAuxPeersValueCompare uses value as the primary key, which is 0 for + // all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, - ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } if (nPeersBuf != nPeersMissing) rasOutAppend(" [could not find information on %d process%s]\n", nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? "es" : "")); - free(peersBuf); + free(auxPeersBuf); } // if (rasCountIsOutlier(nPeersMissing)) rasOutAppend("\n"); } @@ -1121,31 +1232,35 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, (nRasDeadPeers > 1 ? "es are" : " is")); if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { - struct rasPeerInfo* peersReSorted = nullptr; - int nPeersReSorted = 0; - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + // rasDeadPeers contains only addresses, whereas we want a complete rasPeerInfo, and sorted differently. + struct rasAuxPeerInfo* auxPeersBuf = nullptr; + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nRasDeadPeers), ret, fail); for (int i = 0; i < nRasDeadPeers; i++) { int peerIdx = rasPeerFind(rasDeadPeers+i); if (peerIdx != -1) - memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + auxPeersBuf[nPeersBuf++].peer = rasPeers+peerIdx; } - // Sort the output by host and pid, not host and port. 
- qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); - for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + // Sort the output by host and pid, not host and port. rasAuxPeersValueCompare uses value as the primary key, + // which is 0 for all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } - if (nPeersReSorted != nRasDeadPeers) + if (nPeersBuf != nRasDeadPeers) rasOutAppend(" [could not find information on %d process%s]\n", - nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); - free(peersReSorted); + nRasDeadPeers-nPeersBuf, (nRasDeadPeers-nPeersBuf > 1 ? "es" : "")); + free(auxPeersBuf); } // if (rasCountIsOutlier(nRasDeadPeers) rasOutAppend("\n"); } + // Continue printing the largest communicators first, as in the summary table. for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc; vc = valCounts+vcIdx; @@ -1154,23 +1269,28 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { comm = auxComm->comm; if (auxComm->errors & RAS_ACE_INCOMPLETE) { - int nRanksMissing = comm->commNRanks - comm->nRanks; rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, - comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); - if (rasCountIsOutlier(nRanksMissing, client->verbose)) { - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + comm->commId.commHash, auxComm->nIncompleteRanks, (auxComm->nIncompleteRanks > 1 ? "s" : "")); + if (rasCountIsOutlier(auxComm->nIncompleteRanks, client->verbose)) { + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that provided a response but not for this communicator. 
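+          // (coll->peers was sorted with ncclSocketsCompare earlier, which is what makes the bsearch below valid.)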
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), ncclSocketsCompare) == + nullptr) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } + } // if rank did not respond + } // for (rankIdx) } // if (rasCountIsOutlier(nRanksMissing)) rasOutAppend("\n"); } // if (auxComm->errors & RAS_ACE_INCOMPLETE) @@ -1178,7 +1298,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (auxComm->errors & RAS_ACE_ERROR) { int ncclErrors[ncclNumResults]; int nErrors; - rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); memset(ncclErrors, '\0', sizeof(ncclErrors)); for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) @@ -1203,6 +1323,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } // if (auxComm->errors & RAS_ACE_ERROR) } // for (commIdx) } // for (vcIdx) + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("Warnings\n" "========\n\n"); @@ -1213,15 +1338,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); } + // Continue printing the largest communicators first, as in the summary table. for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc = valCounts+vcIdx; for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { - bool inconsistent; struct rasAuxComm* auxComm = auxComms+commIdx; comm = auxComm->comm; if (auxComm->errors & RAS_ACE_MISMATCH) { - rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); if (collOpCounts == nullptr) { // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts @@ -1234,28 +1359,31 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" Communicator ranks have different status\n"); // We need to sort the ranks by status. However, status is normally calculated from other fields. - // We will copy the ranks and reuse collOpCount to store it. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // We will store it in the auxCommRanks' value. 
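+          // The RAS_ACS_* bits double as sort keys here, so the qsort below groups ranks with equal status together.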
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; if (rank->status.abortFlag) - rank->collOpCount = RAS_ACS_ABORT; + auxRank->value = RAS_ACS_ABORT; else if (rank->status.finalizeCalled || rank->status.destroyFlag) - rank->collOpCount = RAS_ACS_FINALIZE; + auxRank->value = RAS_ACS_FINALIZE; else if (rank->status.initState == ncclSuccess) - rank->collOpCount = RAS_ACS_RUNNING; + auxRank->value = RAS_ACS_RUNNING; else - rank->collOpCount = RAS_ACS_INIT; + auxRank->value = RAS_ACS_INIT; } - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Calculate the frequency of different status values. int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the - // status (which is a bitmask) into an array index. - collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + // status (which is a bitmask) into an array index. The argument is an unsigned int (there is no + // 64-bit version seemingly, but we don't actually need one here). + collOpCounts[nCollOpCounts].value = + (sizeof(unsigned int)*8-1) - __builtin_clz((unsigned int)auxCommRanks[rankIdx].value); collOpCounts[nCollOpCounts].count = 1; collOpCounts[nCollOpCounts].firstIdx = rankIdx; nCollOpCounts++; @@ -1263,11 +1391,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { collOpCounts[nCollOpCounts-1].count++; } } - if (comm->nRanks < comm->commNRanks) { - // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" - // string at index 0. - collOpCounts[nCollOpCounts].value = 0; - collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + if (comm->nMissingRanks - auxComm->nIncompleteRanks > 0) { + // Add a "fake" element corresponding to the NOCOMM entries, since they are not in the ranks array. + collOpCounts[nCollOpCounts].value = 0; // The index of "NOCOMM" in statusStr. + collOpCounts[nCollOpCounts].count = comm->nMissingRanks - auxComm->nIncompleteRanks; collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. nCollOpCounts++; } @@ -1280,114 +1407,159 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { if (vcc->firstIdx != -1) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
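+                // (vcc->value is the bit index computed above, e.g., RAS_ACS_RUNNING (bit 2) maps to statusStr[2], "RUNNING".)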
for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; if (peerIdx != -1) { if (vcc->count > 1) rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); else rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value], - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value], + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); } else { // peerIdx == -1 if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); else rasOutAppend(" Rank %d has status %s -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value]); } // peerIdx == -1 } // for (rankIdx) } else { - // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - if (vcc->count > 1) { - rasOutAppend(" The unknown ranks: %s\n", lineBuf); - } else { - rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); - } - } + // NOCOMM ranks are in a different array. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks + + comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that did not respond at all. 
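+              // (A missing rank whose address appears in coll->peers did respond to this query: it is alive
+              // but no longer a member of this communicator, so it belongs in this NOCOMM listing. Ranks
+              // absent from coll->peers never responded and are reported in the RAS_ACE_INCOMPLETE section
+              // above instead.)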
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, statusStr[vcc->value], + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } else { + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + missingRank->commRank, statusStr[vcc->value]); + } + } // peerIdx == -1 + } // if rank responded + } // for (rankIdx) + } // vcc->firstIdx == -1 } // if (rasCountIsOutlier(vcc->count)) } // for (coc) } // if (__builtin_popcount(auxComm->status) > 1) - inconsistent = false; - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { - inconsistent = true; - break; - } - } - if (inconsistent) { - rasOutAppend(" Communicator ranks have different collective operation counts\n"); + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS; collIdx++) { + bool inconsistent = false; - // Sort the ranks by collOpCount and rank for easy counting. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); - // Calculate the frequency of different collOpCount values. - int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { - collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; - collOpCounts[nCollOpCounts].count = 1; - collOpCounts[nCollOpCounts].firstIdx = rankIdx; - nCollOpCounts++; - } else { - collOpCounts[nCollOpCounts-1].count++; + if (comm->ranks[rankIdx].collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) { + inconsistent = true; + break; } } - // Sort by that frequency (most frequent first). - qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); - for (int coc = 0; coc < nCollOpCounts; coc++) { - struct rasValCount* vcc = collOpCounts+coc; - if (vcc->count > 1) - rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); - if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... 
- for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - if (peerIdx != -1) { - if (vcc->count > 1) - rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, vcc->value, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - } else { // peerIdx == -1 - if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, vcc->value); - } // peerIdx == -1 - } // for (rankIdx) - } // if (rasCountIsOutlier(vcc->count)) - } // for (coc) - } // if (inconsistent) - rasOutAppend("\n"); + if (inconsistent) { + rasOutAppend(" Communicator ranks have different %s operation counts\n", ncclFuncStr[collIdx]); + + // Sort the ranks by collOpCounts[collIdx] and commRank for easy counting. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; + auxRank->value = rank->collOpCounts[collIdx]; + } + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); + // Calculate the frequency of different collOpCounts[collIdx] values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { + collOpCounts[nCollOpCounts].value = auxCommRanks[rankIdx].value; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) { + if (vcc->value > 0) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + else + rasOutAppend(" %d ranks have not launched any operations\n", vcc->count); + } + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
+            for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) {
+              int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx];
+              if (peerIdx != -1) {
+                if (vcc->count > 1) {
+                  rasOutAppend("    Rank %d -- GPU %s managed by process %d on node %s\n",
+                               auxCommRanks[rankIdx].rank->commRank,
+                               rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                               rasPeers[peerIdx].pid,
+                               ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                } else {
+                  if (vcc->value > 0) {
+                    rasOutAppend("  Rank %d has launched up to operation %ld -- GPU %s managed by process %d "
+                                 "on node %s\n", auxCommRanks[rankIdx].rank->commRank, vcc->value,
+                                 rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                                 rasPeers[peerIdx].pid,
+                                 ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                  } else {
+                    rasOutAppend("  Rank %d has not launched any operations -- GPU %s managed by process %d "
+                                 "on node %s\n", auxCommRanks[rankIdx].rank->commRank,
+                                 rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                                 rasPeers[peerIdx].pid,
+                                 ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                  }
+                }
+              } else { // peerIdx == -1
+                if (vcc->count > 1) {
+                  rasOutAppend("    Rank %d -- [process information not found]\n",
+                               auxCommRanks[rankIdx].rank->commRank);
+                } else {
+                  if (vcc->value > 0)
+                    rasOutAppend("  Rank %d has launched up to operation %ld -- [process information not found]\n",
+                                 auxCommRanks[rankIdx].rank->commRank, vcc->value);
+                  else
+                    rasOutAppend("  Rank %d has not launched any operations -- [process information not found]\n",
+                                 auxCommRanks[rankIdx].rank->commRank);
+                }
+              } // peerIdx == -1
+            } // for (rankIdx)
+          } // if (rasCountIsOutlier(vcc->count))
+        } // for (coc)
+        rasOutAppend("\n");
+      } // if (inconsistent)
+    } // for (collIdx)
       } // if (auxComm->errors & RAS_ACE_MISMATCH)
     } // for (commIdx)
   } // for (vcIdx)
@@ -1398,20 +1570,26 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) {
   rasOutExtract(msg);
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
+
+  TRACE(NCCL_RAS, "RAS: rasClientRunComms: finishing");

 exit:
   free(peerNvmlDevs);
   free(collOpCounts);
   free(valCounts);
   free(peerIdxConv);
-  free(ranksReSorted);
+  free(auxCommRanks);
   free(auxComms);
   return ret;
 fail:
   goto exit;
 }

+// Generates detailed info about encountered errors, be they initialization errors or asynchronous ones.
 static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm,
                                      const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) {
+  // Because the number of possible error kinds is finite and small, we don't bother in this case with allocating
+  // temporary data structures, counting the errors, sorting arrays, etc. Instead, in each iteration we pick the
+  // most numerous remaining error kind, iterate through the ranks in search of it, and append the matches
+  // directly to the output.
   for (;;) {
     int maxCount = 0;
     ncclResult_t maxCountIdx = ncclSuccess;
@@ -1489,17 +1667,20 @@ static void rasOutAppend(const char* format, ...) {
   }
   nRasOutBuffer += needed;
-  assert(nRasOutBuffer <= rasOutBufferSize);
+  if (nRasOutBuffer >= rasOutBufferSize)
+    nRasOutBuffer = rasOutBufferSize - 1; // Should never happen, but just to be extra sure...
 exit:
   ;
 }

 // Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'.
 // The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes
-// the terminating '\0').
+// the terminating '\0'). Resets the output buffer when done.
 static void rasOutExtract(char* buffer) {
-  if (rasOutBuffer)
+  if (rasOutBuffer) {
     memcpy(buffer, rasOutBuffer, rasOutLength());
+    rasOutReset();
+  }
 }

 // Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'.
@@ -1524,60 +1705,25 @@ exit:
 // Various sorting callbacks used when grouping/formatting data.
 //
 ///////////////////////////////////////////////////////////////////

-// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the
-// secondary key and the process id as the tertiary key.
-static int rasPeersNGpuCompare(const void* e1, const void* e2) {
-  const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1;
-  const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2;
-  int c1 = __builtin_popcountll(p1->cudaDevs);
-  int c2 = __builtin_popcountll(p2->cudaDevs);
+// Sorting callback for rasAuxPeerInfo elements. Sorts by value, with the peer's host IP as the secondary key and
+// the process id as the tertiary key.
+static int rasAuxPeersValueCompare(const void* e1, const void* e2) {
+  const struct rasAuxPeerInfo* p1 = (const struct rasAuxPeerInfo*)e1;
+  const struct rasAuxPeerInfo* p2 = (const struct rasAuxPeerInfo*)e2;

-  if (c1 == c2) {
+  if (p1->value == p2->value) {
     // Host IP address is the secondary key.
-    int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr);
+    int cmp = ncclSocketsHostCompare(&p1->peer->addr, &p2->peer->addr);
     if (cmp == 0) {
       // Process ID is the tertiary key.
-      cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0));
+      cmp = (p1->peer->pid < p2->peer->pid ? -1 : (p1->peer->pid > p2->peer->pid ? 1 : 0));
     }
     return cmp;
   } else {
-    return (c1 < c2 ? -1 : 1);
+    return (p1->value < p2->value ? -1 : 1);
   }
 }

-// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs.
-// Uses the host IP as the secondary key and the process id as the tertiary key.
-static int rasPeersNProcsCompare(const void* e1, const void* e2) {
-  const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1;
-  const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2;
-
-  if (p1->cudaDevs == p2->cudaDevs) {
-    // Host IP address is the secondary key.
-    int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr);
-    if (cmp == 0) {
-      // Process ID is the tertiary key.
-      cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0));
-    }
-    return cmp;
-  } else {
-    return (p1->cudaDevs < p2->cudaDevs ? -1 : 1);
-  }
-}
-
-// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather
-// than the port).
-static int rasPeersHostPidCompare(const void* e1, const void* e2) {
-  const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1;
-  const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2;
-
-  int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr);
-  if (cmp == 0) {
-    // Process ID is the secondary key.
-    cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0));
-  }
-  return cmp;
-}
-
 // Sorting callback for ncclSocketAddress. Unlike the ncclSocketsCompare, it ignores the port.
static int ncclSocketsHostCompare(const void* p1, const void* p2) { const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; @@ -1599,7 +1745,8 @@ static int ncclSocketsHostCompare(const void* p1, const void* p2) { cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); } else { // The only remaining valid case are empty addresses. - assert(family == 0); + if (family != 0) + INFO(NCCL_RAS, "RAS invalid address family %d -- internal error?", family); cmp = 0; // Two empty addresses are equal... } @@ -1657,24 +1804,16 @@ static int rasAuxCommsCompareRev(const void* p1, const void* p2) { } } -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx. -static int rasCommRanksPeerCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; +// Sorting callback for rasAuxCommRank elements. Sorts by value, with rank's commRank as the secondary key. +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2) { + const struct rasAuxCommRank* r1 = (const struct rasAuxCommRank*)p1; + const struct rasAuxCommRank* r2 = (const struct rasAuxCommRank*)p2; - return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0)); -} - -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key. -static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - if (r1->collOpCount == r2->collOpCount) { - // Use the rank as the secondary key. - return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0)); + if (r1->value == r2->value) { + // Use the commRank as the secondary key. + return (r1->rank->commRank < r2->rank->commRank ? -1 : (r1->rank->commRank > r2->rank->commRank ? 1 : 0)); } else { - return (r1->collOpCount < r2->collOpCount ? -1 : 1); + return (r1->value < r2->value ? -1 : 1); } } @@ -1705,14 +1844,20 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, return buf; } +// Formats a GPU string based on the CUDA/NVML ids provided. If the CUDA id is different from the NVML id, both are +// printed. +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) { + snprintf(buf, size, "%d", cudaDev); + if (cudaDev != nvmlDev) { + snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev); + } + return buf; +} + // Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are // printed. static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { - snprintf(buf, size, "%d", rank->cudaDev); - if (rank->cudaDev != rank->nvmlDev) { - snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev); - } - return buf; + return rasGpuToString(rank->cudaDev, rank->nvmlDev, buf, size); } // Converts a NCCL error result to a string. @@ -1753,3 +1898,21 @@ static bool rasCountIsOutlier(int count, bool verbose, int totalCount) { (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION); } } + +// Invoked during RAS termination to release all the allocated resources. 
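+// Closes the listening socket first (so that no new clients can connect), then frees the output buffer and
+// terminates all the remaining clients. Each client's next pointer is saved before the call, as
+// rasClientTerminate() is expected to unlink and free the element passed to it.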
+void rasClientSupportTerminate() {
+  (void)close(rasClientListeningSocket);
+  rasClientListeningSocket = -1;
+
+  free(rasOutBuffer);
+  rasOutBuffer = nullptr;
+  nRasOutBuffer = rasOutBufferSize = 0;
+
+  for (struct rasClient* client = rasClientsHead; client;) {
+    struct rasClient* clientNext = client->next;
+    rasClientTerminate(client);
+    client = clientNext;
+  }
+
+  // rasClientsHead and rasClientsTail are taken care of by rasClientTerminate().
+}
diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc
index 201144f..7283360 100644
--- a/src/ras/collectives.cc
+++ b/src/ras/collectives.cc
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#define NDEBUG // Comment out duriyng development only!
+#define NDEBUG // Comment out during development only!

 #include
 #include
@@ -12,6 +12,7 @@
 #include "checks.h"
 #include "comm.h"
 #include "nccl.h"
+#include "transport.h"
 #include "utils.h"

 #include "ras_internal.h"
@@ -32,14 +33,14 @@ static int nRasCollHistory, rasCollHistNextIdx;
 // Monotonically increased to ensure that each collective originating locally has a unique Id.
 static uint64_t rasCollLastId;

-// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require
+// Keeping track of ongoing collective operations (apart from broadcasts, which expect no response and so require
 // no such tracking).
-struct rasCollective* rasCollectives;
-static int nRasCollectives;
+struct rasCollective* rasCollectivesHead;
+struct rasCollective* rasCollectivesTail;

 static ncclResult_t getNewCollEntry(struct rasCollective** pColl);
 static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
-                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx);
+                                       const struct rasCollRequest* req, size_t reqLen, struct rasConnection* fromConn);
 static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen);
 static ncclResult_t rasCollReadyResp(struct rasCollective* coll);
 static ncclResult_t rasConnSendCollResp(struct rasConnection* conn,
@@ -47,12 +48,17 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn,
                                         const union ncclSocketAddress* peers, int nPeers,
                                         const char* data, int nData, int nLegTimeouts);

-static ncclResult_t rasCollConnsInit(char** pData, int* pNData);
+static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData);
 static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg);

-static ncclResult_t rasCollCommsInit(char** pData, int* pNData);
+static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData);
 static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg);
+static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm);
 static int ncclCommsCompare(const void* p1, const void* p2);
+static int peersHashesCompare(const void* p1, const void* p2);
+static int peersHashesSearch(const void* k, const void* e);
+static int rasCommIdCompare(const void* p1, const void* p2);
+static int rasCollCommsMissingRankSearch(const void* k, const void* e);

 ///////////////////////////////////////////////////////////////////////////////////////

@@ -62,22 +68,26 @@ static int ncclCommsCompare(const void* p1, const void* p2);
-// Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary.
+// Allocates a new rasCollective entry, appends it to the list of ongoing collectives, and returns it via pColl.
 static ncclResult_t getNewCollEntry(struct rasCollective** pColl) {
   struct rasCollective* coll;
-  int i;
-  for (i = 0; i < nRasCollectives; i++)
-    if (rasCollectives[i].type == RAS_MSG_NONE)
-      break;
-  if (i == nRasCollectives) {
-    NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT));
-    nRasCollectives += RAS_INCREMENT;
-  }
+  int nRasConns;
+
+  NCCLCHECK(ncclCalloc(&coll, 1));

-  coll = rasCollectives+i;
-  memset(coll, '\0', sizeof(*coll));
   coll->startTime = clockNano();
-  coll->fromConnIdx = -1;
+  coll->fromConn = nullptr;

+  // We are unlikely to use the whole array, but at least we won't need to realloc.
+  nRasConns = 0;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    nRasConns++;
   NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns));

+  if (rasCollectivesHead) {
+    rasCollectivesTail->next = coll;
+    coll->prev = rasCollectivesTail;
+    rasCollectivesTail = coll;
+  } else {
+    rasCollectivesHead = rasCollectivesTail = coll;
+  }
+
   *pColl = coll;
   return ncclSuccess;
 }
@@ -95,21 +105,23 @@ void rasCollReqInit(struct rasCollRequest* req) {
 // in preparation for collective response messages.
 // pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible
 // in scenarios such as a total of two peers.
-// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless
-// it's a broadcast, which require no such tracking).
+// pColl provides on return a pointer to the allocated rasCollective structure to track this collective (unless
+// it's a broadcast, which requires no such tracking).
-ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx,
-                               int fromConnIdx) {
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone,
+                               struct rasCollective** pColl, struct rasConnection* fromConn) {
   struct rasCollective* coll = nullptr;
+  struct rasCollRequest* reqMod = (struct rasCollRequest*)req;
+  size_t reqLen = 0;

   if (req->type >= RAS_COLL_CONNS) {
     // Keep track of this collective operation so that we can handle the responses appropriately.
     NCCLCHECK(getNewCollEntry(&coll));
-    if (pCollIdx)
-      *pCollIdx = coll-rasCollectives;
+    if (pColl)
+      *pColl = coll;
     memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr));
     coll->rootId = req->rootId;
     coll->type = req->type;
     coll->timeout = req->timeout;
-    coll->fromConnIdx = fromConnIdx;
+    coll->fromConn = fromConn;
     if (ncclCalloc(&coll->peers, 1) == ncclSuccess) {
       memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers));
       coll->nPeers = 1;
@@ -117,9 +129,9 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen,

     // Collective-specific initialization of accumulated data (using local data for now).
     if (req->type == RAS_COLL_CONNS)
-      (void)rasCollConnsInit(&coll->data, &coll->nData);
+      (void)rasCollConnsInit(&reqMod, &reqLen, &coll->data, &coll->nData);
     else if (req->type == RAS_COLL_COMMS)
-      (void)rasCollCommsInit(&coll->data, &coll->nData);
+      (void)rasCollCommsInit(&reqMod, &reqLen, &coll->data, &coll->nData);
   } else { // req->type < RAS_COLL_CONNS
     // Add the info to the collective message history.
     nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
@@ -131,42 +143,42 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen,
     // Collective-specific message handling.
     if (req->type == RAS_BC_DEADPEER) {
       bool done = false;
-      rasMsgHandleBCDeadPeer(req, &done);
+      rasMsgHandleBCDeadPeer(&reqMod, &reqLen, &done);
       if (done)
         goto exit;
     }
   } // req->type < RAS_COLL_CONNS

-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;

-  (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx);
-  (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx);
+  (void)rasLinkSendCollReq(&rasNextLink, coll, reqMod, reqLen, fromConn);
+  (void)rasLinkSendCollReq(&rasPrevLink, coll, reqMod, reqLen, fromConn);

   if (coll && pAllDone)
     *pAllDone = (coll->nFwdSent == coll->nFwdRecv);
 exit:
+  if (reqMod != req)
+    free(reqMod);
   return ncclSuccess;
 }

 // Sends the collective message through all connections associated with this link (with the exception of the one
 // the message came from, if any).
 static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
-                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) {
-      struct rasConnection* conn = rasConns+linkConn->connIdx;
-      if (!conn->linkFlag) {
-        // We send collective messages through fully established and operational connections only.
-        if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) {
-          if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr)
-            coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx;
-        } // if (conn->sockIdx != -1 && RAS_SOCK_READY)
-        conn->linkFlag = true;
-      } // if (!conn->linkFlag)
-    } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx)
-  } // for (i)
+                                       const struct rasCollRequest* req, size_t reqLen,
+                                       struct rasConnection* fromConn) {
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) {
+      // We send collective messages through fully established and operational connections only.
+      if (linkConn->conn->sock && linkConn->conn->sock->status == RAS_SOCK_READY &&
+          !linkConn->conn->experiencingDelays) {
+        if (rasConnSendCollReq(linkConn->conn, req, reqLen) == ncclSuccess && coll != nullptr)
+          coll->fwdConns[coll->nFwdSent++] = linkConn->conn;
+      } // linkConn->conn is fully established and operational.
+      linkConn->conn->linkFlag = true;
+    } // if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag)
+  } // for (linkConn)
   return ncclSuccess;
 }
@@ -190,8 +202,8 @@ static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct
 // in which case it can immediately send the response.
 ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) {
   bool allDone = false;
-  int collIdx = -1;
-  assert(sock->connIdx != -1);
+  struct rasCollective* coll = nullptr;
+  assert(sock->conn);

   // First check if we've already handled this request (through another connection).
   for (int i = 0; i < nRasCollHistory; i++) {
@@ -202,7 +214,7 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) {
     if (msg->collReq.type >= RAS_COLL_CONNS) {
       // Send an empty response so that the sender can account for it.
The non-empty response has already been // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); } goto exit; @@ -211,31 +223,29 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Check if we're currently handling this collective request. - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && msg->collReq.rootId == coll->rootId) { assert(msg->collReq.type == coll->type); // Send an empty response so that the sender can account for it. The non-empty response will be // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); goto exit; } // if match - } // for (i) + } // for (coll) } // if (msg->collReq.type >= RAS_COLL_CONNS) // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. - NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + NCCLCHECK(rasNetSendCollReq(&msg->collReq, &allDone, &coll, sock->conn)); if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { - assert(collIdx != -1); + assert(coll); // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer // has more than one connection so there should always be _some_ other peer to forward the request to. - NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + NCCLCHECK(rasCollReadyResp(coll)); } exit: return ncclSuccess; @@ -245,9 +255,9 @@ exit: // Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't // any peers (unlikely), the peers sent their responses (likely), or we timed out. static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { - if (coll->fromConnIdx != -1) { + if (coll->fromConn) { // For remotely-initiated collectives, send the response back. - NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + NCCLCHECK(rasConnSendCollResp(coll->fromConn, &coll->rootAddr, coll->rootId, coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); // Add the identifying info to the collective message history. @@ -302,18 +312,15 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, // the data from the response into the accumulated data. If all the responses have been accounted for, sends the // accumulated response back. 
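+// (The response is matched to its collective via the (rootAddr, rootId) pair; the matching entry in
+// coll->fwdConns is then cleared so that a connection is counted at most once, and once nFwdRecv catches
+// up with nFwdSent, the accumulated result is passed on via rasCollReadyResp().)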
ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { - int collIdx; - struct rasCollective* coll = nullptr; + struct rasCollective* coll; char line[SOCKET_NAME_MAXLEN+1]; - for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { - coll = rasCollectives+collIdx; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && msg->collResp.rootId == coll->rootId) break; } - if (collIdx == nRasCollectives) { + if (coll == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, ncclSocketToString(&sock->sock.addr, rasLine)); @@ -321,11 +328,11 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { } coll->nLegTimeouts += msg->collResp.nLegTimeouts; - assert(sock->connIdx != -1); - // Account for the received response in our collective operation tracking. + assert(sock->conn); + // Account for the received response in our collective operations tracking. for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] == sock->connIdx) { - coll->fwdConns[i] = -1; + if (coll->fwdConns[i] == sock->conn) { + coll->fwdConns[i] = nullptr; break; } } @@ -353,46 +360,53 @@ exit: // Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being // terminated. -void rasCollsPurgeConn(int connIdx) { - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE) { - char line[SOCKET_NAME_MAXLEN+1]; - if (coll->fromConnIdx == connIdx) { - INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", - ncclSocketToString(&coll->rootAddr, line), coll->rootId, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - rasCollFree(coll); - } else { - for (int j = 0; j < coll->nFwdSent; j++) { - if (coll->fwdConns[j] == connIdx) { - coll->fwdConns[j] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - if (coll->nFwdSent == coll->nFwdRecv) - (void)rasCollReadyResp(coll); - break; - } - } // for (j) - } // coll->fromConnIdx != connIdx - } // !RAS_MSG_NONE - } // for (i) +void rasCollsPurgeConn(struct rasConnection* conn) { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConn == conn) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&conn->addr, rasLine)); + rasCollFree(coll); + } else { + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == conn) { + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), coll->rootId, + coll->nFwdSent, 
coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (i) + } // coll->fromConn != conn + coll = collNext; + } // for (coll) } // Frees a rasCollective entry and any memory associated with it. void rasCollFree(struct rasCollective* coll) { + if (coll == nullptr) + return; + free(coll->fwdConns); - coll->fwdConns = nullptr; free(coll->peers); - coll->peers = nullptr; free(coll->data); - coll->data = nullptr; - coll->fromConnIdx = -1; - coll->type = RAS_MSG_NONE; + + if (coll == rasCollectivesHead) + rasCollectivesHead = rasCollectivesHead->next; + if (coll == rasCollectivesTail) + rasCollectivesTail = rasCollectivesTail->prev; + if (coll->prev) + coll->prev->next = coll->next; + if (coll->next) + coll->next->prev = coll->prev; + free(coll); } // Invoked from the main RAS thread loop to handle timeouts of the collectives. @@ -407,64 +421,64 @@ void rasCollFree(struct rasCollective* coll) { // and send back whatever we have. Unfortunately, the peer that the RAS client is connected to will in all likelihood // time out first, so at that point any delayed responses that eventually arrive are likely to be too late... void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { - struct rasCollective* coll = rasCollectives+collIdx; - if (coll->type == RAS_MSG_NONE || coll->timeout == 0) - continue; - - if (now - coll->startTime > coll->timeout) { - // We've exceeded the leg timeout. For all outstanding responses, check their connections. - if (!coll->timeoutWarned) { - INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", - ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, - (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->timeoutWarned = true; - } - for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] != -1) { - struct rasConnection* conn = rasConns+coll->fwdConns[i]; - char line[SOCKET_NAME_MAXLEN+1]; - if (!conn->experiencingDelays && conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - // Ensure that the connection is fully established and operational, and that the socket hasn't been - // re-created during the handling of the collective (which would suggest that the request may have been - // lost). - if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) - continue; - } - // In all other cases we declare a timeout so that we can (hopefully) recover. - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - coll->fwdConns[i] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - } // if (coll->fwdConns[i] != -1) - } // for (i) - if (coll->nFwdSent == coll->nFwdRecv) { - (void)rasCollReadyResp(coll); - } else { - // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they - // must be due to delays at other processes. Presumably those processes will give up waiting soon and the - // (incomplete) responses will arrive shortly, so we should wait a little longer. - if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { - // We've exceeded even the longer timeout, which is unexpected. 
Try to return whatever we have (though - // the originator of the collective, if it's not us, may have timed out already anyway). - INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + if (coll->timeout > 0) { + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; - coll->nFwdRecv = coll->nFwdSent; + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i]) { + struct rasConnection* conn = coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sock) { + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (conn->sock->status == RAS_SOCK_READY && conn->sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i]) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { (void)rasCollReadyResp(coll); } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); - } - } // conn->nFwdRecv < conn->nFwdSent - } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); - } - } // for (collIdx) + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). 
+            INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses",
+                 ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId,
+                 (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv);
+            coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv;
+            coll->nFwdRecv = coll->nFwdSent;
+            (void)rasCollReadyResp(coll);
+          } else {
+            *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT);
+          }
+        } // coll->nFwdRecv < coll->nFwdSent
+      } else {
+        *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout);
+      }
+    } // if (coll->timeout > 0)
+
+    coll = collNext;
+  } // for (coll)
 }
@@ -476,15 +490,16 @@ void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) {
 // For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well
 // as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen,
 // but the system clocks may not be perfectly in sync).
-static ncclResult_t rasCollConnsInit(char** pData, int* pNData) {
+static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) {
   struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN};
   struct rasCollConns* pConnsData;

+  *pReqLen = rasCollDataLength(RAS_COLL_CONNS);
+
   // Update the statistical data first and in the process also calculate how much connection-specific space we
   // will need.
-  for (int i = 0; i < nRasConns; i++) {
-    struct rasConnection* conn = rasConns+i;
-    if (conn->inUse && conn->travelTimeCount > 0) {
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) {
+    if (conn->travelTimeCount > 0) {
       if (connsData.travelTimeMin > conn->travelTimeMin)
         connsData.travelTimeMin = conn->travelTimeMin;
       if (connsData.travelTimeMax < conn->travelTimeMax)
@@ -502,9 +517,9 @@ static ncclResult_t rasCollConnsInit(char** pData, int* pNData) {
   pConnsData = (struct rasCollConns*)*pData;
   memcpy(pConnsData, &connsData, sizeof(*pConnsData));
   if (connsData.nNegativeMins > 0) {
-    for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) {
-      struct rasConnection* conn = rasConns+i;
-      if (conn->inUse && conn->travelTimeMin < 0) {
+    int negMinsIdx = 0;
+    for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) {
+      if (conn->travelTimeMin < 0) {
         struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx;
         memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source));
         memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest));
@@ -560,10 +575,26 @@ static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg*
 // Initializes the accumulated data with just the local data for now.
 // For this particular collective, we keep for every communicator information about every rank, to help identify
 // the missing ones and the discrepancies between the ones that did respond.
-static ncclResult_t rasCollCommsInit(char** pData, int* pNData) {
+// For any new (previously unseen) communicator we also save the basic identification data about every rank that is
+// "missing" (i.e., not part of this process). During merging, this should be replaced by the actual data from
+// those ranks, if they are responsive. We want to provide this information to the user (so that we can say more
+// than "rank xyz missing").
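+// (Illustratively: if this process hosts only ranks 0 and 1 of a 4-rank communicator, its response carries
+// full rank entries for 0 and 1 plus rasCollCommsMissingRank records -- address and CUDA/NVML device ids --
+// for ranks 2 and 3, which peers that do host those ranks will overwrite during merging.)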
+// Every "new" communicator is also recorded in the (updated) request, so that when that request is forwarded to our
+// peers, those peers don't needlessly send us the same data.
+static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) {
+  ncclResult_t ret = ncclSuccess;
   struct rasCollComms* commsData;
-  int nComms = 0, nRanks = 0;
+  int nComms = 0, nRanks = 0, nMissingRanks = 0;
+  bool skipMissing = false;
   std::lock_guard<std::mutex> lock(ncclCommsMutex);
+  struct rasCollComms::comm* comm;
+  struct rasCollRequest* req = nullptr;
+  struct rasPeerInfo** peersReSorted = nullptr;
+  int firstNewSkipMissingIdx = -1;
+
+  *pReqLen = rasCollDataLength(RAS_COLL_COMMS) +
+             (*pReq)->comms.nSkipMissingRanksComms * sizeof(*(*pReq)->comms.skipMissingRanksComms);
+  *pData = nullptr;

   // Start by counting the communicators so that we know how much space to allocate.
   // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case
@@ -572,77 +603,152 @@ static ncclResult_t rasCollCommsInit(char** pData, int* pNData) {
     qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare);
     ncclCommsSorted = true;
   }
-  for (int i = 0; i < nNcclComms; i++) {
-    if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting.
+  for (int commIdx = 0; commIdx < nNcclComms; commIdx++) {
+    if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting.
       break;
-    if (i == 0) {
-      nComms = 1;
-    } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) {
-      nComms++;
-    }
-    nRanks++;
-  }
+    // A process may manage multiple GPUs and thus have multiple communicators with the same commHash.
+    // Within the communicators of a single process, though, comparing just the commHash is OK.
+    if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) {
+      skipMissing = rasCollCommsSkipMissing(*pReq, ncclComms[commIdx]);
+      if (!skipMissing) {
+        // Add this communicator to the request so that the processes we forward the request to know not to fill in
+        // the missing rank info.
+        struct rasCommId* skipComm;
+        if (req == nullptr) {
+          // We pessimistically allocate space for all the remaining communicators so that we don't need to reallocate.
+          int newSize = *pReqLen + (nNcclComms-commIdx) * sizeof(*req->comms.skipMissingRanksComms);
+          NCCLCHECKGOTO(ncclCalloc((char**)&req, newSize), ret, fail);
+          memcpy(req, *pReq, *pReqLen);
+          *pReq = req;
+          firstNewSkipMissingIdx = req->comms.nSkipMissingRanksComms;
+        }
+        skipComm = req->comms.skipMissingRanksComms + req->comms.nSkipMissingRanksComms++;
+        skipComm->commHash = ncclComms[commIdx]->commHash;
+        skipComm->hostHash = ncclComms[commIdx]->peerInfo->hostHash;
+        skipComm->pidHash = ncclComms[commIdx]->peerInfo->pidHash;
-  // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent
+        nMissingRanks += ncclComms[commIdx]->nRanks;
+      } // if (!skipMissing)
+      nComms++;
+    } // if encountered a new communicator
+    nRanks++;
+    if (!skipMissing)
+      nMissingRanks--;
+  } // for (commIdx)
+
+  // rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent
   // pointer manipulations somewhat unwieldy...
- *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); - NCCLCHECK(ncclCalloc(pData, *pNData)); + // This is extra complicated because of the "hidden" array of struct rasCollCommsMissingRank following the + // ranks array for each communicator. + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks) + + nMissingRanks * sizeof(struct rasCollCommsMissingRank); + NCCLCHECKGOTO(ncclCalloc(pData, *pNData), ret, fail); commsData = (struct rasCollComms*)*pData; commsData->nComms = nComms; // comm points at the space in the accumulated data where the info about the current communicator is to be stored. - struct rasCollComms::comm* comm = commsData->comms; - for (int i = 0; i < nNcclComms; i++) { - struct rasCollComms::comm::rank* rank; - ncclResult_t asyncError; - if (ncclComms[i] == nullptr) - break; - if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - if (i > 0) - comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); - comm->commHash = ncclComms[i]->commHash; - comm->commNRanks = ncclComms[i]->nRanks; - comm->nRanks = 0; - } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { - INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - ncclComms[i]->rank, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - if (comm->nRanks == comm->commNRanks) { - INFO(NCCL_RAS, - "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", - comm->commNRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - rank = comm->ranks+comm->nRanks; - rank->commRank = ncclComms[i]->rank; - // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially - // always 0. It will increase after we send this response back to the peer we got the request from. - rank->peerIdx = 0; - rank->collOpCount = ncclComms[i]->collOpCount; - rank->status.initState = ncclComms[i]->initState; - if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; - rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); - rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); - rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); - rank->cudaDev = ncclComms[i]->cudaDev; - rank->nvmlDev = ncclComms[i]->nvmlDev; - comm->nRanks++; - } - assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + comm = commsData->comms; + // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. + for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { + struct ncclComm* ncclComm = ncclComms[commIdx]; - return ncclSuccess; + comm->commId.commHash = ncclComm->commHash; + comm->commId.hostHash = ncclComm->peerInfo->hostHash; + comm->commId.pidHash = ncclComm->peerInfo->pidHash; + comm->commNRanks = ncclComm->nRanks; + comm->nRanks = comm->nMissingRanks = 0; + + // Fill in the comm->ranks array. 
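+    // (Each comm entry is immediately followed by its two variable-length arrays:
+    //    [comm][rank * nRanks][rasCollCommsMissingRank * nMissingRanks]
+    // which is why the pointer advance at the bottom of this loop has to skip both arrays.)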
+ for (; commIdx < nNcclComms && ncclComms[commIdx] && ncclComms[commIdx]->commHash == comm->commId.commHash; + commIdx++) { + ncclComm = ncclComms[commIdx]; + struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; + ncclResult_t asyncError; + rank->commRank = ncclComm->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); + rank->status.initState = ncclComm->initState; + if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComm->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComm->cudaDev; + rank->nvmlDev = ncclComm->nvmlDev; + comm->nRanks++; + } // for (commIdx) + + if (firstNewSkipMissingIdx != -1 && + memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { + // Fill in the missingRanks array that follows the comm->ranks. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + if (peersReSorted == nullptr) { + // Create a lookup table to rasPeers that is sorted by hostHash and pidHash, to reduce the complexity of the + // lookups in the missingRankIdx loop below. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) + peersReSorted[peerIdx] = rasPeers+peerIdx; + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesCompare); + } + + comm->nMissingRanks = comm->commNRanks - comm->nRanks; + for (int missingRankIdx = 0, rankIdx = 0; missingRankIdx < comm->nMissingRanks; missingRankIdx++) { + struct rasCollCommsMissingRank* missingRank; + struct ncclPeerInfo* info; + struct rasPeerInfo** peer; + uint64_t key[2]; + // Look for the next "hole" in the ranks array. + while (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == rankIdx+missingRankIdx) + rankIdx++; + + missingRank = missingRanks + missingRankIdx; + missingRank->commRank = rankIdx + missingRankIdx; + info = ncclComm->peerInfo + missingRank->commRank; + key[0] = info->hostHash - ncclComm->commHash; + key[1] = info->pidHash - ncclComm->commHash; + peer = (struct rasPeerInfo**)bsearch(key, peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesSearch); + if (peer) + memcpy(&missingRank->addr, &(*peer)->addr, sizeof(missingRank->addr)); + missingRank->cudaDev = info->cudaDev; + missingRank->nvmlDev = info->nvmlDev; + } // for (missingRankIdx) + + if (++firstNewSkipMissingIdx == req->comms.nSkipMissingRanksComms) + firstNewSkipMissingIdx = -1; + } // if need to fill in the missingRanks + + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks) + + comm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + } // for (collCommIdx) + assert(((char*)comm) - (char*)commsData <= *pNData); + + if (req) { + // Finish updating the request. 
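+    // (skipMissingRanksComms is sorted below so that rasCollCommsSkipMissing() can use bsearch() with the
+    // same rasCommIdCompare callback when the peers we forward the request to process it.)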
+ *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + req->comms.nSkipMissingRanksComms * sizeof(*req->comms.skipMissingRanksComms); + qsort(req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare); + } +ret: + free(peersReSorted); + return ret; +fail: + if (req) { + free(req); + *pReq = nullptr; + } + free(*pData); + *pData = nullptr; + goto ret; } // Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { - struct rasCollComms* collData; - struct rasCollComms* msgData; + struct rasCollComms* collData; // Data previously stored (locally) by our process. + struct rasCollComms* msgData; // Data just received from another process. int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); ALIGN_SIZE(dataOffset, alignof(int64_t)); @@ -650,7 +756,7 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* collData = (struct rasCollComms*)coll->data; if (msgData->nComms > 0) { - struct rasCollComms* newData = nullptr; + struct rasCollComms* newData = nullptr; // Destination buffer for the merged data. // Allocate the new buffer pessimistically (sized as the sum of the two old ones). NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); @@ -661,25 +767,28 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { int cmp; if (collIdx < collData->nComms && msgIdx < msgData->nComms) - cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + cmp = rasCommIdCompare(&collComm->commId, &msgComm->commId); else cmp = (collIdx < collData->nComms ? -1 : 1); if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + "possible hash collision (0x%lx, 0x%lx, 0x%lx)", collComm->commNRanks, msgComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); - // We try to preserve both separately, although the input data might already be messed up anyway... + // We try to preserve them both separately... } if (cmp == 0) { // Merge the comms. - newComm->commHash = collComm->commHash; + memcpy(&newComm->commId, &collComm->commId, sizeof(newComm->commId)); newComm->commNRanks = collComm->commNRanks; if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { INFO(NCCL_RAS, - "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", - collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->nRanks + msgComm->nRanks, newComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); + newComm->nRanks = newComm->commNRanks; // We'll skip the extras in the loop below. 
} else { newComm->nRanks = collComm->nRanks + msgComm->nRanks; @@ -691,16 +800,18 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* int cmpRank; if (newRankIdx == newComm->commNRanks) break; // Short of failing, the best we can do is skip... - if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) { cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); - else + } else { cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + } // There shouldn't be any overlaps in ranks between different sources. if (cmpRank == 0) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - collComm->ranks[collRankIdx].commRank, newComm->commHash); + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->ranks[collRankIdx].commRank, + newComm->commId.commHash, newComm->commId.hostHash, newComm->commId.pidHash); msgRankIdx++; // Short of failing, the best we can do is skip... } memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : @@ -708,23 +819,63 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* if (cmpRank > 0) { // peerIdx values from msgComm need to shift after merge. newComm->ranks[newRankIdx].peerIdx += coll->nPeers; - } + + if (collComm->nMissingRanks > 0) { + // Remove the corresponding entry from missingRanks. + struct rasCollCommsMissingRank* missingRank; + missingRank = (struct rasCollCommsMissingRank*)bsearch(&newComm->ranks[newRankIdx].commRank, + collComm->ranks+collComm->nRanks, + collComm->nMissingRanks, + sizeof(struct rasCollCommsMissingRank), + rasCollCommsMissingRankSearch); + if (missingRank) { + // Mark the entry as no longer needed. + memset(&missingRank->addr, '\0', sizeof(missingRank->addr)); + } else { + INFO(NCCL_RAS, "RAS failed to find missingRank data -- internal error?"); + } + } // if (collComm->nMissingRanks > 0) + } // if (cmpRank > 0) } // for (newRankIdx) - newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); - collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + if (collComm->nMissingRanks > 0) { + // Copy the missingRanks to newComm, skipping over any no longer needed entries. 
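+        // The rank merge above marked consumed entries with an all-zero address; the memcmp() against
+        // emptyAddr below skips them while compacting the rest into newComm.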
+ union ncclSocketAddress emptyAddr; + struct rasCollCommsMissingRank* collMissingRanks; + struct rasCollCommsMissingRank* newMissingRanks; + int newRankIdx; + + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + collMissingRanks = (struct rasCollCommsMissingRank*)(collComm->ranks+collComm->nRanks); + newMissingRanks = (struct rasCollCommsMissingRank*)(newComm->ranks+newComm->nRanks); + newRankIdx = 0; + for (int collRankIdx = 0; collRankIdx < collComm->nMissingRanks; collRankIdx++) { + if (memcmp(&collMissingRanks[collRankIdx].addr, &emptyAddr, sizeof(emptyAddr))) { + memcpy(newMissingRanks + newRankIdx++, collMissingRanks + collRankIdx, sizeof(*newMissingRanks)); + } + } + newComm->nMissingRanks = newRankIdx; + assert(newComm->nRanks + newComm->nMissingRanks == newComm->commNRanks); + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks) + + newComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); collIdx++; - msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); msgIdx++; } else if (cmp < 0) { // Copy from collComm. - int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, collComm, commSize); newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); collIdx++; } else { // cmp > 0 // Copy from msgComm. - int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, msgComm, commSize); for (int i = 0; i < newComm->nRanks; i++) { // peerIdx values from msgComm need to shift after merge. @@ -745,18 +896,87 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* return ncclSuccess; } +// Checks if a given communicator is in the skipMissingRanksComms array of the request. +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm) { + struct rasCommId id; + id.commHash = comm->commHash; + id.hostHash = comm->peerInfo->hostHash; + id.pidHash = comm->peerInfo->pidHash; + return (bsearch(&id, req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare) != nullptr); +} + // Sorting callback for the ncclComms array. static int ncclCommsCompare(const void* p1, const void* p2) { - const ncclComm** pc1 = (const ncclComm**)p1; - const ncclComm** pc2 = (const ncclComm**)p2; + const ncclComm* comm1 = *(const ncclComm**)p1; + const ncclComm* comm2 = *(const ncclComm**)p2; // Put nullptr's at the end. - if (*pc1 == nullptr || *pc2 == nullptr) - return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + if (comm1 == nullptr || comm2 == nullptr) + return (comm1 != nullptr ? -1 : (comm2 != nullptr ? 
1 : 0));

-  if ((*pc1)->commHash == (*pc2)->commHash) {
-    return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0));
+  if (comm1->commHash == comm2->commHash) {
+    return (comm1->rank < comm2->rank ? -1 : (comm1->rank > comm2->rank ? 1 : 0));
   } else {
-    return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1);
+    return (comm1->commHash < comm2->commHash ? -1 : 1);
   }
 }
+
+// Sorting callback for a lookup table to rasPeers. Sorts by the hostHash (primary) and pidHash (secondary).
+static int peersHashesCompare(const void* p1, const void* p2) {
+  const struct rasPeerInfo* pi1 = *(const struct rasPeerInfo**)p1;
+  const struct rasPeerInfo* pi2 = *(const struct rasPeerInfo**)p2;
+
+  if (pi1->hostHash == pi2->hostHash) {
+    return (pi1->pidHash < pi2->pidHash ? -1 : (pi1->pidHash > pi2->pidHash ? 1 : 0));
+  } else {
+    return (pi1->hostHash < pi2->hostHash ? -1 : 1);
+  }
+}
+
+// Search callback for a lookup table to rasPeers. Searches by the hostHash and pidHash. The key is an array
+// containing the hostHash at index 0 and the pidHash at index 1.
+static int peersHashesSearch(const void* k, const void* e) {
+  const uint64_t* key = (const uint64_t*)k;
+  const struct rasPeerInfo* elem = *(const struct rasPeerInfo**)e;
+
+  if (key[0] == elem->hostHash) {
+    return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0));
+  } else {
+    return (key[0] < elem->hostHash ? -1 : 1);
+  }
+}
+
+// Sorting/searching callback for struct rasCommId. Sorts by commHash, then hostHash, then pidHash.
+static int rasCommIdCompare(const void* p1, const void* p2) {
+  const struct rasCommId* i1 = (const struct rasCommId*)p1;
+  const struct rasCommId* i2 = (const struct rasCommId*)p2;
+  if (i1->commHash == i2->commHash) {
+    if (i1->hostHash == i2->hostHash) {
+      return (i1->pidHash < i2->pidHash ? -1 : (i1->pidHash > i2->pidHash ? 1 : 0));
+    } else {
+      return (i1->hostHash < i2->hostHash ? -1 : 1);
+    }
+  } else {
+    return (i1->commHash < i2->commHash ? -1 : 1);
+  }
+}
+
+// Search callback for the rasCollCommsMissingRank array of a rasCollComms::comm. The key is the commRank.
+static int rasCollCommsMissingRankSearch(const void* k, const void* e) {
+  int key = *(const int*)k;
+  const struct rasCollCommsMissingRank* elem = (const struct rasCollCommsMissingRank*)e;
+
+  return (key < elem->commRank ? -1 : (key > elem->commRank ? 1 : 0));
+}
+
+// Invoked during RAS termination to release all the allocated resources.
+void rasCollectivesTerminate() {
+  for (struct rasCollective* coll = rasCollectivesHead; coll;) {
+    struct rasCollective* collNext = coll->next;
+    rasCollFree(coll);
+    coll = collNext;
+  }
+
+  // rasCollectivesHead and rasCollectivesTail are taken care of by rasCollFree().
+}
diff --git a/src/ras/peers.cc b/src/ras/peers.cc
index f2692d3..8573209 100644
--- a/src/ras/peers.cc
+++ b/src/ras/peers.cc
@@ -40,10 +40,11 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks
 static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1);
 static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers,
-                                      struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1);
+                                      struct rasRankInit* ranks = nullptr, int nranks = 0,
+                                      struct rasConnection* fromConn = nullptr);
 static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers,
                                            bool updateDeadPeers, struct rasRankInit* ranks, int nranks,
-                                           int fromConnIdx);
+                                           struct rasConnection* fromConn);
 static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers,
                                            int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks);
 ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
@@ -146,6 +147,8 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks
     rankPeer->pid = rank->pid;
     rankPeer->cudaDevs = (1UL << rank->cudaDev);
     rankPeer->nvmlDevs = (1UL << rank->nvmlDev);
+    rankPeer->hostHash = rank->hostHash;
+    rankPeer->pidHash = rank->pidHash;
     rankPeerIdx++;

     // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how
@@ -357,12 +360,12 @@ int rasPeerFind(const union ncclSocketAddress* addr) {
 // ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members
 //          of the new communicator being established), and who thus don't need to be notified. updateDeadPeers can
 //          be used, however, to request at least the propagation of rasDeadPeers to such peers.
-// fromConnIdx -- if provided -- identified the connection used to receive this update; there's no need to
+// fromConn -- if provided -- identifies the connection used to receive this update; there's no need to
 //             propagate the update back through it.
 // Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new
 // connections as needed.
 static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers,
-                                      struct rasRankInit* ranks, int nranks, int fromConnIdx) {
+                                      struct rasRankInit* ranks, int nranks, struct rasConnection* fromConn) {
   ncclResult_t ret = ncclSuccess;

   // Do we actually have anything to do?
@@ -371,8 +374,8 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN

   // Start by propagating the update through the RAS network links. We consider any errors during this process
   // to be non-fatal (we can re-sync later around a keep-alive exchange).
-  (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx);
-  (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx);
+  (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn);
+  (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn);

   // Calculate new link peers and open new connections if needed.
NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); @@ -388,15 +391,13 @@ fail: // for the explanation of the function arguments. static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { // Note that we don't send the update via the connection that we received this notification from in the first // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; + if (linkConn->conn && linkConn->conn != fromConn) { // Failed propagations are not considered fatal (we will retry after a keep-alive). - (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + (void)rasConnPropagateUpdate(linkConn->conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); } } @@ -407,7 +408,7 @@ static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct ra // arguments. static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { // If we have the rank info, check if the peer on the other side of this connection has participated in the new // communicator. int connRank = -1; @@ -462,7 +463,8 @@ ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct ras msg->peersUpdate.deadPeersHash = rasDeadPeersHash; msg->peersUpdate.nDeadPeers = nDeadPeers; memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); - memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + if (nDeadPeers > 0) + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); if (nPeers > 0) conn->lastSentPeersHash = rasPeersHash; @@ -485,8 +487,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) ncclResult_t ret = ncclSuccess; struct rasMsg* newMsg = nullptr; int newMsgLen = 0; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; + assert(sock->conn); int nPeers, nDeadPeers; int deadPeersOffset = 0; bool updatePeers, updateDeadPeers; @@ -496,8 +497,8 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); - conn->lastRecvPeersHash = msg->peersUpdate.peersHash; - conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + sock->conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + sock->conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need // to send it. We'll find out by comparing the hash values after the merge. 
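The hash checks in the next hunk implement a simple convergence rule: after merging the incoming update, a reply is enqueued only if the merged state still differs from both the last state sent and the last state received on this connection. A minimal sketch of that rule, with a hypothetical needsReply() helper that is not part of this patch:

// Hypothetical condensation of the updatePeers/updateDeadPeers checks below.
static bool needsReply(uint64_t lastSentHash, uint64_t lastRecvHash, uint64_t mergedHash) {
  // If we last sent exactly this state, the peer already has it; if we last received it,
  // the peer generated it. In either case a reply would be redundant.
  return lastSentHash != mergedHash && lastRecvHash != mergedHash;
}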
@@ -545,15 +546,15 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) rasDeadPeersDump(); // If post-merge the hashes are still different, send our (dead) peers back. - updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); - updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && - conn->lastRecvDeadPeersHash != rasDeadPeersHash); + updatePeers = (sock->conn->lastSentPeersHash != rasPeersHash && sock->conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (sock->conn->lastSentDeadPeersHash != rasDeadPeersHash && + sock->conn->lastRecvDeadPeersHash != rasDeadPeersHash); if (updatePeers || updateDeadPeers) { newMsg->peersUpdate.peersHash = rasPeersHash; newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; if (updatePeers) { assert(nPeers > 0); - conn->lastSentPeersHash = rasPeersHash; + sock->conn->lastSentPeersHash = rasPeersHash; } else { // If hashes match, make sure that we don't send the rasPeers back. newMsg->peersUpdate.nPeers = 0; @@ -564,14 +565,14 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) if (updateDeadPeers) { assert(nRasDeadPeers > 0); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); deadPeersOffset = newMsgLen; newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; } else { newMsg->peersUpdate.nDeadPeers = 0; @@ -580,13 +581,13 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); - rasConnEnqueueMsg(conn, newMsg, newMsgLen); + rasConnEnqueueMsg(sock->conn, newMsg, newMsgLen); newMsg = nullptr; } // if (updatePeers || updateDeadPeers) // Propagate the changes through our RAS network links. NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, - sock->connIdx), ret, fail); + sock->conn), ret, fail); } exit: @@ -603,7 +604,7 @@ fail: // Reinitializes the connection(s) of a particular link, following a peers update. // Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. -// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in rasLinkConn // structures, so it's better to drop it all and recalculate from scratch. // We recalculate the primary peer; if an active connection to it already exists, then we're done. If there // is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and @@ -611,77 +612,51 @@ fail: // External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). 
static ncclResult_t rasLinkReinitConns(struct rasLink* link) {
   struct rasLinkConn* linkConn;
-  struct rasConnection* conn = nullptr;
   int newPeerIdx = myPeerIdx;

-  if (link->connsSize == 0) {
-    link->connsSize = RAS_INCREMENT;
-    NCCLCHECK(ncclCalloc(&link->conns, link->connsSize));
+  if (link->conns) {
+    // Free the old contents but keep the first entry for convenience (though wipe it).
+    for (struct rasLinkConn* linkConn = link->conns->next; linkConn;) {
+      struct rasLinkConn* linkConnNext = linkConn->next;
+      free(linkConn);
+      linkConn = linkConnNext;
+    }
+    memset(link->conns, '\0', sizeof(*link->conns));
+    link->lastUpdatePeersTime = 0;
+  } else { // link->conns == nullptr
+    NCCLCHECK(ncclCalloc(&link->conns, 1));
   }
-  link->nConns = 0;

-  // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays.
-  while (newPeerIdx != -1) {
-    if (link->nConns == link->connsSize) {
-      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
-      link->connsSize += RAS_INCREMENT;
-    }
+  // Fill in the entry for the primary connection.
+  linkConn = link->conns;
+  linkConn->peerIdx = newPeerIdx = rasLinkCalculatePeer(link, myPeerIdx, /*isFallback*/false);
+  linkConn->conn = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : nullptr);
+  linkConn->external = false;

-    newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1);
-    if (newPeerIdx == -1) {
-      INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns);
-      if (link->nConns > 0)
-        break;
-    }
-    linkConn = link->conns+link->nConns;
-    linkConn->peerIdx = newPeerIdx;
-    linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1);
-    linkConn->external = false;
-
-    // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration.
-    // Depending on the circumstances, we may first need to create that connection.
-    if (linkConn->connIdx == -1) {
-      if (link->nConns == 0) {
-        if (linkConn->peerIdx != -1) {
-          INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s",
-               link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"),
-               ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-          // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index)
-          // to avoid races and the creation of duplicate connections.
-          if (myPeerIdx < linkConn->peerIdx) {
-            NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
-          }
-          else { // If we didn't initiate the connection, start the timeout.
-            link->lastUpdatePeersTime = clockNano();
-          }
-        } // if (linkConn->peerIdx != -1)
-      } else { // link->nConns > 0
-        INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s",
-             link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-        NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx));
-      } // link->nConns > 0
-    } else { // linkConn->connIdx != -1
-      if (link->nConns == 0) {
-        INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s",
-             link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-      } else {
-        INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s",
-             link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+  if (linkConn->conn == nullptr) {
+    if (linkConn->peerIdx != -1) {
+      // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index)
+      // to avoid races and the creation of duplicate connections.
+      INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s",
+           link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"),
+           ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+      if (myPeerIdx < linkConn->peerIdx) {
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn));
       }
-    }
-    link->nConns++;
-    if (linkConn->connIdx == -1)
-      break;
-    conn = rasConns+linkConn->connIdx;
-
-    // We check if the connection already went through the fallback calculation; if so, we'll need to create a new
-    // fallback in the next iteration, to ensure that RAS will keep retrying.
-    if (!conn->experiencingDelays)
-      break;
+      else { // If we didn't initiate the connection, start the timeout.
+        link->lastUpdatePeersTime = clockNano();
+      }
+    } // if (linkConn->peerIdx != -1)
+  } else { // linkConn->conn
+    INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s",
+         link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+  } // linkConn->conn
+  if (linkConn->conn && linkConn->conn->experiencingDelays) {
     INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d",
-         conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9,
-         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+         linkConn->conn->experiencingDelays, (clockNano()-linkConn->conn->startRetryTime)/1e9,
+         (linkConn->conn->sock ? linkConn->conn->sock->status : -1));
+    NCCLCHECK(rasLinkAddFallback(link, linkConn->conn));
   }

   return ncclSuccess;
@@ -701,39 +676,37 @@ int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallbac
   if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) {
     // peerIdx is a fallback and it is not running on the same node as us.
     int tryPeerIdx = newPeerIdx;
-    int tryConnIdx = -1;
+    struct rasConnection* tryConn = nullptr;

     // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that
     // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a
     // little suboptimal one.
while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) {
       if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) {
-        tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr);
-        if (tryConnIdx != -1) {
-          struct rasConnection* tryConn = rasConns+tryConnIdx;
+        tryConn = rasConnFind(&rasPeers[tryPeerIdx].addr);
+        if (tryConn) {
           // Check if the connection is fully established and operational, i.e., if the underlying socket
           // is ready and there's been recent communication on it.
-          if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY &&
-              !tryConn->experiencingDelays) {
+          if (tryConn->sock && tryConn->sock->status == RAS_SOCK_READY && !tryConn->experiencingDelays) {
             // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in
             // this case. This is the only case when tryConn != nullptr after this loop.
             break;
           }
-        } // if (tryConnIdx != -1)
+        } // if (tryConn)
       } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr))
-      tryConnIdx = -1;
-      tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers;
+      tryConn = nullptr;
+      tryPeerIdx = (tryPeerIdx + link->direction + nRasPeers) % nRasPeers;
       if (tryPeerIdx == myPeerIdx)
         break;
     }

-    if (tryConnIdx == -1)
+    if (tryConn == nullptr)
       newPeerIdx = tryPeerIdx;
     if (tryPeerIdx == myPeerIdx)
       break;
   } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr))
-
+
   if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) {
     newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers;
   }
@@ -932,7 +905,8 @@ bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSock
 static void rasPeersDump() {
   for (int p = 0; p < nRasPeers; p++) {
     const struct rasPeerInfo* peer = rasPeers+p;
-    INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : ""));
+    INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)),
+         (p == myPeerIdx ? " [this process]" : ""));
   }
   if (nRasPeers > 0)
     INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash);
@@ -958,3 +932,17 @@ static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nr
                rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2)));
   return result;
 }
+
+// Invoked during RAS termination to release all the allocated resources.
+void rasPeersTerminate() {
+  free(rasPeers);
+  rasPeers = nullptr;
+  nRasPeers = 0;
+  rasPeersHash = 0;
+  myPeerIdx = -1;
+
+  free(rasDeadPeers);
+  rasDeadPeers = nullptr;
+  nRasDeadPeers = rasDeadPeersSize = 0;
+  rasDeadPeersHash = 0;
+}
diff --git a/src/ras/ras.cc b/src/ras/ras.cc
index 4905d7a..8ef551c 100644
--- a/src/ras/ras.cc
+++ b/src/ras/ras.cc
@@ -4,8 +4,10 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#define NDEBUG // Comment out during development only!
-#include <cassert>
+// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so.
+#include <bits/c++config.h>
+#undef _GLIBCXX_VISIBILITY
+#define _GLIBCXX_VISIBILITY(V)
 #include
 #include
 #include
@@ -65,8 +67,8 @@ int nNcclComms = 0;
 bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank.
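Keeping ncclComms sorted by (commHash, rank) is what lets the RAS_COLL_COMMS handler above gather all the local ranks of one communicator in a single pass. A sketch of locating such a run under that ordering, with a hypothetical findCommRun() helper that is not part of this patch:

// Return the index of the first entry with the given commHash, or -1 if absent.
// Assumes comms is sorted by commHash and then rank, with nullptr entries at the end
// (see ncclCommsCompare in collectives.cc).
static int findCommRun(struct ncclComm** comms, int n, uint64_t commHash) {
  for (int i = 0; i < n && comms[i]; i++) {
    if (comms[i]->commHash == commHash) return i;
    if (comms[i]->commHash > commHash) break; // Sorted: no match past this point.
  }
  return -1;
}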
static ncclResult_t rasLocalNotify(const struct rasNotification* msg); -static ncclResult_t rasLocalHandle(); -static void rasLocalHandleTerminate(); +static ncclResult_t rasLocalHandle(bool* terminate); +static void rasThreadCleanup(); static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); @@ -74,6 +76,8 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); +static void rasTerminate() __attribute__((destructor)); + NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); ////////////////////////////////////////////////// @@ -105,7 +109,6 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); ncclSetThreadName(rasThread, "NCCL RAS"); - (void)pthread_detach(rasThread); rasInitialized = true; } @@ -157,18 +160,27 @@ ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { } } } - if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { - struct rasNotification msg; - msg.type = RAS_TERMINATE; - NCCLCHECK(rasLocalNotify(&msg)); - } + ncclAtomicRefCountDecrement(&rasInitRefCount); return ncclSuccess; } +// Global destructor. Notifies the RAS thread to release all the resources +// and terminate. Waits for the thread to terminate. +static void rasTerminate() { + struct rasNotification msg; + if (!rasInitialized) + return; + memset(&msg, '\0', sizeof(msg)); + msg.type = RAS_TERMINATE; + if (rasLocalNotify(&msg) == ncclSuccess) + (void)pthread_join(rasThread, nullptr); +} + // Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within // the communicator. ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { struct rasNotification msg; + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_ADD_RANKS; msg.addRanks.ranks = ranks; msg.addRanks.nranks = nranks; @@ -199,7 +211,7 @@ static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { ///////////////////////////////////////////////////////////////////////////////// // Handles asynchronous local notifications arriving from regular NCCL threads. -static ncclResult_t rasLocalHandle() { +static ncclResult_t rasLocalHandle(bool* terminate) { struct rasNotification msg; size_t done = 0; @@ -212,9 +224,11 @@ static ncclResult_t rasLocalHandle() { } if (msg.type == RAS_ADD_RANKS) { - NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + (void)rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks); + // Not great if the above fails, but it shouldn't be critical; better to keep going. } else if (msg.type == RAS_TERMINATE) { - rasLocalHandleTerminate(); + INFO(NCCL_RAS, "RAS handling local termination request"); + *terminate = true; } else { WARN("RAS received unknown notification type %d", msg.type); return ncclInternalError; @@ -223,10 +237,35 @@ static ncclResult_t rasLocalHandle() { return ncclSuccess; } -// Handles local RAS_TERMINATE notification. -static void rasLocalHandleTerminate() { - INFO(NCCL_RAS, "RAS handling local termination request"); - // For now we don't do anything. +// Cleans up local RAS state, normally in response to a RAS_TERMINATE notification. 
+static void rasThreadCleanup() { + rasClientSupportTerminate(); + rasNetTerminate(); + rasCollectivesTerminate(); + rasPeersTerminate(); + + { + std::lock_guard lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + // rasClientListeningSocket is taken care of by rasClientSupportTerminate(). + rasNotificationPipe[0] = rasNotificationPipe[1] = -1; + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitRefCount = 0; + rasInitialized = false; + } + + { + std::lock_guard lock(ncclCommsMutex); + free(ncclComms); + ncclComms = nullptr; + nNcclComms = 0; + ncclCommsSorted = false; + } + + free(rasPfds); + rasPfds = nullptr; + nRasPfds = 0; } @@ -270,10 +309,10 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms else ncclIntruQueueEnqueue(&conn->sendQ, meta); - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { - rasPfds[sock->pfd].events |= POLLOUT; + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY || + (conn->sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[conn->sock->pfd].events |= POLLOUT; ready = true; } } @@ -283,31 +322,31 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", msg->type, ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + (conn->sock ? conn->sock->status : -1)); } } // Attempts to send the queued RAS messages to another RAS thread. ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { - struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; struct rasMsgMeta* meta; *closed = 0; while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { - if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + if (conn->sock->status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { // We don't send anything beyond the handshake at this point. meta = nullptr; break; } if (meta->offset < sizeof(meta->length)) { // Send the length of the message. - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, &meta->length, sizeof(meta->length), + &meta->offset, closed)); if (*closed) return ncclSuccess; if (meta->offset < sizeof(meta->length)) break; } // Send the body of the message. 
- NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, ((char*)&meta->msg)-sizeof(meta->length), meta->length+sizeof(meta->length), &meta->offset, closed)); if (*closed) return ncclSuccess; @@ -377,7 +416,7 @@ ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { ncclResult_t ret = ncclSuccess; struct rasConnection* conn = nullptr; - int connIdx, peerIdx; + int peerIdx; struct rasMsg* newMsg = nullptr; int newMsgLen; char line[SOCKET_NAME_MAXLEN+1]; @@ -406,19 +445,16 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc } // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). - connIdx = rasConnFind(&msg->connInit.listeningAddr); - if (connIdx != -1) { - conn = rasConns+connIdx; - + conn = rasConnFind(&msg->connInit.listeningAddr); + if (conn) { INFO(NCCL_RAS, "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); - if (conn->sockIdx != -1) { - struct rasSocket* connSock = rasSockets+conn->sockIdx; + if (conn->sock) { INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", - connSock->status, (clockNano()-connSock->createTime)/1e9); + conn->sock->status, (clockNano()-conn->sock->createTime)/1e9); // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have // a race where both sides attempt to establish a connection at roughly the same time, so the other side's // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. @@ -433,21 +469,19 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc goto exit; } else { INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); - rasSocketTerminate(connSock); + rasSocketTerminate(conn->sock); } } - } - if (!conn) { + } else { // conn == nullptr NCCLCHECK(getNewConnEntry(&conn)); memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); - connIdx = conn - rasConns; } sock->status = RAS_SOCK_READY; // rasConnResume will reset any experiencingDelays, startRetryTime, etc. - conn->sockIdx = sock-rasSockets; - sock->connIdx = connIdx; + conn->sock = sock; + sock->conn = conn; memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); // Make sure that the connection is part of the right links forming the RAS network. At this point we only @@ -456,8 +490,8 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before // the peers update. if (peerIdx != -1) { - (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); - (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + (void)rasLinkConnUpdate(&rasNextLink, conn, peerIdx); + (void)rasLinkConnUpdate(&rasPrevLink, conn, peerIdx); } // Send a confirmation to the server that requested the connection (so that the resilience code can mark @@ -504,12 +538,13 @@ static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct ras } // Handles the deadPeer broadcast. 
-void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { - INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&(*pReq)->deadPeer.addr, rasLine)); - if (!rasPeerIsDead(&req->deadPeer.addr)) { - rasConnDisconnect(&req->deadPeer.addr); - (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pReqLen = rasCollDataLength(RAS_BC_DEADPEER); + if (!rasPeerIsDead(&(*pReq)->deadPeer.addr)) { + rasConnDisconnect(&(*pReq)->deadPeer.addr); + (void)rasPeerDeclareDead(&(*pReq)->deadPeer.addr); *pDone = false; } else { INFO(NCCL_RAS, "RAS already knew it was dead"); @@ -530,6 +565,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock) { INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_MSG_CONNINITACK; msg.connInitAck.nack = 1; offset = 0; @@ -557,16 +593,16 @@ static void* rasThreadMain(void*) { INFO(NCCL_RAS, "RAS thread started"); // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasNotificationPipe[0]; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); - NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, exit); rasPfds[pfd].fd = rasNetListeningSocketFd; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasClientListeningSocket; rasPfds[pfd].events = POLLIN; @@ -595,32 +631,37 @@ static void* rasThreadMain(void*) { if (rasPfds[pollIdx].revents) { nEvents--; if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { - (void)rasLocalHandle(); + bool terminate = false; + NCCLCHECKGOTO(rasLocalHandle(&terminate), ret, exit); + if (terminate) + goto exit; } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { (void)rasNetAcceptNewSocket(); } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { (void)rasClientAcceptNewSocket(); } else { // Check if it's one of the RAS sockets. - int sockIdx; - for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; - if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { - rasSockEventLoop(sockIdx, pollIdx); + struct rasSocket* sock; + for (sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sock, pollIdx); break; } - } // for (sockIdx) + sock = sockNext; + } // for (sock) - if (sockIdx == nRasSockets) { + if (sock == nullptr) { // Try a client socket instead. 
- for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { - struct rasClient* client = rasClients+clientIdx; - if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { - rasClientEventLoop(clientIdx, pollIdx); + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + if (rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(client, pollIdx); break; } - } // for (clientIdx) - } // if (sockIdx == nRasSockets) + client = clientNext; + } // for (client) + } // if (sock == nullptr) } // dynamic fds } // if (revents) } // for (pollIdx) @@ -636,14 +677,9 @@ static void* rasThreadMain(void*) { rasCollsHandleTimeouts(now, &nextWakeup); } // for (;;) -fail: - WARN("fatal error - RAS thread terminating"); - std::lock_guard lock(rasInitMutex); - (void)close(rasNotificationPipe[1]); - (void)close(rasNotificationPipe[0]); - (void)close(rasClientListeningSocket); - (void)ncclSocketClose(&rasNetListeningSocket); - rasInitialized = false; +exit: + rasThreadCleanup(); + INFO(NCCL_RAS, "RAS thread terminating"); return nullptr; } diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 715fff4..17326c3 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -42,6 +42,14 @@ typedef enum { RAS_COLL_COMMS = 1002, // Collect data about all communicators. } rasCollectiveType; +// Unique communicator identifier. commHash by itself is definitely not guaranteed to be unique. +// Combined with the two other hashes, the chance is much better... +// All three fields are used for sorting. +struct rasCommId { + uint64_t commHash; + uint64_t hostHash, pidHash; // These are the hashes of the *first* rank (comm->peerInfo[0]). +}; + // Payload of a collective request message (RAS_MSG_COLLREQ). struct rasCollRequest { union ncclSocketAddress rootAddr; @@ -56,6 +64,10 @@ struct rasCollRequest { struct { } conns; struct { + int nSkipMissingRanksComms; // Number of elements in the array below. + // Communicators for which we do *not* need the missingRanks data in the responses + // (see struct rasCollCommsMissingRank later). + struct rasCommId skipMissingRanksComms[0]; // Variable length, sorted. } comms; }; }; @@ -69,8 +81,8 @@ struct rasCollResponse { int nPeers; int nData; // Size of data in bytes. union ncclSocketAddress peers[0]; // Variable length. - // The peersAddrs array is followed by: - //alignas(int64_t) char data[0]; // Variable length, collective-dependent. + // The peers array is followed by: + // alignas(int64_t) char data[0]; // Variable length, collective-dependent. }; // Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each @@ -80,6 +92,8 @@ struct rasPeerInfo { pid_t pid; uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. + uint64_t hostHash, pidHash; // Taken from ncclComm, but with the commHash subtracted to make it + // communicator-independent. }; // Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host @@ -112,7 +126,7 @@ struct rasMsg { int nPeers; int nDeadPeers; struct rasPeerInfo peers[0]; // Variable length. - // The peers array is followed by the following: + // The peers array is followed by: //union ncclSocketAddress deadPeers[0]; // Variable length. 
} peersUpdate; struct { @@ -218,6 +232,9 @@ struct rasMsgMeta { // Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response). // For every collective operation, each participating RAS thread will create its own. struct rasCollective { + struct rasCollective* next; + struct rasCollective* prev; + union ncclSocketAddress rootAddr; uint64_t rootId; @@ -227,15 +244,16 @@ struct rasCollective { bool timeoutWarned; int64_t startTime; // For timeout calculations. - int fromConnIdx; // The connection we received the request from. + struct rasConnection* fromConn; // The connection we received the request from. - int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive. + struct rasConnection** fwdConns; // Connections we forwarded the request to; replaced by nullptr's as the + // responses arrive. int nFwdSent; // Count of the above (local process only). int nFwdRecv; // Count of the responses received or timeouts (local process only). int nLegTimeouts; // Collective (from this process and the responses we received). - union ncclSocketAddress* peers; // Collective (from this process and the responses we received). + union ncclSocketAddress* peers; // Collective (from this process and the responses we received). Unsorted. int nPeers; char* data; // Collective (from this process and the responses we received). @@ -261,13 +279,14 @@ struct rasCollConns { struct rasCollComms { int nComms; struct comm { - uint64_t commHash; - int commNRanks; - int nRanks; // number of elements in the array below, *not* in the communicator. + struct rasCommId commId; + int commNRanks; // >= nRanks + nMissingRanks + int nRanks; // Number of elements in the ranks array below, *not* in the communicator. + int nMissingRanks; // Number of elements in the missingRanks array below. struct rank { int commRank; int peerIdx; // Index within rasCollective->peers, *not* rasPeers. - uint64_t collOpCount; + uint64_t collOpCounts[NCCL_NUM_FUNCTIONS]; struct { ncclResult_t initState:4; ncclResult_t asyncError:4; @@ -278,34 +297,47 @@ struct rasCollComms { char cudaDev; char nvmlDev; } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process. - } comms[0]; // Variable length. Sorted by commHash. + // The ranks array is followed by: + // struct rasCollCommsMissingRank missingRanks[0]; // Variable length. Sorted by commRank. + } comms[0]; // Variable length. Sorted by commId. +}; + +// Provides info about missing ranks. An array of these structures can be part of struct rasCollComms above. +// Because the arrays are of variable length, we can't describe them in C. To ensure that adding +// rasCollCommsMissingRank structures doesn't mess up the alignment, we explicitly request one. +struct alignas(struct rasCollComms) rasCollCommsMissingRank { + int commRank; + union ncclSocketAddress addr; + // We don't need pid here as we can look it up in rasPeers via addr. + char cudaDev; + char nvmlDev; }; // Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one // or one of the fallbacks). struct rasLinkConn { + struct rasLinkConn* next; int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates). - int connIdx; // Index in the rasConns array of the connection to the above peer. 
Could be -1 (a placeholder - // for a connection to be started by the remote peer). + struct rasConnection* conn; // The connection to the above peer. Could be nullptr (a placeholder for a connection + // to be started by the remote peer). bool external; // true if the entry exists only due to an external request (requested by a remote peer, most // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a // valid primary connection, in order to ensure that keep-alive messages are sent. }; // Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in -// case of 1-D topology) rather than a particular destination. The are implemented using rasConnections, but +// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but // they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS // network is reconfigured or a peer dies. struct rasLink { int direction; // 1 for nextLink, -1 for prevLink. - // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having - // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have - // the lowest indices). + // First element is the primary connection; any additional ones are fallbacks (that get created if we are having + // problems with the primary connection). The highest-preference elements come first; the list is de-facto sorted + // by peerIdx, though peerIdx values can wrap around (given the ring/torus topology) and they can also be -1 + // (the latter are stored at the end). struct rasLinkConn* conns; - int nConns; - int connsSize; // Array size; could be larger than nConns. // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect // the peer on the other side to do so) but that peer failed to initiate. @@ -315,15 +347,15 @@ struct rasLink { // Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile // socket (described by the rasSocket structure), which can be affected by transient network issues. struct rasConnection { - bool inUse; + struct rasConnection* next; + struct rasConnection* prev; union ncclSocketAddress addr; - // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // Pointer to the current rasSocket. Note that multiple rasSocket entries may point back // to a single entry here, for sockets that are in the process of being terminated and re-established. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. - // -1 if there is no such socket. - int sockIdx; + // nullptr if there is no such socket. + struct rasSocket* sock; // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. @@ -371,16 +403,18 @@ typedef enum { // Describes a socket implementing communication between two peers. struct rasSocket { + struct rasSocket* next; + struct rasSocket* prev; + struct ncclSocket sock; rasSocketStatus status; int pfd; // Index in the rasPfds array. - // Index of the corresponding entry in the rasConns array. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. 
-  // -1 if there is no connection (normal condition on the accept side before the connInit message).
-  int connIdx;
+  // Pointer to the corresponding rasConnection entry.
+  // nullptr if there is no connection (a normal condition on the accept side before the connInit message).
+  struct rasConnection* conn;

   int64_t createTime;
   int64_t lastSendTime;
@@ -404,7 +438,10 @@ typedef enum {

 // Describes a RAS client.
 struct rasClient {
-  int sock;
+  struct rasClient* next;
+  struct rasClient* prev;
+
+  int sock; // File descriptor.
   rasClientStatus status;
@@ -420,7 +457,7 @@ struct rasClient {
   int64_t timeout;

   // State stored during asynchronous operations such as collectives.
-  int collIdx; // Index to the onging rasCollective.
+  struct rasCollective* coll;
 };
@@ -440,31 +477,33 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms
 ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent);
 ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed);
 ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock);
-void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone);
+void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone);
 ncclResult_t rasGetNewPollEntry(int* index);

 // rasnet.cc
 extern struct rasLink rasNextLink, rasPrevLink;
-extern struct rasConnection* rasConns;
-extern int nRasConns;
-extern struct rasSocket *rasSockets;
-extern int nRasSockets;
+extern struct rasConnection* rasConnsHead;
+extern struct rasConnection* rasConnsTail;
+extern struct rasSocket *rasSocketsHead;
+extern struct rasSocket *rasSocketsTail;

 ncclResult_t getNewConnEntry(struct rasConnection** pConn);
-ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx);
-int rasConnFind(const union ncclSocketAddress* addr);
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn);
+struct rasConnection* rasConnFind(const union ncclSocketAddress* addr);
 void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup);
 void rasConnDisconnect(const union ncclSocketAddress* addr);
 ncclResult_t rasNetAcceptNewSocket();
 void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup);
 void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0,
                         bool retry = true);
-void rasSockEventLoop(int sockIdx, int pollIdx);
+void rasSockEventLoop(struct rasSocket* sock, int pollIdx);
 void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup);
 ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock);
-ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false,
-                               bool insert = false, bool pretend = false, int* pLinkIdx = nullptr);
+ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn);
+ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx);
+void rasNetTerminate();
+
 // peers.cc
 extern struct rasPeerInfo* rasPeers;
@@ -483,29 +522,35 @@ ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr);
 bool rasPeerIsDead(const union ncclSocketAddress* addr);
 int ncclSocketsCompare(const void* p1, const void* p2);
 bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2);
+void rasPeersTerminate();

 // collectives.cc
-extern struct rasCollective* rasCollectives;
+extern struct rasCollective* rasCollectivesHead;
+extern struct
rasCollective* rasCollectivesTail; void rasCollReqInit(struct rasCollRequest* req); -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr, - int* pCollIdx = nullptr, int fromConnIdx = -1); +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone = nullptr, + struct rasCollective** pColl = nullptr, struct rasConnection* fromConn = nullptr); ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock); ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock); -void rasCollsPurgeConn(int connIdx); +void rasCollsPurgeConn(struct rasConnection* conn); void rasCollFree(struct rasCollective* coll); void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup); +void rasCollectivesTerminate(); + // client_support.cc extern int rasClientListeningSocket; -extern struct rasClient* rasClients; -extern int nRasClients; +extern struct rasClient* rasClientsHead; +extern struct rasClient* rasClientsTail; + ncclResult_t rasClientInitSocket(); ncclResult_t rasClientAcceptNewSocket(); ncclResult_t rasClientResume(struct rasCollective* coll); -void rasClientEventLoop(int clientIdx, int pollIdx); +void rasClientEventLoop(struct rasClient* client, int pollIdx); const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size); +void rasClientSupportTerminate(); #endif // !NCCL_RAS_CLIENT diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 441ad19..43aa042 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -13,90 +13,106 @@ struct rasLink rasNextLink = {1}, rasPrevLink = {-1}; // Connections on the RAS network. -struct rasConnection* rasConns; -int nRasConns; +struct rasConnection* rasConnsHead; +struct rasConnection* rasConnsTail; // Sockets implementing the RAS network. -struct rasSocket *rasSockets; -int nRasSockets; +struct rasSocket *rasSocketsHead; +struct rasSocket *rasSocketsTail; // Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but // I didn't want to use -1 because it has a special meaning for us. 
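// (POSIX poll() ignores pollfd entries whose fd is negative and reports no events for them,
// which is what makes parking unused entries on a negative value work.)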
#define POLL_FD_IGNORE -2 +static void freeConnEntry(struct rasConnection* conn); static void rasConnOpen(struct rasConnection* conn); static ncclResult_t rasConnPrepare(struct rasConnection* conn); static void rasConnTerminate(struct rasConnection* conn); static ncclResult_t getNewSockEntry(struct rasSocket** pSock); +static void freeSockEntry(struct rasSocket* sock); static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup); -static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup); +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup); static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false); -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx); static void rasConnResume(struct rasConnection* conn); static void rasLinkSanitizeFallbacks(struct rasLink* link); -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1); -static int rasLinkFindConn(const struct rasLink* link, int connIdx); +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend = false, + int* pLinkIdx = nullptr, struct rasLinkConn** pLinkConn = nullptr, + bool insert = true); +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx); +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external = false); +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx = nullptr); /////////////////////////////////////////////// // Functions related to the RAS connections. // /////////////////////////////////////////////// -// Allocates an entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasConnections list. ncclResult_t getNewConnEntry(struct rasConnection** pConn) { struct rasConnection* conn; - int i; - for (i = 0; i < nRasConns; i++) - if (!rasConns[i].inUse) - break; - if (i == nRasConns) { - NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT)); - nRasConns += RAS_INCREMENT; - } - conn = rasConns+i; - memset(conn, '\0', sizeof(*conn)); - conn->inUse = true; - conn->sockIdx = -1; + NCCLCHECK(ncclCalloc(&conn, 1)); + ncclIntruQueueConstruct(&conn->sendQ); conn->travelTimeMin = INT64_MAX; conn->travelTimeMax = INT64_MIN; + if (rasConnsHead) { + rasConnsTail->next = conn; + conn->prev = rasConnsTail; + rasConnsTail = conn; + } else { + rasConnsHead = rasConnsTail = conn; + } + *pConn = conn; return ncclSuccess; } +// Frees an entry from the rasConns list. +static void freeConnEntry(struct rasConnection* conn) { + if (conn == nullptr) + return; + + if (conn == rasConnsHead) + rasConnsHead = rasConnsHead->next; + if (conn == rasConnsTail) + rasConnsTail = rasConnsTail->prev; + if (conn->prev) + conn->prev->next = conn->next; + if (conn->next) + conn->next->prev = conn->prev; + free(conn); +} + // Creates a new RAS network connection to a remote peer address. -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn) { ncclResult_t ret = ncclSuccess; - struct rasConnection* conn = nullptr; + struct rasConnection* conn; // First check if a connection entry for this peer already exists. 
- int connIdx = rasConnFind(addr); - if (connIdx != -1) { - conn = rasConns+connIdx; - } + conn = rasConnFind(addr); - if (conn && conn->sockIdx != -1) { + if (conn && conn->sock) { // An entry exists and has a socket associated with it -- nothing left for us to do. - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; goto exit; } - if (!conn) { + if (conn == nullptr) { NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); memcpy(&conn->addr, addr, sizeof(conn->addr)); // We are establishing a new connection -- start the timeout. conn->startRetryTime = clockNano(); - connIdx = conn - rasConns; } - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; rasConnOpen(conn); @@ -107,7 +123,7 @@ exit: // Opens a connection to a remote peer. static void rasConnOpen(struct rasConnection* conn) { ncclResult_t ret; // Not used. - struct rasSocket* sock; + struct rasSocket* sock = nullptr; bool closeSocketOnFail = false; int ready; @@ -120,10 +136,8 @@ static void rasConnOpen(struct rasConnection* conn) { NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures - // we don't need to clean them up. - conn->sockIdx = sock-rasSockets; - sock->connIdx = conn-rasConns; + conn->sock = sock; + sock->conn = conn; rasPfds[sock->pfd].fd = sock->sock.fd; // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because @@ -141,6 +155,7 @@ exit: fail: if (closeSocketOnFail) (void)ncclSocketClose(&sock->sock); + freeSockEntry(sock); goto exit; } @@ -166,16 +181,13 @@ static ncclResult_t rasConnPrepare(struct rasConnection* conn) { } // Searches through rasConns for a connection with a provided address. -int rasConnFind(const union ncclSocketAddress* addr) { - // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way) - // so binary search won't do... - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) - return i; +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) + return conn; } - return -1; + return nullptr; } // Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled @@ -184,58 +196,56 @@ int rasConnFind(const union ncclSocketAddress* addr) { // This is also where we declare peers as dead, etc. // Invoked from the main RAS event loop. void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int connIdx = 0; connIdx < nRasConns; connIdx++) { - struct rasConnection* conn = rasConns+connIdx; - - if (!conn->inUse) - continue; - - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + if (conn->sock) { bool sockTerminated = false; // Retry the socket connections that have been refused. 
- if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) { - if (now - sock->lastSendTime > RAS_CONNECT_RETRY) { + if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) { + if (now - conn->sock->lastSendTime > RAS_CONNECT_RETRY) { int ready; - if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + if (ncclSocketReady(&conn->sock->sock, &ready) != ncclSuccess) { INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s", - ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true); + ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true); // We will retry below in the same loop. sockTerminated = true; } else { // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations. - sock->lastSendTime = clockNano(); - if (!ready && sock->sock.state == ncclSocketStateConnecting) - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + conn->sock->lastSendTime = clockNano(); + if (!ready && conn->sock->sock.state == ncclSocketStateConnecting) + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); else - rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop. + rasPfds[conn->sock->pfd].fd = conn->sock->sock.fd; // Enable the handling via the main loop. } // if (ncclSocketReady) } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); } - } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) + } // if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) // For connections that have data to send but that we've been unable to send a message on for a while, // consider their sockets lost and terminate them. - if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) { - if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { + if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) { + if (now - std::max(conn->sock->lastSendTime, + ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s", - (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / - CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT); + (now - std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / + CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/false, RAS_STUCK_TIMEOUT); // We will retry below in the same loop. 
} else { - *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, - ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT); + *nextWakeup = std::min(*nextWakeup, + std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+ + RAS_STUCK_TIMEOUT); } - } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (!ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) // For connections that are being (re-)established, irrespective of whether there's a valid socket associated - // with them (conn->startIdx != -1), we need to check if any connection-level timeout has expired. + // with them, we need to check if any connection-level timeout has expired. if (conn->startRetryTime) { + bool connTerminated = false; // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead // so that we don't try again. if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { @@ -248,82 +258,83 @@ void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { rasCollReqInit(&bCast); bCast.type = RAS_BC_DEADPEER; memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); - (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + (void)rasNetSendCollReq(&bCast); - continue; + connTerminated = true; } else { *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); } // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via - // the conn->sockIdx == -1 test). + // the conn->sock == nullptr test). - // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try - // to establish fallback connections. - if (now - conn->startRetryTime > RAS_CONNECT_WARN) { - if (!conn->experiencingDelays) { - INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", - (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + if (!connTerminated) { + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); - // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback - // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish - // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. - conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns may have been reallocated by the above calls. - conn = rasConns+connIdx; + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); - // Stop collectives from waiting for a response over it. 
- rasCollsPurgeConn(connIdx); - } // if (!conn->experiencingDelays) - } else { - *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); - } - - // If a socket was terminated (or never opened, due to some error), try to open it now. - // We retry once a second. - if (conn->sockIdx == -1) { - if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { - INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", - ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, - (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); - rasConnOpen(conn); + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(conn); + } // if (!conn->experiencingDelays) + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); } - if (conn->sockIdx == -1) - *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); - } + + // If a socket was terminated (or never opened, due to some error), try to open it now. + // We retry once a second. + if (conn->sock == nullptr) { + if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { + INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", + ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, + (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); + rasConnOpen(conn); + } + if (conn->sock == nullptr) + *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); + } + } // if (!connTerminated) } // if (conn->startRetryTime) - } // for (connIdx) + + conn = connNext; + } // for (conn) } // Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the // RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead. void rasConnDisconnect(const union ncclSocketAddress* addr) { - int connIdx = rasConnFind(addr); - if (connIdx != -1) { - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - rasLinkDropConn(&rasNextLink, connIdx); - rasLinkDropConn(&rasPrevLink, connIdx); + struct rasConnection* conn = rasConnFind(addr); + if (conn) { + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + rasLinkConnDrop(&rasNextLink, conn); + rasLinkConnDrop(&rasPrevLink, conn); - rasConnTerminate(rasConns+connIdx); + rasConnTerminate(conn); } } // Terminates a connection and frees the rasConns entry. static void rasConnTerminate(struct rasConnection* conn) { - int connIdx = conn - rasConns; - // Make sure there are no lingering rasSockets pointing to it. - for (int i = 0; i < nRasSockets; i++) { - struct rasSocket* sock = rasSockets+i; - if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx) + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (sock->conn == conn) rasSocketTerminate(sock, /*finalize*/true); + sock = sockNext; } // Also check any ongoing collectives. - rasCollsPurgeConn(connIdx); + rasCollsPurgeConn(conn); while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) { free(meta); @@ -331,8 +342,7 @@ static void rasConnTerminate(struct rasConnection* conn) { INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine)); - conn->inUse = false; - conn->sockIdx = -1; // Should be that way already, but just to be extra sure... 
+ freeConnEntry(conn); } @@ -344,7 +354,7 @@ static void rasConnTerminate(struct rasConnection* conn) { // corresponding rasConnection can't be established without knowing the peer's address. ncclResult_t rasNetAcceptNewSocket() { ncclResult_t ret = ncclSuccess; - struct rasSocket* sock; + struct rasSocket* sock = nullptr; int ready; bool socketInitialized = false; NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail); @@ -370,91 +380,98 @@ exit: fail: if (socketInitialized) NCCLCHECK(ncclSocketClose(&sock->sock)); + freeSockEntry(sock); goto exit; } -// Returns the index of the first available entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasSockets list. static ncclResult_t getNewSockEntry(struct rasSocket** pSock) { struct rasSocket* sock; - int i; - for (i = 0; i < nRasSockets; i++) - if (rasSockets[i].status == RAS_SOCK_CLOSED) - break; - if (i == nRasSockets) { - NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT)); - nRasSockets += RAS_INCREMENT; - } - sock = rasSockets+i; - memset(sock, '\0', sizeof(*sock)); + NCCLCHECK(ncclCalloc(&sock, 1)); + sock->pfd = -1; - sock->connIdx = -1; sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano(); + if (rasSocketsHead) { + rasSocketsTail->next = sock; + sock->prev = rasSocketsTail; + rasSocketsTail = sock; + } else { + rasSocketsHead = rasSocketsTail = sock; + } + *pSock = sock; return ncclSuccess; } +// Frees an entry from the rasSockets list. +static void freeSockEntry(struct rasSocket* sock) { + if (sock == nullptr) + return; + + if (sock == rasSocketsHead) + rasSocketsHead = rasSocketsHead->next; + if (sock == rasSocketsTail) + rasSocketsTail = rasSocketsTail->prev; + if (sock->prev) + sock->prev->next = sock->next; + if (sock->next) + sock->next->prev = sock->prev; + free(sock); +} + // Invoked from the main RAS event loop to handle RAS socket timeouts. void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; - if (sock->status == RAS_SOCK_CLOSED) - continue; - - // For socket connections that are still being established, give up on the ones that take too long to initialize. if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) { + // For socket connections that are still being established, give up on the ones that take too long to initialize. if (now - sock->createTime > RAS_STUCK_TIMEOUT) { - if (sock->connIdx == -1) { + if (sock->conn == nullptr) { INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s", (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); } else { - struct rasConnection* conn = rasConns+sock->connIdx; INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s " "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine), - conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0), - sock->status); + sock->conn->experiencingDelays, + (sock->conn->startRetryTime ? (now-sock->conn->startRetryTime)/1e9 : 0.0), sock->status); } rasSocketTerminate(sock, /*finalize*/true); // We may retry later. 
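For reference, getNewConnEntry()/getNewSockEntry() and freeConnEntry()/freeSockEntry() above all follow the same head/tail doubly-linked-list discipline that this patch adopts in place of the old realloc-grown arrays. A minimal standalone C++ sketch of that discipline (Node, listHead, listTail, listAppend and listRemove are illustrative names, not part of the patch):

    // Minimal sketch of the list discipline used by the new entry allocators/finalizers.
    #include <cstdlib>

    struct Node {
      struct Node* prev;
      struct Node* next;
    };

    static struct Node* listHead;
    static struct Node* listTail;

    // Like getNewConnEntry()/getNewSockEntry(): a zero-filled entry appended at the tail.
    static struct Node* listAppend(void) {
      struct Node* node = (struct Node*)calloc(1, sizeof(*node));
      if (node == nullptr) return nullptr;
      if (listHead) {
        listTail->next = node;
        node->prev = listTail;
        listTail = node;
      } else {
        listHead = listTail = node;
      }
      return node;
    }

    // Like freeConnEntry()/freeSockEntry(): unlink (head, tail, or middle), then free.
    static void listRemove(struct Node* node) {
      if (node == nullptr) return;
      if (node == listHead) listHead = listHead->next;
      if (node == listTail) listTail = listTail->prev;
      if (node->prev) node->prev->next = node->next;
      if (node->next) node->next->prev = node->prev;
      free(node);
    }

Once entries are freed in place like this, any traversal that may free the current element has to save its successor first, which is why the loops in this file switch to the connNext/sockNext pattern.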
- continue; } else { *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT); } - } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) - - // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. - if (sock->status == RAS_SOCK_TERMINATING) { + } else if (sock->status == RAS_SOCK_TERMINATING) { + // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s", (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/true); // This socket is presumably already being re-established, if needed. - continue; } else { *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT); } - } // if (sock->status == RAS_SOCK_TERMINATING) - - // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything - // important due to shorter timeouts on RAS network connections, but in case of weird situations like process - // suspend, rasSocketTerminate will do additional checking. - if (sock->status == RAS_SOCK_READY) { + } else if (sock->status == RAS_SOCK_READY) { + // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything + // important due to shorter timeouts on RAS network connections, but in case of weird situations like process + // suspend, rasSocketTerminate will do additional checking. if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) { INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s", (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false); - continue; // The RAS network timeout handler will terminate the conn it was associated with, if any. } else { *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT); } } // if (sock->status == RAS_SOCK_READY) - } // for (sockIdx) + + sock = sockNext; + } // for (sock) } // Handles the termination of a RAS socket. @@ -464,19 +481,19 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For not fully established sockets, we can terminate immediately as there's no useful data to extract. void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) { assert(sock->status != RAS_SOCK_CLOSED); - if (sock->connIdx != -1) { - struct rasConnection* conn = rasConns+sock->connIdx; - // If the sockIdx of the connection points back to us, it means that we are the current socket of this + if (sock->conn) { + struct rasConnection* conn = sock->conn; + // If the sock of the connection points back to us, it means that we are the current socket of this // connection, so we have additional work to do before we can terminate it. - if (conn->sockIdx == sock-rasSockets) { + if (conn->sock == sock) { // Reset it to indicate there's no valid socket associated with that connection anymore. 
-      conn->sockIdx = -1;
+      conn->sock = nullptr;
 
       // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably
       // deliberately closed them. Make an exception for sockets that are part of the RAS network links.
       if ((retry && clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) <
            RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) ||
-          rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) {
+          rasLinkConnFind(&rasNextLink, sock->conn) || rasLinkConnFind(&rasPrevLink, sock->conn)) {
         // For connections that were fine until now, the connection-level timeout starts at termination, and possibly
         // even earlier, depending on what event triggered the termination -- if it was another timeout expiring, then
         // we need to include that timeout as well.
@@ -507,11 +524,11 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
       } // if (retry)
 
       // Stop collectives from waiting for a response over this connection.
-      rasCollsPurgeConn(sock->connIdx);
-    } // if (conn->sockIdx == sock-rasSockets)
-  } // if (sock->connIdx != -1)
+      rasCollsPurgeConn(sock->conn);
+    } // if (conn->sock == sock)
+  } // if (sock->conn)
 
-  if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
+  if (sock->status != RAS_SOCK_CONNECTING && sock->conn && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
     if (sock->status != RAS_SOCK_TERMINATING) {
       // The receiving side is still open -- close just the sending side.
       (void)ncclSocketShutdown(&sock->sock, SHUT_WR);
@@ -525,20 +542,15 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
   } else {
     // Either the caller requested finalization or we cannot receive on it.
     (void)ncclSocketClose(&sock->sock);
-    sock->status = RAS_SOCK_CLOSED;
     rasPfds[sock->pfd].fd = -1;
     rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0;
-    sock->pfd = sock->connIdx = -1;
-    sock->recvOffset = sock->recvLength = 0;
     free(sock->recvMsg);
-    sock->recvMsg = nullptr;
+    freeSockEntry(sock);
   }
 }
 
 // Handles a ready socket FD from the main event loop.
-void rasSockEventLoop(int sockIdx, int pollIdx) {
-  struct rasSocket* sock = rasSockets+sockIdx;
-
+void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
   if (sock->status == RAS_SOCK_CONNECTING) {
     int ready;
     // Socket is not yet fully established. Continue the OS or NCCL-level handshake.
@@ -554,15 +566,15 @@ void rasSockEventLoop(int sockIdx, int pollIdx) {
       (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano();
       sock->status = RAS_SOCK_HANDSHAKE;
       if (connectSide) {
-        assert(sock->connIdx != -1);
-        if (rasConns[sock->connIdx].sockIdx == sockIdx) {
-          if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) {
+        assert(sock->conn);
+        if (sock->conn->sock == sock) {
+          if (rasConnPrepare(sock->conn) != ncclSuccess) {
             INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s",
                  ncclSocketToString(&sock->sock.addr, rasLine));
             rasSocketTerminate(sock);
             // We may retry further down.
           }
-        } else {
+        } else { // sock->conn->sock != sock
           // The connection this socket is associated with no longer considers it to be the current one.
           // This could possibly happen due to a race condition. Simply terminate it.
INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!", @@ -581,10 +593,9 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) { int closed = 0; bool allSent = false; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; - assert(conn->sockIdx == sockIdx); - if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) { + assert(sock->conn); + assert(sock->conn->sock == sock); + if (rasConnSendMsg(sock->conn, &closed, &allSent) != ncclSuccess) { INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s", ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock); @@ -612,9 +623,9 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { // We may retry further down. } else if (closed) { const char* socketType; - if (sock->connIdx == -1) + if (sock->conn == nullptr) socketType = "incoming"; - else if (rasConns[sock->connIdx].sockIdx != sockIdx) + else if (sock->conn->sock != sock) socketType = "old"; else if (sock->status == RAS_SOCK_HANDSHAKE) socketType = "new"; @@ -624,25 +635,21 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { socketType, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/true); // We may retry further down. - } else { + } else { // !closed sock->lastRecvTime = clockNano(); if (msg) { (void)rasMsgHandle(msg, sock); free(msg); - // Message handlers can terminate a socket in certain cases; we need to check for - // that here so that we don't try to receive from a closed socket. - // No handlers are currently believed to create new sockets but better to be safe than sorry - // and re-init the sock variable. - sock = rasSockets+sockIdx; - if (sock->status == RAS_SOCK_CLOSED) + // Message handlers can terminate a socket in various cases. We re-check rasPfds.events to ensure that + // this hasn't happened here (rasSocketTerminate will reset it when finalizing a socket). + if (!(rasPfds[pollIdx].revents & POLLIN)) break; } - if (sock->connIdx != -1) { - struct rasConnection* conn = rasConns+sock->connIdx; - if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays)) - rasConnResume(conn); + if (sock->conn) { + if (sock->conn->sock == sock && (sock->conn->startRetryTime || sock->conn->experiencingDelays)) + rasConnResume(sock->conn); } - } + } // !closed } while (msg); } // if (POLLIN) } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING @@ -658,109 +665,95 @@ void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) { // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish // connections that are part of a link from those that are not. 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
   (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup);
   (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup);
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
-    struct rasConnection* conn = rasConns+connIdx;
-    if (conn->inUse && !conn->linkFlag) {
+  for (struct rasConnection* conn = rasConnsHead; conn;) {
+    struct rasConnection* connNext = conn->next;
+    if (!conn->linkFlag) {
       // The connection is not part of any link. Check if it should be terminated.
-      if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) {
+      if (conn->sock == nullptr && ncclIntruQueueEmpty(&conn->sendQ))
         rasConnTerminate(conn);
-        continue;
-      }
     }
+    conn = connNext;
   }
 }
 
 // Checks for and handles timeouts at the link level; primarily the keep-alives for link connections.
 static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1) {
-      if (!rasConns[linkConn->connIdx].linkFlag) {
-        rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup);
-        // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here.
-        // For the same reason we re-init linkConn.
-        linkConn = link->conns+i;
-        rasConns[linkConn->connIdx].linkFlag = true;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn) {
+      if (!linkConn->conn->linkFlag) {
+        rasConnHandleNetTimeouts(linkConn->conn, now, nextWakeup);
+        linkConn->conn->linkFlag = true;
       }
-    } else if (i == 0 && link->lastUpdatePeersTime != 0) {
+    } else if (linkConn == link->conns && link->lastUpdatePeersTime != 0) {
       // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address
       // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action.
       if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) {
         INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s",
              (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
-        if (linkConn->connIdx != -1) {
-          rasConns[linkConn->connIdx].linkFlag = true;
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn));
+        if (linkConn->conn) {
+          linkConn->conn->linkFlag = true;
         }
-        // We used to connect to the first fallback but I think trying to connect to the calculated primary first
-        // in this case is more intuitive.
-        //(void)rasLinkTryFallback(link, -1);
         link->lastUpdatePeersTime = 0;
       } else {
         *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN);
       }
-    } // if (i == 0 && link->lastUpdatePeerTime != 0)
-  } // for (i)
+    } // if (linkConn == link->conns && link->lastUpdatePeersTime != 0)
+  } // for (linkConn)
 
   return ncclSuccess;
 }
 
 // Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links.
-static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) { - struct rasConnection* conn = rasConns+connIdx; - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - - if (sock->status == RAS_SOCK_READY) { +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup) { + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY) { // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued. if (ncclIntruQueueEmpty(&conn->sendQ)) { - if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { + if (now - conn->sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { rasConnSendKeepAlive(conn); } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); } } // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections. - if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { + if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { if (!conn->experiencingDelays) { INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s", - (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); // At this point, it's mostly just a precaution; we will continue with the primary connection until // RAS_PEER_DEAD_TIMEOUT expires. conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns and rasSockets may have been reallocated by the above calls. - conn = rasConns+connIdx; - sock = rasSockets+conn->sockIdx; + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); - // Stop collectives from waiting for a response over it. - rasCollsPurgeConn(connIdx); + // Stop ongoing collectives from waiting for a response over this connection. + rasCollsPurgeConn(conn); } } else { - *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); } // For long timeouts we need to act. - if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { + if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s", - (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); + (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait. } else { - *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR); } - } // if (sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) } // Sends a keep-alive message to a peer on the RAS network. 
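rasConnHandleNetTimeouts() above also shows the deadline convention shared by all of these RAS timeout handlers: when a deadline has not expired yet, the handler lowers *nextWakeup so that the main poll() loop sleeps no longer than the nearest pending deadline. A minimal sketch of that convention, assuming a hypothetical 5-second interval (all names below are illustrative, not part of the patch):

    #include <algorithm>
    #include <cstdint>

    // Hypothetical 5s keep-alive interval in nanoseconds, for illustration only.
    static const int64_t kInterval = 5LL*1000*1000*1000;

    // now and lastSendTime are clockNano()-style timestamps.
    static void checkKeepAliveDeadline(int64_t now, int64_t lastSendTime, int64_t* nextWakeup) {
      if (now - lastSendTime > kInterval) {
        // Deadline expired: act (the keep-alive send is elided here) and make sure
        // the loop comes back no later than one interval from now.
        *nextWakeup = std::min(*nextWakeup, now + kInterval);
      } else {
        // Not expired yet: wake up exactly when the deadline would expire.
        *nextWakeup = std::min(*nextWakeup, lastSendTime + kInterval);
      }
    }

This keeps the event loop purely timer-driven without a separate timer data structure: every handler simply reports its nearest deadline through *nextWakeup.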
@@ -768,17 +761,17 @@ static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
   struct rasMsg* msg = nullptr;
   int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
   if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
-    int linkIdx;
+    struct rasLinkConn* linkConn;
     msg->type = RAS_MSG_KEEPALIVE;
     msg->keepAlive.peersHash = rasPeersHash;
     msg->keepAlive.deadPeersHash = rasDeadPeersHash;
     msg->keepAlive.nack = (nack ? 1 : 0);
 
-    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
-    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasNextLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
-    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
-    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasPrevLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
 
     (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
@@ -793,46 +786,51 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s
   int64_t travelTime;
   int peerIdx;
 
-  assert(sock->connIdx != -1);
-  struct rasConnection* conn = rasConns+sock->connIdx;
+  assert(sock->conn);
 
   SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
   travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
                (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
 
-  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
-    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  if (msg->keepAlive.peersHash != sock->conn->lastRecvPeersHash) {
+    sock->conn->lastRecvPeersHash = msg->keepAlive.peersHash;
   }
-  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
-    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  if (msg->keepAlive.deadPeersHash != sock->conn->lastRecvDeadPeersHash) {
+    sock->conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
   }
 
   // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
   // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
   // needed).
-  peerIdx = rasPeerFind(&conn->addr);
+  peerIdx = rasPeerFind(&sock->conn->addr);
   // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
   // the peers update.
-  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
-  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  if (msg->keepAlive.linkMask & 1)
+    (void)rasLinkConnAddExternal(&rasNextLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasNextLink, sock->conn, /*external*/true);
+  if (msg->keepAlive.linkMask & 2)
+    (void)rasLinkConnAddExternal(&rasPrevLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasPrevLink, sock->conn, /*external*/true);
 
   // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
   // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
-  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
-  // will have wiped any external fallbacks, so anything that remains must be needed.
+ // and because we stopped sending the keep-alives, our peer doesn't know about it. The rasLinkConnDrop calls + // above will have wiped any external fallbacks, so anything that remains must be needed. if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) { - if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) { + if (rasLinkConnFind(&rasNextLink, sock->conn) == nullptr && rasLinkConnFind(&rasPrevLink, sock->conn) == nullptr) { // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the // special nack flag in the message to distinguish it from regular keep-alives. - rasConnSendKeepAlive(conn, /*nack*/true); + rasConnSendKeepAlive(sock->conn, /*nack*/true); } } - if (conn->travelTimeMin > travelTime) - conn->travelTimeMin = travelTime; - if (conn->travelTimeMax < travelTime) - conn->travelTimeMax = travelTime; - conn->travelTimeSum += travelTime; - conn->travelTimeCount++; + if (sock->conn->travelTimeMin > travelTime) + sock->conn->travelTimeMin = travelTime; + if (sock->conn->travelTimeMax < travelTime) + sock->conn->travelTimeMax = travelTime; + sock->conn->travelTimeSum += travelTime; + sock->conn->travelTimeCount++; if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) { // This could happen due to a short-lived race condition between the peers propagation @@ -842,7 +840,7 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)", ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash); INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash); - NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + NCCLCHECK(rasConnSendPeersUpdate(sock->conn, rasPeers, nRasPeers)); } return ncclSuccess; } @@ -857,100 +855,104 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s // External connections are generally ignored by this whole process: in particular, we don't add fallbacks for // timing out external connections. However, we will use an active external connection if it would be a better // option than whatever we can come up with. -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { - int peerIdx = -1; - int linkIdx = -1; +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn) { + struct rasLinkConn* foundLinkConn = nullptr; + struct rasLinkConn* firstExtLinkConn = nullptr; int firstExtLinkIdx = -1; - int newPeerIdx; + int newPeerIdx, i; // First check if the connection is part of this link. In the process also check if any of the link's connections // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out. - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { if (linkConn->peerIdx == -1) { - // Such elements are always at the very end of the array and we can't use them so we can just as well break. + // Such elements are always at the end and we can't use them so we can just as well break. break; } // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing // delays). 
- if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; - if (!conn->experiencingDelays) { - if (!linkConn->external) + if (linkConn->conn && linkConn->conn != conn) { + if (!linkConn->conn->experiencingDelays) { + if (!linkConn->external) { goto exit; // We don't need to do anything if there's a non-external connection. - else if (linkConn->peerIdx != -1) { + } else if (linkConn->peerIdx != -1) { // Record the location of the first potentially viable external connection in the chain; we may prefer it // over anything we can come up with. - if (firstExtLinkIdx == -1) + if (firstExtLinkConn == nullptr) { + firstExtLinkConn = linkConn; firstExtLinkIdx = i; - if (linkIdx != -1) + } + if (foundLinkConn) break; // Break out of the loop if we already have all the data we might need. } // linkConn->external && linkConn->peerIdx != -1 - } // if (!conn->experiencingDelays) - } // if (linkConn->connIdx != -1) + } // if (!linkConn->conn->experiencingDelays) + } // if (linkConn->conn && linkConn->conn != conn) - if (linkConn->connIdx == connIdx) { + if (linkConn->conn == conn) { if (linkConn->external) goto exit; // We don't add fallbacks for external connections... - peerIdx = linkConn->peerIdx; - linkIdx = i; + foundLinkConn = linkConn; // We are not breaking out of the loop here because we want to check for active connections on *all* potentially // viable elements (in particular, there could be some external ones beyond this one). } } - if (linkIdx == -1) + if (foundLinkConn == nullptr) goto exit; // We found an existing element so the connection is part of the link. No existing non-external connections of this // link are active, so a fallback is needed. - assert(peerIdx != -1); - newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0); + assert(foundLinkConn->peerIdx != -1); + newPeerIdx = rasLinkCalculatePeer(link, foundLinkConn->peerIdx, /*isFallback*/(foundLinkConn != link->conns)); // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists // and is also experiencing delays, we need to keep iterating. while (newPeerIdx != -1) { - int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr); + struct rasConnection* newConn = rasConnFind(&rasPeers[newPeerIdx].addr); + int linkIdx; + struct rasLinkConn* newLinkConn; // If we previously found a potential external fallback connection, check if it's better than what we just found. - if (firstExtLinkIdx != -1) { + if (firstExtLinkConn) { linkIdx = -1; // Calculate the index that the newly found fallback would have (pretend mode). - NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true, - &linkIdx)); + NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/true, &linkIdx)); assert(linkIdx != -1); if (firstExtLinkIdx < linkIdx) { // The external connection *is* better -- use it as a fallback instead and be done. - link->conns[firstExtLinkIdx].external = false; + firstExtLinkConn->external = false; goto exit; } } - NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false, - &linkIdx)); - if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx) - firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index. 
+ NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/false, &linkIdx, &newLinkConn)); + if (firstExtLinkConn && linkIdx <= firstExtLinkIdx) + firstExtLinkIdx++; // Adjust if we inserted a new entry ahead of this one. INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s", - link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"), + link->direction, (newConn == nullptr ? "opening new" : "calculated existing"), linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine)); // Note that we don't follow here our convention of "lower address is the one establishing connections" -- // that convention is for optimizing regular operations, but we don't want to take chances during fault // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those. - if (newConnIdx == -1) - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx)); + if (newConn == nullptr) { + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &newConn)); + newLinkConn->conn = newConn; + } - struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx; // If the fallback connection is also experiencing delays, we need to keep trying. - if (!conn->experiencingDelays) + if (!newConn->experiencingDelays) break; INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + newConn->experiencingDelays, (newConn->startRetryTime ? (clockNano()-newConn->startRetryTime)/1e9 : 0.0), + (newConn->sock ? newConn->sock->status : -1)); newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true); } - if (newPeerIdx == -1) - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); + if (newPeerIdx == -1) { + int nConns = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) + nConns++; + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (total %d)", link->direction, nConns); + } exit: return ncclSuccess; } @@ -958,7 +960,7 @@ exit: // Invoked when we receive a message over a connection that was just activated or was experiencing delays. // Cleans up the fallbacks, timers, etc, as appropriate. static void rasConnResume(struct rasConnection* conn) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (conn->experiencingDelays && conn->startRetryTime == 0 ? "recovered" : "established"), ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), @@ -972,218 +974,362 @@ static void rasConnResume(struct rasConnection* conn) { rasLinkSanitizeFallbacks(&rasPrevLink); if (!ncclIntruQueueEmpty(&conn->sendQ)) - rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT; + rasPfds[conn->sock->pfd].events |= POLLOUT; } } // Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed). 
static void rasLinkSanitizeFallbacks(struct rasLink* link) { - if (link->nConns > 0 && link->conns[0].connIdx != -1) { - struct rasConnection* conn = rasConns+link->conns[0].connIdx; - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + if (link->conns && link->conns->conn) { + struct rasConnection* conn = link->conns->conn; + if (conn->sock && conn->sock->status == RAS_SOCK_READY && !conn->experiencingDelays) { // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the // keepAlive messages). - for (int i = 1; i < link->nConns; i++) { + int i = 1; + for (struct rasLinkConn* linkConn = link->conns->next; linkConn; i++) { + struct rasLinkConn* linkConnNext = linkConn->next; INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", - link->direction, (link->conns[i].external ? "external " : ""), i, - ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine)); + link->direction, (linkConn->external ? "external " : ""), i, + ncclSocketToString(&linkConn->conn->addr, rasLine)); + free(linkConn); + linkConn = linkConnNext; } - link->nConns = 1; + link->conns->next = nullptr; link->lastUpdatePeersTime = 0; } } } -// Attempt to drop a connection from a link. -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) { - if (linkIdx == -1) - linkIdx = rasLinkFindConn(link, connIdx); - if (linkIdx != -1) { - if (linkIdx == 0) { - INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", - link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", - link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - } - memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns)); - if (link->nConns > 1) - link->nConns--; - else { - link->conns[0].peerIdx = link->conns[0].connIdx = -1; - } - - if (linkIdx == 0) { - // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if - // the remote peer loses interest in it). - link->conns[0].external = false; - if (link->conns[0].connIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", - link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine)); - } - rasLinkSanitizeFallbacks(link); - } - } -} - -// Checks if a given connection is a member of this link and if so, returns its entry index. -// Returns -1 if connection not found. -static int rasLinkFindConn(const struct rasLink* link, int connIdx) { - for (int i = 0; i < link->nConns; i++) { - if (link->conns[i].connIdx == connIdx) - return i; - } - return -1; -} - -// Note: the behavior of this function has become super-complex and so it should be considered for refactoring. -// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is -// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also -// be -1 (the latter are stored at the end). -// external provides an updated value for the entry's external field. 
A false value, if requested, is always set; -// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry -// already exists and the function is invoked with external == true, the new value will be ignored. -// If insert is set, it will, if necessary, insert a new entry if one is not already there. -// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate. -// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored. -// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external). -// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed -// (the entry's external must match the argument external for it to be removed). -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert, - bool pretend, int* pLinkIdx) { +// Adds an entry to a RAS network link (or updates one, if it already exists). +// conn can be nullptr if the connection doesn't exist (yet). +// peerIdx *cannot* be -1 when this function is invoked. +// If pretend is true, the function will not modify the list and will just set *pLinkIdx and *pLinkConn as appropriate. +// pLinkIdx and pLinkConn are (optional) pointers to the results; the index/address of the added/updated entry are +// stored there. +// insert (true by default) determines whether this is an "add" function (as implied by the name) or an "update" -- +// if set to false, it will refuse to add a new entry (but will update an existing one as needed). +// Note: there is some code duplication between this function and rasLinkConnAddExternal so changes to one of them +// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the +// logic was extremely difficult to follow then. +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend, + int* pLinkIdx, struct rasLinkConn** pLinkConn, bool insert) { + struct rasLinkConn* oldLinkConn = nullptr; + struct rasLinkConn* linkConnPrev = nullptr; int i, oldLinkIdx = -1; - if (external && connIdx != -1) - insert = true; + assert(peerIdx != -1); + if (conn) { + // Start by checking if we already have an element with this conn. + oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx); + if (oldLinkConn) { + if (pLinkConn) + *pLinkConn = oldLinkConn; + if (oldLinkConn->peerIdx != -1) { + assert(oldLinkConn->peerIdx == peerIdx); - if (connIdx != -1) { - // Start by checking if we already have an element with this connIdx. - oldLinkIdx = rasLinkFindConn(link, connIdx); - if (oldLinkIdx != -1) { - struct rasLinkConn* linkConn = link->conns+oldLinkIdx; - if (linkConn->peerIdx != -1) - assert(linkConn->peerIdx == peerIdx); - - if (linkConn->peerIdx == peerIdx) { - if (!external && !pretend) - linkConn->external = false; // Ensure that external is cleared if so requested. + if (!pretend) + oldLinkConn->external = false; // Ensure that external is cleared. if (pLinkIdx) *pLinkIdx = oldLinkIdx; - goto exit; // Nothing more to do if both connIdx and peerIdx are up to date. - } + goto exit; // Nothing more to do if both conn and peerIdx are up to date. + } // if (oldLinkConn->peerIdx != -1) - // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong - // place in the array -- we need to find the right spot. 
linkConn->peerIdx == -1 can only happen for external - // connections. - assert(external); - } - } + // Otherwise oldLinkConn->peerIdx == -1. The oldLinkConn is in a wrong place in the list -- we need to find + // the right spot. This can happen only for external connections. + } // if (oldLinkConn) + } // if (conn) - if (peerIdx != -1) { - // Search for the right spot in the conns array. - for (i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - if (peerIdx != -1 && linkConn->peerIdx == peerIdx) { - // The exact conn element already exists. - if (connIdx == -1 && !insert) { - // Drop the connection from the link. - if (linkConn->external == external) { - if (!pretend) - rasLinkDropConn(link, linkConn->connIdx, i); - else if (pLinkIdx) - *pLinkIdx = i; - } - } else { // connIdx != -1 || insert - if (!pretend) { - if (linkConn->connIdx != -1) - assert(linkConn->connIdx == connIdx); - else - linkConn->connIdx = connIdx; - if (!external) - linkConn->external = false; // Ensure that external is cleared if so requested. - if (i == 0) { - // We received a connection from the remote peer that matches the primary connection we've been - // waiting for. - rasLinkSanitizeFallbacks(link); - } - } // if (!pretend) - if (pLinkIdx) - *pLinkIdx = i; - } // connIdx != -1 || insert + // Search for the right spot in the conns list. + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (linkConn->peerIdx == peerIdx) { + // The exact linkConn element already exists. + if (linkConn->conn) + assert(linkConn->conn == conn); + if (!pretend) { + if (linkConn->conn == nullptr) + linkConn->conn = conn; + linkConn->external = false; // Ensure that external is cleared. + if (linkConn == link->conns) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. + rasLinkSanitizeFallbacks(link); + } + } // if (!pretend) + if (pLinkIdx) + *pLinkIdx = i; + if (pLinkConn) + *pLinkConn = linkConn; + goto exit; + } // if (linkConn->peerIdx == peerIdx) - goto exit; - } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx) - if (!insert) - continue; - // Ensure that the i-1 index is also valid. - if (i == 0) - continue; - // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them. - if (peerIdx != -1 && linkConn->peerIdx == -1) + // Ensure that the previous element is valid. + if (linkConnPrev == nullptr) + continue; + // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done. + if (linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. + if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) break; - // Detect a roll-over and handle it specially. - if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) { - if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 || - link->direction * (peerIdx - linkConn->peerIdx) < 0) - break; - } else { // Regular, monotonic case with the peerIdx value between two existing elements. - if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 && - link->direction * (peerIdx - linkConn->peerIdx) < 0) - break; - } - } // for (i) - } else { - // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections. 
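[Editor's aside, not part of the patch: the direction-aware ordering test used by the new list-based search above (and by the old array-based code being removed below) is easier to verify in isolation. A minimal C sketch under stated assumptions -- fitsBetween is a hypothetical name, direction is +1 or -1, and peer indices are plain ints on a ring:

    #include <stdbool.h>

    // Decides whether peerIdx belongs between two consecutive entries (prevIdx, curIdx)
    // when walking the ring in the given direction (+1 or -1) -- the same test as above.
    static bool fitsBetween(int direction, int prevIdx, int curIdx, int peerIdx) {
      if (direction * (prevIdx - curIdx) > 0) {
        // Roll-over: the ring wraps between prevIdx and curIdx, so anything past prevIdx
        // or before curIdx belongs here.
        return direction * (peerIdx - prevIdx) > 0 || direction * (peerIdx - curIdx) < 0;
      }
      // Regular, monotonic case: peerIdx must fall strictly between the two entries.
      return direction * (peerIdx - prevIdx) > 0 && direction * (peerIdx - curIdx) < 0;
    }

For example, with direction == 1 and neighboring entries 6 and 2 (a wrap), peerIdx 7 and peerIdx 1 both fit between them, while peerIdx 4 does not.]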
-    assert(external && oldLinkIdx == -1);
-    i = link->nConns;
-  }
-  if (!insert)
-    goto exit;
+    } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 &&
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    }
+  } // for (linkConn)
-  // i holds the index at which to insert a new element.
-  if (pretend) {
-    if (pLinkIdx)
-      *pLinkIdx = i;
-    goto exit;
-  }
-
-  if (oldLinkIdx == -1) {
-    struct rasLinkConn* linkConn;
-    if (link->nConns == link->connsSize) {
-      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
-      link->connsSize += RAS_INCREMENT;
-    }
-    linkConn = link->conns+i;
-    // Shift existing conns with indices >= i to make room for the new one.
-    memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns));
-    linkConn->peerIdx = peerIdx;
-    linkConn->connIdx = connIdx;
-    linkConn->external = external;
-    if (external) {
-      INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i,
-           ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine));
-    }
-    link->nConns++;
-  }
-  else { // oldLinkIdx > -1
-    // We already have the conn, we just need to move it to a new spot.
-    struct rasLinkConn* linkConn = link->conns+i;
-    assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1.
-    if (i != oldLinkIdx) {
-      struct rasLinkConn tmp;
-      struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler.
-      // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns
-      // with indices in the range [i, oldLinkIdx).
-      memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp));
-      memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn));
-      memcpy(linkConn, &tmp, sizeof(*linkConn));
-    }
-    if (!external)
-      linkConn->external = false; // Ensure that external is cleared if so requested.
-  } // oldLinkIdx > -1
+  // The new element should be inserted after linkConnPrev (which is at index i-1).
   if (pLinkIdx)
     *pLinkIdx = i;
+  if (pretend)
+    goto exit;
+
+  if (oldLinkConn) {
+    if (i != oldLinkIdx) {
+      // We already have the entry, but we need to move it to a new spot (which must be earlier in the list).
+      assert(i < oldLinkIdx);
+      // Remove oldLinkConn from its old spot.
+      for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) {
+        if (linkConn->next == oldLinkConn) {
+          linkConn->next = oldLinkConn->next;
+          break;
+        }
+      } // for (linkConn)
+      // Insert it at its new spot.
+      oldLinkConn->next = linkConnPrev->next;
+      linkConnPrev->next = oldLinkConn;
+    } // if (i != oldLinkIdx)
+    oldLinkConn->peerIdx = peerIdx;
+    oldLinkConn->external = false;
+  } else if (insert) {
+    struct rasLinkConn* linkConn;
+    NCCLCHECK(ncclCalloc(&linkConn, 1));
+    if (linkConnPrev) {
+      linkConn->next = linkConnPrev->next;
+      linkConnPrev->next = linkConn;
+    } else {
+      assert(link->conns == nullptr); // We never add an element that would replace an existing primary.
+      link->conns = linkConn;
+      // linkConn->next is already nullptr.
+    }
+    linkConn->peerIdx = peerIdx;
+    linkConn->conn = conn;
+    linkConn->external = false;
+    if (pLinkConn)
+      *pLinkConn = linkConn;
+  } // oldLinkConn == nullptr && insert
+
 exit:
   return ncclSuccess;
 }
+
+// Adds an external entry to a RAS network link (or updates one, if it already exists).
+// conn *cannot* be nullptr when this function is invoked.
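+// Unlike rasLinkConnAdd, a newly created entry is marked external unless it becomes the new primary
+// (primary connections are never external).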
+// peerIdx can be -1 if unknown (possible in case of a race condition between keepAlive and peers update).
+// Note: there is some code duplication between this function and rasLinkConnAdd so changes to one of them
+// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the
+// logic was extremely difficult to follow then.
+static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx) {
+  struct rasLinkConn* oldLinkConn = nullptr;
+  struct rasLinkConn* linkConnPrev = nullptr;
+  int i, oldLinkIdx = -1;
+
+  assert(conn);
+  oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx);
+  if (oldLinkConn) {
+    if (oldLinkConn->peerIdx != -1)
+      assert(oldLinkConn->peerIdx == peerIdx);
+
+    if (oldLinkConn->peerIdx == peerIdx)
+      goto exit; // Nothing more to do if both conn and peerIdx are up to date. Note that we neither check nor
+                 // update the value of external here.
+
+    // Otherwise (oldLinkConn->peerIdx == -1 && peerIdx != -1) oldLinkConn, due to its -1 peerIdx, is in
+    // a wrong place in the list -- we need to find the right spot. oldLinkConn->peerIdx == -1 can only happen for
+    // external connections.
+  } // if (oldLinkConn)
+
+  // Search for the right spot in the conns list.
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (peerIdx == -1) {
+      // We simply want to find the end of the list so that we can insert a new entry with -1 peerIdx there.
+      continue;
+    }
+    if (linkConn->peerIdx == peerIdx) {
+      // The exact linkConn element already exists.
+      if (linkConn->conn)
+        assert(linkConn->conn == conn);
+      if (linkConn->conn == nullptr)
+        linkConn->conn = conn;
+      if (linkConn == link->conns) {
+        // We received a connection from the remote peer that matches the primary connection we've been
+        // waiting for. This shouldn't trigger for external connections (rasLinkConnUpdate should be invoked first,
+        // which will update the entry's conn, so rasLinkConnFind invoked at the top of this function should succeed),
+        // but better safe than sorry...
+        rasLinkSanitizeFallbacks(link);
+      }
+      goto exit;
+    } // if (linkConn->peerIdx == peerIdx)
+
+    // Ensure that the previous element is valid.
+    if (linkConnPrev == nullptr)
+      continue;
+    // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done.
+    if (linkConn->peerIdx == -1)
+      break;
+    // Detect a roll-over and handle it specially.
+    if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) {
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 ||
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 &&
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    }
+  } // for (linkConn)
+
+  // The new element should be inserted after linkConnPrev (which is at index i-1).
+  if (oldLinkConn) {
+    if (i != oldLinkIdx) {
+      // We already have the entry, but we need to move it to a new spot (which must be earlier in the list).
+      assert(i < oldLinkIdx);
+      INFO(NCCL_RAS, "RAS link %d: moving %sfallback connection with %s from %d to %d", link->direction,
+           (oldLinkConn->external ? "external " : ""), ncclSocketToString(&conn->addr, rasLine), oldLinkIdx, i);
+      // Remove oldLinkConn from its old spot.
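+      // (Starting the walk at linkConnPrev is safe: i < oldLinkIdx, so the old spot lies further down the list.)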
+ for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) { + if (linkConn->next == oldLinkConn) { + linkConn->next = oldLinkConn->next; + break; + } + } // for (linkConn) + // Insert it at its new spot. + oldLinkConn->next = linkConnPrev->next; + linkConnPrev->next = oldLinkConn; + } // if (i != oldLinkIdx) + oldLinkConn->peerIdx = peerIdx; + oldLinkConn->external = false; + } else { // oldLinkConn == nullptr + struct rasLinkConn* linkConn; + NCCLCHECK(ncclCalloc(&linkConn, 1)); + if (linkConnPrev) { + INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, + ncclSocketToString(&conn->addr, rasLine)); + linkConn->next = linkConnPrev->next; + linkConnPrev->next = linkConn; + linkConn->external = true; + } else { + INFO(NCCL_RAS, "RAS link %d: adding external fallback with %s as a new primary connection", link->direction, + ncclSocketToString(&conn->addr, rasLine)); + linkConn->next = link->conns; + link->conns = linkConn; + linkConn->external = false; // Primary connections are never external. + } + linkConn->peerIdx = peerIdx; + linkConn->conn = conn; + } // oldLinkConn == nullptr + +exit: + return ncclSuccess; +} + +// Updates an existing entry in a RAS network link, if any. +// Basically an easy-to-use variant of rasLinkConnAdd. +// For this function, conn cannot be a nullptr and peerIdx cannot be -1. +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx) { + assert(conn && peerIdx != -1); + + NCCLCHECK(rasLinkConnAdd(link, conn, peerIdx, /*pretend*/false, /*pLinkIdx*/nullptr, /*pLinkConn*/nullptr, + /*insert*/false)); + return ncclSuccess; +} + +// Attempts to drop a connection from a link. +// If the optional external argument is true, it will drop a connection only if its external flag is set +// (otherwise the flag is ignored and a connection is always dropped if found). +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external) { + struct rasLinkConn* linkConnPrev = nullptr; + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (linkConn->conn == conn && (!external || linkConn->external)) { + if (linkConnPrev) { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (linkConn->external ? "external " : ""), i, + ncclSocketToString(&conn->addr, rasLine)); + linkConnPrev->next = linkConn->next; + free(linkConn); + } else { // linkConnPrev == nullptr + INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", + link->direction, ncclSocketToString(&conn->addr, rasLine)); + if (linkConn->next) { + link->conns = linkConn->next; + // Ensure that the conn becoming the primary is not marked as external (we don't want to lose it if + // the remote peer loses interest in it). + link->conns->external = false; + if (link->conns->conn) + INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", + link->direction, ncclSocketToString(&link->conns->conn->addr, rasLine)); + rasLinkSanitizeFallbacks(link); + free(linkConn); + } else { // linkConn->next == nullptr + // We prefer the primary entry to always be present, even if empty. 
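+        // Clear the fields in place, turning the node into an empty placeholder instead of freeing it.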
+ linkConn->peerIdx = -1; + linkConn->conn = nullptr; + } // linkConn->next == nullptr + } // linkConnPrev == nullptr + break; + } // if (linkConn->conn == conn) + } // for (linkConn) +} + +// Checks if a given connection is a member of this link and if so, returns its link entry. +// Optionally returns the position of the connection in the conns list. +// Returns nullptr if connection not found. +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx) { + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { + if (linkConn->conn == conn) { + if (pLinkIdx) + *pLinkIdx = i; + return linkConn; + } + } + if (pLinkIdx) + *pLinkIdx = -1; + return nullptr; +} + +// Invoked during RAS termination to release all the allocated resources. +void rasNetTerminate() { + for (struct rasLinkConn* linkConn = rasNextLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + for (struct rasLinkConn* linkConn = rasPrevLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + rasNextLink.conns = rasPrevLink.conns = nullptr; + rasNextLink.lastUpdatePeersTime = rasPrevLink.lastUpdatePeersTime = 0; + + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + rasConnTerminate(conn); + conn = connNext; + } + // rasConnsHead and rasConnsTail are taken care of by rasConnTerminate(). + + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + rasSocketTerminate(sock); + sock = sockNext; + } + // rasSocketsHead and rasSocketsTail are taken care of by rasSocketTerminate(). +} diff --git a/src/register/register.cc b/src/register/register.cc index 9e8f6ea..930367a 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -92,8 +92,8 @@ static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) { } } if (reg->state & NVLS_REG_COMPLETE) { - if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) { - WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize); + if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize) != ncclSuccess) { + WARN("rank %d deregister NVLS buffer %p dev %d ucsize %ld mcsize %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize); } reg->regAddr = (CUdeviceptr)NULL; } diff --git a/src/transport.cc b/src/transport.cc index 5629ce7..f98b77a 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -11,11 +11,12 @@ #include "timer.h" #include "transport.h" -struct ncclTransport* ncclTransports[NTRANSPORTS] = { +struct ncclTransport* ncclTransports[NTRANSPORTS+1] = { &p2pTransport, &shmTransport, &netTransport, - &collNetTransport + &collNetTransport, + &profilerTransport // Not really used for transport, only to create proxy ops polling on profiler counters. 
}; template @@ -111,12 +112,14 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* gettimeofday(&timeStart, NULL); timeLast = timeStart; // struct copy bool timeReported = false; + cudaStream_t hostStream, deviceStream; NCCLCHECK(ncclCalloc(&data, maxPeers)); NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail); NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); @@ -195,7 +198,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -214,7 +217,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. 
*/ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -286,8 +289,9 @@ exit: if (sendData) free(sendData); if (recvData) free(recvData); - NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); + NCCLCHECK(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); return ret; fail: goto exit; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 6718012..c1ccfca 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -103,7 +103,7 @@ struct sendResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; @@ -124,7 +124,7 @@ struct recvResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; @@ -143,9 +143,19 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG return ncclSuccess; } +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + struct setupReq { int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; @@ -168,8 +178,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -192,8 +202,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? 
"/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -454,6 +464,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big"); @@ -505,16 +516,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -525,10 +537,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; @@ -574,16 +594,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -600,7 +621,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { @@ -737,7 +765,7 @@ static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -779,7 +807,7 @@ static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, stru } static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); recvParts.mhandle = recvMhandle; @@ -796,7 +824,7 @@ static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = 
args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -835,7 +863,7 @@ static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, } static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); sendParts.mhandle = sendMhandle; @@ -1150,6 +1178,7 @@ struct collnetRegInfo { static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; + int gdrEnable = -1; if (regRecord) { if (regRecord->state & COLLNET_REG_COMPLETE) { // reuse previous registration @@ -1165,6 +1194,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + gdrEnable = 1; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; @@ -1174,7 +1204,8 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } } else { - WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); + gdrEnable = 0; + goto fail; } } } @@ -1183,6 +1214,7 @@ exit: fail: *outRegBufFlag = 0; *outHandle = NULL; + INFO(NCCL_REG, "rank %d - COLLNET failed to register userbuff %p, buffSize %ld, type %s, GDR %d", comm->rank, userbuff, buffSize, type == collNetRecv ? 
"Recv" : "Send", gdrEnable); goto exit; } @@ -1268,17 +1300,20 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1301,17 +1336,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1600,4 +1638,4 @@ struct ncclTransport collNetTransport = { canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; \ No newline at end of file +}; diff --git a/src/transport/net.cc b/src/transport/net.cc index 8760b42..40d334f 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -92,7 +92,7 @@ struct sendNetResources { int tpLocalRank; int tpRemoteRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; @@ -123,7 +123,7 @@ struct recvNetResources { int tpRemoteRank; int tpRemoteProxyRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; int maxRecvs; @@ -168,7 +168,7 @@ struct setupReq { int tpRemoteRank; int shared; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; int channelId; int connIndex; @@ -180,6 +180,16 @@ static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large // Forward 
declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { @@ -204,11 +214,14 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + proxyRank, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); @@ -247,18 +260,19 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? 
"/Shared" : ""); return ncclSuccess; } -static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) { - NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); +static ncclResult_t netMapShm(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct connectMapMem* mem) { + NCCLCHECK(ncclShmImportShareableBuffer(comm, proxyConn->rank, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); return ncclSuccess; } static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) { - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); + NCCLCHECK(ncclShmAllocateShareableBuffer(mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); return ncclSuccess; } @@ -292,6 +306,7 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; + int trafficClass; }; struct netRecvConnectArgs { @@ -315,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); + args.trafficClass = comm->config.trafficClass; NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -343,7 +359,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { - if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM)); + if (!map->sameProcess) NCCLCHECK(netMapShm(comm, &send->proxyConn, map->mems + NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank, @@ -692,9 +708,11 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; + commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass; NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers @@ -714,15 +732,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); + if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -748,7 +766,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); + NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { @@ -765,7 +783,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); @@ -820,7 +838,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path @@ -904,7 +922,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); + NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { @@ -915,14 +933,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (proxyState->allocP2pNetLLBuffers) { - NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*devMem*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } @@ -964,7 +982,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path @@ -1175,11 +1193,12 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. 
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); sub->transSize += size; sub->transmitted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; @@ -1280,6 +1299,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; + void* phandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; int postedStepId = sub->posted; @@ -1323,6 +1343,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; + phandles[subCount] = sub; subCount++; } } @@ -1332,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; - NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; @@ -1341,6 +1362,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } @@ -1558,7 +1580,7 @@ exit: return ret; fail: *outRegBufFlag = 0; - WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + INFO(NCCL_REG, "rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); goto exit; } @@ -1639,7 +1661,7 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, 
(CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; @@ -1673,7 +1695,7 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bc54133..bfff6e5 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -11,6 +11,7 @@ #include "graph.h" #include "utils.h" #include "param.h" +#include "profiler/net_ib.h" #include #include @@ -85,6 +86,11 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; +#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) + +#define NCCL_IB_SL_DEFAULT 0 +#define NCCL_IB_TC_DEFAULT 0 + NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); @@ -92,8 +98,8 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); -NCCL_PARAM(IbSl, "IB_SL", 0); -NCCL_PARAM(IbTc, "IB_TC", 0); +NCCL_PARAM(IbSl, "IB_SL", -1); +NCCL_PARAM(IbTc, "IB_TC", -1); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); @@ -327,6 +333,9 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, close(fd); if (ret == -1) { + // In containerized environments, read could return EINVAL if the GID index is not mapped to the + // container sysfs. In this case return ncclSuccess and let the caller move to next GID index. 
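+    // (The caller, ncclUpdateGidIndex, pre-initializes gidRoceVerNumCandidate to -1, so an index skipped
+    // this way never matches the requested RoCE version.)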
+ if (errno == EINVAL) return ncclSuccess; WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } @@ -359,7 +368,7 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } int usrRoceVer = roceVer; - int gidRoceVerNum, gidRoceVerNumCandidate; + int gidRoceVerNum, gidRoceVerNumCandidate = -1; const char* deviceName = wrap_ibv_get_device_name(context->device); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum)); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate)); @@ -530,8 +539,8 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclIbDev* dev = ncclIbDevs + props->devs[i]; if (dev->link != dev0->link) { - WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. Try only selecting NICs with one type of link using NCCL_IB_HCA", - dev0->devName, dev0->link, dev->devName, dev->link); + WARN("NET/IB : Attempted to merge incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link)); return ncclInvalidUsage; } } @@ -548,8 +557,11 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return res; } -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { ncclResult_t ret = ncclSuccess; + ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } @@ -571,7 +583,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { struct ibv_device** devices; // Check if user defined which IB device:port to use - char* userIbEnv = getenv("NCCL_IB_HCA"); + const char* userIbEnv = ncclGetEnv("NCCL_IB_HCA"); if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; @@ -634,7 +646,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); @@ -666,7 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d = 0; d < ncclNIbDevs; d++) { snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + ncclIbDevs[d].portNum, NCCL_IB_LLSTR(ncclIbDevs[d].link)); } char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", @@ -832,6 +844,8 @@ struct ncclIbConnectionMetadata { char devName[MAX_MERGED_DEV_NAME]; uint64_t fifoAddr; int ndevs; + int tc; + int sl; }; enum ncclIbCommState { @@ -873,12 +887,23 @@ struct ncclIbGidInfo { #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; +#define MAX_QPS_PER_REQ 8 +struct ncclProfilerInfo { + void* qpEventHandles[MAX_QPS_PER_REQ]; + int qpIndex[MAX_QPS_PER_REQ]; + int nEventHandles; + ncclProfilerNetIbDescr_v1_t data; +}; + struct ncclIbRequest { struct ncclIbNetCommBase* base; int type; struct ncclSocket* sock; int events[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; +#ifdef NCCL_ENABLE_NET_PROFILING + struct ncclProfilerInfo pInfo[NCCL_NET_IB_MAX_RECVS]; +#endif int nreqs; union { struct { @@ -1084,7 +1109,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc, int tc, int sl) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -1100,7 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? 
ncclParamIbFifoTc() : tc; } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1122,10 +1147,10 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.hop_limit = 255; } } - qpAttr.ah_attr.sl = ncclParamIbSl(); + qpAttr.ah_attr.sl = sl; qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; - TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u sl: %d tc: %d", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port, qpAttr.ah_attr.sl, qpAttr.ah_attr.grh.traffic_class); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1164,12 +1189,13 @@ fail: goto exit; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; + uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; @@ -1199,7 +1225,7 @@ ib_connect_check: // IB Setup struct ncclIbMergedDev* mergedDev; if (dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", dev); return ncclInternalError; } @@ -1305,8 +1331,17 @@ ib_recv_dev_list: devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; + if (link_layer != devInfo->link_layer) { + int ibDev0 = comm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + commDev->base.ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } meta.fifoAddr = (uint64_t)comm->fifo; + meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; + meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; @@ -1332,13 +1367,16 @@ ib_connect: comm->base.nRemDevs = remMeta.ndevs; - int link_layer; - link_layer = remMeta.devs[0].link_layer; - for (int i = 1; i < remMeta.ndevs; i++) { - if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't connect net devices with different link_layer. 
i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", - i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); - return ncclInternalError; + // ensure that the remote devices have the same link layer as the local devices used in the connection. + if (comm->base.vProps.ndevs > 0) { + int ibDev0 = comm->devs[0].base.ibDevN; + link_layer = ncclIbDevs[ibDev0].portAttr.link_layer; + for (int i = 0; i < remMeta.ndevs; i++) { + if (remMeta.devs[i].link_layer != link_layer) { + WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } } @@ -1373,7 +1411,7 @@ ib_connect: ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); - NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } @@ -1459,6 +1497,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; int ready; + int link_layer = IBV_LINK_LAYER_UNSPECIFIED; *recvComm = NULL; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; @@ -1497,7 +1536,7 @@ ib_recv_dev_list: ncclNetVDeviceProps_t remoteVProps; memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); if (lComm->dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", lComm->dev); return ncclInternalError; } @@ -1555,6 +1594,13 @@ ib_recv: ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail); NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail); + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = ibDev->portAttr.link_layer; + if (link_layer != ibDev->portAttr.link_layer) { + int ibDev0 = rComm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. @@ -1562,6 +1608,12 @@ ib_recv: rComm->base.remDevs[i] = remMeta.devs[i]; rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id; rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix; + if (remMeta.devs[i].link_layer != link_layer) { + int ibDev0 = rComm->devs[0].base.ibDevN; + WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s.
Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Stripe QP creation across merged devs @@ -1598,7 +1650,7 @@ ib_recv: meta.qpInfo[q].ece_supported = 0; } - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } @@ -1629,7 +1681,7 @@ ib_recv: devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail); } @@ -1646,6 +1698,8 @@ ib_recv: meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; + meta.sl = remMeta.sl; + meta.tc = remMeta.tc; for (int q = 0; q < rComm->base.nqps; q++) { meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; @@ -1842,7 +1896,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1860,6 +1914,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { wr->wr.rdma.remote_addr = slots[r].addr; wr->next = wr + 1; wr_id += (reqs[r] - comm->base.reqs) << (r*8); +#ifdef NCCL_ENABLE_NET_PROFILING + reqs[r]->pInfo[0].nEventHandles = 0; +#endif } // Write size as immediate data. 
In the case of multi-send, only write @@ -1929,6 +1986,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { } struct ibv_send_wr* bad_wr; +#ifdef NCCL_ENABLE_NET_PROFILING + // QP profiling loop + for (int r=0; r<nreqs; r++) { + // Start a QP event for every request in the multisend + int nEventHandles = reqs[r]->pInfo[0].nEventHandles; + reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + reqs[r]->pInfo[0].data.type = ncclProfileQp; + reqs[r]->pInfo[0].data.qp.device = devIndex; + reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; + reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; + reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; + reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + reqs[r]->pInfo[0].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; r<nreqs; r++) reqs[r]->events[qp->devIndex]++; } return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { +ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2018,7 +2093,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot)); + NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2109,7 +2184,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2121,6 +2196,9 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->base.sock; req->nreqs = n; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int r = 0; r < n && phandles; r++) req->pInfo[r].nEventHandles = 0; +#endif for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; @@ -2141,6 +2219,19 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* for (int i = 0; i < nqps; i++) { struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); +#ifdef NCCL_ENABLE_NET_PROFILING + // Start a QP event for every request in the multirecv and every qp + for (int r = 0; r < n && phandles; r++) { + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + req->pInfo[r].data.type = ncclProfileQp; + req->pInfo[r].data.qp.device = qp->devIndex; + req->pInfo[r].data.qp.wr_id = wr.wr_id; + req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + req->pInfo[r].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
} @@ -2196,6 +2287,16 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** #define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name) +#ifdef NCCL_ENABLE_NET_PROFILING +static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) { + for (int i = 0; i < MAX_QPS_PER_REQ; i++) { + int qpIndex = req->pInfo[request].qpIndex[i]; + if (req->base->qps[qpIndex].qp->qp_num == qpNumber) return i; + } + return 0; +} +#endif + ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; @@ -2205,11 +2306,24 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { - for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i]; + for (int i=0; i<r->nreqs; i++) { + sizes[i] = r->recv.sizes[i]; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int j = 0; j < r->pInfo[i].nEventHandles; j++) { + NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL)); + } +#endif + } } if (sizes && r->type == NCCL_NET_IB_REQ_SEND) { sizes[0] = r->send.size; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int j = 0; j < r->pInfo[0].nEventHandles; j++) { + NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL)); + } +#endif } + // Stop all remaining Qp events for this request NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } @@ -2264,6 +2378,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { return ncclInternalError; } sendReq->events[i]--; +#ifdef NCCL_ENABLE_NET_PROFILING + // Stop Qp event for sendReq + NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL)); +#endif } } else { if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { @@ -2276,6 +2394,12 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { } } req->events[i]--; +#ifdef NCCL_ENABLE_NET_PROFILING + // Stop Qp event for workFifo + for (int j = 0; j < req->nreqs; j++) { + NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL)); + } +#endif } } // Once the IB fatal event is reported in the async thread, we want to propagate this error diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 235dee8..8034d95 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -9,6 +9,7 @@ #include "socket.h" #include "net.h" #include "param.h" +#include "profiler/net_socket.h" #include #include @@ -35,7 +36,10 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { return ncclSuccess; } -ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + ncclProfilerFunction = profFunction; if (ncclNetIfs == -1) { pthread_mutex_lock(&ncclNetSocketLock); if (ncclNetIfs == -1) { @@ -158,6 +162,11 @@ struct ncclNetSocketTask { ncclResult_t result; }; +struct ncclProfilerInfo { + void* eHandle; + void* pHandle; +}; + struct ncclNetSocketRequest { int op; void* data; @@ -168,6 +177,7 @@ struct ncclNetSocketRequest { struct ncclNetSocketComm* comm; struct ncclNetSocketTask* tasks[MAX_SOCKETS]; int nSubs; + struct ncclProfilerInfo pInfo; }; struct ncclNetSocketTaskQueue { @@ -180,6 +190,7 @@ struct ncclNetSocketThreadResources { struct
ncclNetSocketTaskQueue threadTaskQueue; int stop; struct ncclNetSocketComm* comm; + struct ncclProfilerInfo* pInfo; pthread_mutex_t threadLock; pthread_cond_t threadCond; }; @@ -210,6 +221,9 @@ void* persistentSocketThread(void *args_) { struct ncclNetSocketComm* comm = resource->comm; struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; +#ifdef NCCL_ENABLE_NET_PROFILING + void* eHandle[MAX_REQUESTS*MAX_SOCKETS] = { 0 }; +#endif while (1) { int idle = 1; int mark = myQueue->next; // mark newest task seen @@ -220,13 +234,33 @@ void* persistentSocketThread(void *args_) { for (int j=0; j<nSocksPerThread; j++) { struct ncclNetSocketTask* r = myQueue->tasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { +#ifdef NCCL_ENABLE_NET_PROFILING + if (!eHandle[i+j]) { + ncclProfilerNetSockDescr_v1_t data; + data.type = ncclProfileSocket; + data.sock.fd = r->sock->fd; + data.sock.op = r->op; + data.sock.length = r->size; + ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + } +#endif r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { +#ifdef NCCL_ENABLE_NET_PROFILING + ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + eHandle[i+j] = NULL; +#endif WARN("NET/Socket : socket progress error"); return NULL; } idle = 0; if (r->offset < r->size) repeat = 1; +#ifdef NCCL_ENABLE_NET_PROFILING + if (repeat == 0) { + ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + eHandle[i+j] = NULL; + } +#endif } } } while (repeat); @@ -326,7 +360,7 @@ fail: goto exit; } -ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } @@ -444,7 +478,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi return ncclInternalError; } -ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) { +ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclProfilerInfo* pInfo, int op, void* data, int size, struct ncclNetSocketTask** req) { int tid = comm->nextSock % comm->nThreads; struct ncclNetSocketThreadResources* res = comm->threadResources+tid; struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue; @@ -457,6 +491,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* NCCLCHECK(ncclCalloc(&queue->tasks, queue->len)); queue->next = 0; res->comm = comm; +#ifdef NCCL_ENABLE_NET_PROFILING + res->pInfo = pInfo; +#endif pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create"); @@ -520,7 +557,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); while (chunkOffset < r->size) { int chunkSize = std::min(taskSize, r->size-chunkOffset); - NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); chunkOffset += chunkSize; } } @@ -544,6
+581,16 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { } } } else { // progress request using main thread +#ifdef NCCL_ENABLE_NET_PROFILING + if (!r->pInfo.eHandle) { + ncclProfilerNetSockDescr_v1_t data; + data.type = ncclProfileSocket; + data.sock.fd = r->ctrlSock->fd; + data.sock.op = r->op; + data.sock.length = r->size; + ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + } +#endif if (r->offset < r->size) { NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } @@ -551,6 +598,10 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { if (size) *size = r->size; *done = 1; r->used = 0; +#ifdef NCCL_ENABLE_NET_PROFILING + ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL); + r->pInfo.eHandle = NULL; +#endif } } } @@ -562,16 +613,26 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { +ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); +#ifdef NCCL_ENABLE_NET_PROFILING + // NCCL core profiler callback + struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request; + req->pInfo.pHandle = phandle; +#endif return ncclSuccess; } -ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); +#ifdef NCCL_ENABLE_NET_PROFILING + // NCCL core profiler callback + struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request; + if (phandles) req->pInfo.pHandle = phandles[0]; +#endif return ncclSuccess; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 3fe25a3..d99f7cb 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -108,29 +108,29 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { - CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { + CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); + CUCHECK(cuMemUnmap(ptr, mcsize)); + CUCHECK(cuMemAddressFree(ptr, mcsize)); CUCHECK(cuMemRelease(*mcHandler)); - INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); + INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d ucsize %ld mcsize %ld", comm->rank, (void*)ptr, dev, 
ucsize, mcsize); return ncclSuccess; } -ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { - INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); +ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr, CUmemGenericAllocationHandle* ucHandle, size_t mcsize, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { + INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) ucsize %zu MC handle 0x%llx(%p) mcsize %zd", *ucHandle, ucptr, ucsize, *mcHandle, mcptr, mcsize); // Release the UC memory and mapping if (ucptr) { - CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, ucsize)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, ucsize)); CUCHECK(cuMemRelease(*ucHandle)); } // Release the MC memory and mapping if (mcptr) { - CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, mcsize)); + CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, mcsize)); CUCHECK(cuMemRelease(*mcHandle)); } @@ -197,25 +197,27 @@ fail: goto exit; } -static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) { +static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc* desc, size_t size, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr, size_t* ucsizePtr, size_t* mcsizePtr) { char shareableHandle[NVLS_HANDLE_SIZE]; CUmulticastObjectProp mcprop; CUmemAllocationProp ucprop; ncclResult_t ret = ncclSuccess; - size_t size = *sizePtr; - size_t originSize = size; + size_t mcsize; + size_t ucsize; size_t ucgran, mcgran; int allocMcHandle = 0; + mcsize = ucsize = size; *ucptr = *mcptr = NULL; + memset(shareableHandle, '\0', sizeof(shareableHandle)); memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; mcprop.size = size; - CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail); - ALIGN_SIZE(size, mcgran); - *sizePtr = mcprop.size = size; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(mcsize, mcgran); + mcprop.size = mcsize; if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); @@ -235,26 +237,29 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit ucprop.location.id = comm->cudaDev; ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - // Map a VA for UC memory - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail); + ALIGN_SIZE(ucsize, ucgran); + // Map a VA for UC memory with MC alignment and size + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), ret, fail); // Alloc local physical mem for this NVLS group - CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail); - 
CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); - CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); + CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail); // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group - CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); + CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail); // Map mc virtual address - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail); - INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize); + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, mcsize, 0, *mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, mcsize, desc, 1), ret, fail); + *ucsizePtr = ucsize; + *mcsizePtr = mcsize; + INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld ucsize %ld mcsize %ld (inputsize %ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, ucsize, mcsize, size); exit: return ret; @@ -273,6 +278,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { size_t nvlsTotalSize = 0; struct ncclNvlsSharedRes* resources = NULL; int nChannels = -1; + cudaStream_t deviceStream, hostStream; if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess; // initialize after checking comm->nvlsSupport @@ -288,10 +294,10 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); - NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail); - resources->buffSize = nvlsTotalSize; + NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail); - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, 
&deviceStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -306,15 +312,16 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize; peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); } } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail); // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail); comm->nvlsResources->inited = true; @@ -374,6 +381,7 @@ setup: size_t memSize = 64; size_t creditSize = nChannels * 2 * memSize * nHeads; int nvlsStepSize = comm->nvlsChunkSize; + cudaStream_t hostStream, deviceStream; NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail); comm->nvlsResources->inited = false; @@ -398,11 +406,11 @@ setup: resources->accessDesc.location.id = comm->cudaDev; resources->dev = comm->cudaDev; - NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, 
(void**)&resources->mcCredit), res, fail); - resources->creditSize = creditSize; + NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit, &resources->creditUCSize, &resources->creditMCSize), res, fail); // Set up head and tail only for now - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -440,14 +448,15 @@ setup: peer->send[0].conn.stepSize = nvlsStepSize; peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); } } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail); } // MNNVL does not support NVLS buffer registration @@ -488,13 +497,13 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); if (resources->ucCredit || resources->mcCredit) { - NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle)); - NCCLCHECK(nvlsGroupUnmapMem(comm, 
resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnbind(comm, resources->creditUCSize, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditUCSize, resources->ucCredit, &resources->ucCreditHandle, resources->creditMCSize, resources->mcCredit, &resources->mcCreditHandle)); } if (comm->nvlsResources->inited) { - NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnbind(comm, resources->buffUCSize, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffUCSize, resources->ucBuff, &resources->ucBuffHandle, resources->buffMCSize, resources->mcBuff, &resources->mcBuffHandle)); } free(resources); comm->nvlsResources = NULL; @@ -513,7 +522,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t size_t minSize = SIZE_MAX; struct localRegData* regData = NULL; cudaPointerAttributes attr; - size_t ucgran, mcgran; + size_t ucgran, mcgran, ucsize, mcsize; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); @@ -538,13 +547,12 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr), ret, fail); - if (regSize % mcgran == 0) { - regRecord->regSize = regSize; - } else { - regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr); - } - - if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) { + if (regRecord->addr % ucgran == 0) { + if (regSize % ucgran != 0) { + regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran); + } else { + regRecord->regUCSize = regSize; + } regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); regData[comm->localRank].offset = userBuff - regRecord->addr; @@ -564,13 +572,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t goto fail; } /* get minimal reg size of nvls buffers */ - if (minSize > regData[i].reg.regSize) - minSize = regData[i].reg.regSize; + if (minSize > regData[i].reg.regUCSize) + minSize = regData[i].reg.regUCSize; } /* start registration */ + mcsize = ucsize = minSize; mcprop.size = minSize; CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(mcsize, mcgran); + mcprop.size = mcsize; + if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); @@ -583,16 +595,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t // Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out. 
// coverity[var_deref_op] - CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail); // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemAddressReserve(®Ptr, mcsize, mcgran, 0U, 0), ret, fail); // Map the VA locally - CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + CUCHECKGOTO(cuMemMap(regPtr, mcsize, 0, mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(regPtr, mcsize, &comm->nvlsResources->accessDesc, 1), ret, fail); regRecord->regAddr = regPtr; - regRecord->regSize = minSize; + regRecord->regUCSize = ucsize; + regRecord->regMCSize = mcsize; regRecord->dev = comm->nvlsResources->dev; regRecord->mcHandle = mcHandle; regRecord->state |= NVLS_REG_COMPLETE; @@ -706,7 +719,7 @@ exit: return ncclSuccess; fail: regBufUsed = 0; - WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); + INFO(NCCL_REG, "rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); goto exit; } @@ -843,7 +856,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index dac7621..aed84c5 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -407,6 +407,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st comm->peerInfo[intermediateRank].nvmlDev, useReadStr); } + memset(&req, '\0', sizeof(req)); req.size = sendSize; req.refcount = 0; if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; @@ -466,6 +467,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = intermediateRank; } + memset(&req, '\0', sizeof(req)); req.size = recvSize; req.refcount = 0; if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; @@ -527,7 +529,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn if (useMemcpy) { // Attach to peer's SHM segment - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; @@ -634,7 +636,7 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st // Create a SHM segment for the peer to attach to shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, 
(void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); + NCCLCHECK(ncclShmAllocateShareableBuffer(shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); @@ -805,7 +807,7 @@ static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size ncclResult_t ret = ncclSuccess; struct ncclIpcRegInfo* newInfo = NULL; uintptr_t* peerRmtAddrs = NULL; - bool legacyIpcCap = false; + int legacyIpcCap = 0; size_t baseSize = 0; void* baseAddr = NULL; bool needUpdate = false; @@ -916,13 +918,16 @@ ncclResult_t ret = ncclSuccess; if (type == NCCL_IPC_COLLECTIVE) { // for collective, store registered remote buffers into dev memory for future reference if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + cudaStream_t hostStream, deviceStream; + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) - NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, hostStream), ret, fail); if (needUpdate) - NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), ret, fail); } peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; } else { @@ -941,7 +946,7 @@ fail: *offsetOut = 0; *peerRmtAddrsOut = NULL; if (newInfo) free(newInfo); - WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc); + INFO(NCCL_REG, "rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %d type %s", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc ? *isLegacyIpc : -1, ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR ? "POSIX_FD" : "FABRIC"); goto exit; } diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc new file mode 100644 index 0000000..3e32843 --- /dev/null +++ b/src/transport/profiler.cc @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "transport.h" +#include "proxy.h" +#include "profiler.h" + +static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + connection->proxyAppendPtr = &connection->proxyAppend; + connection->shared = 1; + return ncclSuccess; +} + +// The following ncclProxySubArgs are overloaded by the profiler progress function: +// - base : is set to the current value of workCounter[channelId] +// - posted : is set to sub->nsteps to indicate that the profiler has started the event +// - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event +static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s = 0; s < args->nsubs; s++) { + struct ncclProxySubArgs* sub = args->subs + s; + sub->base = sub->workCounter; + sub->posted = sub->transmitted = 0; + } + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + for (int s = 0; s < args->nsubs; s++) { + struct ncclProxySubArgs* sub = args->subs + s; + uint64_t* workStarted = (uint64_t *)sub->sendbuff; + uint64_t* workCompleted = (uint64_t *)sub->recvbuff; + if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) { + ncclProfilerStartKernelChEvent(args, s); + sub->posted = sub->nsteps; + continue; // allow events on every channel to start + } + if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) { + ncclProfilerStopKernelChEvent(args, s); + sub->transmitted = sub->nsteps; + args->done++; + } + } + if (args->done == args->nsubs) args->state = ncclProxyOpNone; + } + return ncclSuccess; +} + +struct ncclTransport profilerTransport = { + "Prof", + NULL, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL } +}; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index d2d6906..aa3e6c4 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -18,6 +18,7 @@ struct shmBuffInfo { }; struct shmConnectInfo { + int rank; ncclShmIpcDesc_t desc; struct shmBuffInfo buf; }; @@ -120,6 +121,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + info->rank = comm->rank; resources->hostMem = (struct ncclSendMem*)info->buf.hptr; resources->devHostMem = (struct ncclSendMem*)info->buf.dptr; @@ -150,6 +152,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + info->rank = comm->rank; resources->hostMem = (struct ncclRecvMem*)info->buf.hptr; resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr; @@ -163,7 +166,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co struct shmSendResources* resources = (struct 
shmSendResources*)send->transportResources; char* buff; - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { send->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { recv->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } - NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; exit: @@ -485,7 +488,7 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; exit: @@ -517,9 +520,9 @@ static void initCeOperation() { } } -ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { - if (desc == NULL || hptr == NULL || tpProxyRank < -1) { - WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank); +ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { + if (desc == NULL || hptr == NULL) { + WARN("Invalid argument desc %p, hptr %p", desc, hptr); return ncclInvalidArgument; } #if CUDART_VERSION >= 12020 @@ -532,7 +535,6 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&desc->shmci.data, &handle, sizeof(handle)); - desc->shmci.tpProxyRank = tpProxyRank; } else { CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0)); } @@ -560,7 +562,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l return ncclSuccess; } -ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) { WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut); return ncclInvalidArgument; @@ -584,7 +586,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm,
ncclShmIpcDesc_ // UDS fd support int fd = -1; // Send cuMem handle to remote for conversion to an fd - NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd)); + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, proxyRank, &desc->shmci.data, &fd)); CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); (void) close(fd); } else { @@ -625,7 +627,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ descOut->shmci.ptr = *hptr = (void *)hostptr; descOut->legacy = false; if (dptr) *dptr = (void *)hostptr; - INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); + INFO(NCCL_SHM, "CUMEM imported shareable host buffer from proxyRank %d size %zi ptr %p, granularity %ld", proxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); } else { char shmPath[SHM_PATH_MAX]; snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
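
Note: the net plugin changes above all follow one start/stop convention for the new profiler callback: a start call (second argument 0) receives the per-operation handle passed down from NCCL core (pHandle/phandles[r]) together with a plugin-defined descriptor and returns an event handle, and a stop call (second argument 1) retires that event handle. The following is a minimal sketch of that convention using the v1 socket descriptor shown in the diff; the helper name profileSocketOp() and the NCCLCHECK-based error handling are illustrative assumptions, not part of the patch.

    // Sketch: one profiler event around a socket operation, assuming the
    // callback was saved at init time as in ncclNetSocketInit() above.
    static ncclProfilerCallback_t ncclProfilerFunction;

    static ncclResult_t profileSocketOp(int fd, int op, int length, void* pHandle) {
      void* eHandle = NULL;
      ncclProfilerNetSockDescr_v1_t data;
      data.type = ncclProfileSocket;
      data.sock.fd = fd;         // socket being progressed
      data.sock.op = op;         // send or receive
      data.sock.length = length; // bytes to transfer
      // Start the event: flag 0, pHandle and descriptor in, event handle out
      NCCLCHECK(ncclProfilerFunction(&eHandle, 0, pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data));
      // ... progress the operation, e.g. via ncclSocketProgress() ...
      // Stop the event: flag 1, no other arguments needed
      NCCLCHECK(ncclProfilerFunction(&eHandle, 1, NULL, 0, NULL));
      return ncclSuccess;
    }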