Add documentation for NCCL NET plugins

Also repurpose dummy plugin as example, including headers and
compat layers from v6 to v2.
Sylvain Jeaugey 2022-11-21 06:03:27 -08:00
parent 2f4cb874ba
commit 55b1d8ab98
12 changed files with 906 additions and 80 deletions

ext-net/README.md Normal file

@@ -0,0 +1,352 @@
# NCCL Net Plugin Documentation
This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL.
# Overview
To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins
implement the NCCL network API, and decouple NCCL binary builds which are built against a
particular version of the GPU stack (i.e. CUDA) from the network code which is built against a
particular version of the networking stack. That way, we can easily integrate any CUDA version
with any network stack version.
NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library
contains one or more implementations of the NCCL NET API, in the form of versioned structs,
filled with pointers to all required functions.
# Plugin architecture
## Plugin name and supporting multiple network plugins
When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it,
then look for symbols inside the library.
The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore
advised to name the library following that pattern, with a symlink pointing `libnccl-net.so`
to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path,
setting `NCCL_NET_PLUGIN` will allow users to select the right plugin.
## Struct versioning
Once a library is found, NCCL will look for a symbol named `ncclNetPlugin_vX`, with `X` increasing
over time. The versioning ensures that the plugin and the NCCL core are compatible.
Plugins are encouraged to provide multiple of those symbols, implementing multiple versions
of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL
versions.
Conversely, and to ease transitions, NCCL can choose to support multiple plugin versions, looking
for the latest ncclNet struct version but also falling back to older ones so that older plugins
keep working.
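For example, a plugin built from one set of functions can export several versioned structs at once
(a minimal sketch; `exampleInit` stands in for the plugin's real functions, and the remaining
fields are elided):
```
#include <nccl/net.h>

ncclResult_t exampleInit(ncclDebugLogger_t logFunction); // hypothetical plugin function

// NCCL looks these symbols up by name, trying the newest supported version first.
const ncclNet_v6_t ncclNetPlugin_v6 = { .name = "Example", .init = exampleInit, /* ... */ };
const ncclNet_v5_t ncclNetPlugin_v5 = { .name = "Example", .init = exampleInit, /* ... */ };
```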
## In-network collective operations, a.k.a. collNet
In addition to the ncclNet structure, network plugins can provide a collNet structure which
implements in-network collective operations, if supported. It can be used by the NCCL collNet
algorithm to accelerate inter-node reductions in allReduce.
The collNet struct is a different, optional struct provided by the network plugin, but its
versioning is tied to the ncclNet struct and many functions are common between the two to
ease the implementation.
## Headers management
To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions
they support to their internal includes. An example is shown in `ext-net/example/` where we keep
all headers in the `nccl/` directory and provide thin layers to implement old versions on top
of newer ones.
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v6)
Below is the main `ncclNet_v6` struct. Each function is explained in later sections.
```
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
```
## Error codes
All plugin functions use NCCL error codes as return values. `ncclSuccess` should be returned upon
success.
Otherwise, plugins can return one of the following:
- `ncclSystemError` is the most common error for network plugins, when a call to the Linux kernel
or a system library fails (see the sketch after this list). This typically includes all network/hardware errors.
- `ncclInternalError` is returned when the NCCL core code is using the network plugin in an
incorrect way, for example allocating more requests than it should, or passing an invalid argument
to calls.
- `ncclInvalidUsage` should be returned when the error is most likely a user error. This can
include misconfiguration, but also size mismatches.
- `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by
the NCCL core layer.
- `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should
not need to rely on CUDA, this should not be common.
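As an illustration, here is how a plugin might surface a failed system call (a minimal sketch;
`exampleListenSocket` is a hypothetical helper, and the headers come from `ext-net/example/nccl/`):
```
#include <sys/socket.h>
#include <nccl/net.h>   // brings in the ncclResult_t error codes from err.h

// Hypothetical helper: put a socket in listening mode, mapping failure to an NCCL code.
ncclResult_t exampleListenSocket(int fd, int backlog) {
  if (listen(fd, backlog) != 0) {
    return ncclSystemError;  // a Linux kernel call failed: network/system error
  }
  return ncclSuccess;
}
```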
## Operation overview
NCCL will call the `init` function first, then query the number of network devices with the
`devices` function, and get the properties of each network device with `getProperties`.
To establish a connection between two network devices, NCCL will first call `listen` on the
receiving side, pass the returned handle to the sender side of the connection, and call `connect`
with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
establishment.
Once the connection is established, communication will be done using the functions `isend`,
`irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
In certain conditions, `iflush` will be called after a receive call completes to allow the network
plugin to flush data and ensure the GPU will observe the newly written data.
To close the connections NCCL will call `closeListen` to close the object returned by `listen`,
`closeSend` to close the object returned by `connect` and `closeRecv` to close the object returned
by `accept`.
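Put together, and assuming a `net` pointer to the plugin's `ncclNet_v6_t` struct, the lifecycle of
one connection looks roughly like this from the NCCL core side (a schematic sketch with both sides
shown together and error handling omitted):
```
void exampleConnectionLifecycle(ncclNet_v6_t* net, int dev) {
  char handle[NCCL_NET_HANDLE_MAXSIZE];
  void *listenComm, *sendComm = NULL, *recvComm = NULL;
  net->listen(dev, handle, &listenComm);  // receiver side: produce a handle
  // ... the handle travels to the sender through the NCCL bootstrap ...
  while (sendComm == NULL) net->connect(dev, handle, &sendComm); // sender side, non-blocking
  while (recvComm == NULL) net->accept(listenComm, &recvComm);   // receiver side, non-blocking
  // ... regMr(), then isend()/irecv()/test() and iflush() as needed, then deregMr() ...
  net->closeSend(sendComm);
  net->closeRecv(recvComm);
  net->closeListen(listenComm);
}
```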
## API Functions
### Initialization
`name`
The `name` field should point to a character string with the name of the network plugin. This will
be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
Note: setting `NCCL_NET=<plugin name>` will ensure a specific network implementation is used, with
a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the
`libnccl-net.so` library name to load.
`init`
As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function.
This will allow the plugin to discover network devices and make sure they are usable. If the
`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
internal ones.
To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
the plugin code by adding the following definitions:
```
#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
```
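For instance, the plugin can save the function pointer received by `init` in a file-scope variable
named `logFunction` (which the macros above reference) and log from then on (a minimal sketch):
```
static ncclDebugLogger_t logFunction = NULL;

ncclResult_t exampleInit(ncclDebugLogger_t logFn) {
  logFunction = logFn;   // make the logger available to the WARN/INFO macros
  INFO(NCCL_INIT|NCCL_NET, "Example plugin initialized");
  return ncclSuccess;
}
```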
`devices`
Once the plugin is initialized, NCCL will query the number of devices available. It should not
be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init`
function should not return `ncclSuccess`.
`getProperties`
Right after getting the number of devices, NCCL will query properties for each available network
device. These properties are critical when multiple adapters are present to ensure NCCL uses each
adapter in the most optimized way.
The `name` is only used for logging.
The `pciPath` is the base for all topology detection and should point to the PCI device directory
in /sys. This is typically the directory pointed by `/sys/class/net/eth0/device` or
`/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should
be `NULL`.
The `guid` field is used to determine when network adapters are connected to multiple PCI
endpoints. For normal cases, it can be set to the device number. If multiple network devices have
the same guid, then NCCL will consider they share the same network port to the fabric, hence
it will not use the port multiple times.
The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be
set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin
supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
provide a `regMrDmaBuf` function.
The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
important to ensure proper optimization of flows within the node.
The `port` field indicates the port number. This is important again for topology detection and flow
optimization within the node when a NIC with a single PCI connection is connected to the fabric
with multiple ports.
The `latency` field indicates the network latency in microseconds. This can be useful to improve
the NCCL tuning and make sure NCCL switches from tree to ring at the right size.
The `maxComms` field indicates the maximum number of connections we can create.
The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
receive).
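For illustration, here is how a plugin might fill those fields for a hypothetical single-port
100 Gbps NIC without multi-receive support (all values below are illustrative, not prescriptive):
```
ncclResult_t exampleGetProperties(int dev, ncclNetProperties_v6_t* props) {
  props->name = "example0";       // used for logging only
  props->pciPath = "/sys/devices/pci0000:00/0000:00:02.0/0000:06:00.0"; // hypothetical
  props->guid = dev;              // one NIC per PCI endpoint: the device number is enough
  props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA; // no dmabuf support in this sketch
  props->speed = 100000;          // 100 Gbps, in Mbps
  props->port = 1;
  props->latency = 2.0;           // microseconds
  props->maxComms = 65536;
  props->maxRecvs = 1;            // no grouped receives
  return ncclSuccess;
}
```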
### Connection establishment
Connections are used in a unidirectional manner. There is therefore a sender side and a receiver
side.
`listen`
To create a connection, NCCL will start by calling `listen` on the receiver side. This function
takes a device number as input argument, and should return a local `listenComm` object, and a
`handle` to pass to the other side, so that the sender side can connect to the receiver.
The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
This call should never block, but unlike `connect` and `accept`, `listenComm` should never
be `NULL` if the call succeeds.
`connect`
NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call
`connect` on the sender side on a given device index `dev`, providing the `handle`. `connect`
should not block either; if the connection cannot be established immediately, it should set
`sendComm` to `NULL` and return `ncclSuccess`. In that case, NCCL will call `connect` again until
it succeeds.
`accept`
To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by
the `listen` call previously. If the sender did not connect yet, `accept` should not block. It
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
succeeds.
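Both `connect` and `accept` can be implemented with a per-connection state machine, for example
(a sketch; `exampleComm`, `exampleCommCreate` and `exampleCommReady` are hypothetical):
```
struct exampleComm;                                            // hypothetical connection state
struct exampleComm* exampleCommCreate(int dev, void* handle);  // hypothetical, idempotent per handle
int exampleCommReady(struct exampleComm* comm);                // hypothetical: handshake finished?

ncclResult_t exampleConnect(int dev, void* handle, void** sendComm) {
  struct exampleComm* comm = exampleCommCreate(dev, handle);
  if (!exampleCommReady(comm)) {   // connection still being established
    *sendComm = NULL;              // NCCL will call connect() again later
    return ncclSuccess;
  }
  *sendComm = comm;
  return ncclSuccess;
}
```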
`closeListen`/`closeSend`/`closeRecv`
Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
`closeListen`/`closeSend`/`closeRecv` to free the associated resources.
### Communication
Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`.
To support RDMA capabilities, buffer registration and flush functions are provided.
To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL,
then queried with `test`. Each `sendComm` or `recvComm` must be able to handle
`NCCL_NET_MAX_REQUESTS` requests in parallel.
Note: for the sender side, that value should be multiplied by the plugin's multi-receive
capability, so that `NCCL_NET_MAX_REQUESTS` multi-receive operations can effectively happen in
parallel. So, if the `maxRecvs` value is 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each
`sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.
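A plugin advertising `maxRecvs = 8` could size its per-`sendComm` request pool accordingly
(a sketch; `exampleRequest` is a hypothetical type):
```
#define EXAMPLE_MAX_RECVS 8   // value reported in props->maxRecvs

struct exampleRequest { int used, done, size; };  // hypothetical per-operation state

// 8 requests x 8 grouped receives = up to 64 concurrent isend() operations per sendComm.
struct exampleRequest sendRequests[NCCL_NET_MAX_REQUESTS * EXAMPLE_MAX_RECVS];
```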
`regMr`
Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for
communication. It will provide a `sendComm` or `recvComm` as `comm` argument, then the buffer
pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network
supports CUDA pointers.
The network plugin can use the output argument `mhandle` to keep any reference to that memory
registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and
`deregMr` calls.
`regMrDmaBuf`
If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf`
instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.
`deregMr`
When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin
free resources. This function is used to deregister handles returned by both `regMr` and
`regMrDmaBuf`.
`isend`
Data will be sent through the connection using `isend`, passing the `sendComm` previously
created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be
used if the network supports multi-receive operations (see `irecv`) to distinguish between
different sends matching the same multi-receive. Otherwise it can be set to 0.
The `isend` operation returns a handle in the `request` argument for further calls to `test`. If
the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
`isend` again later.
`irecv`
To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
`n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a
single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles`
arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend`
operations is received into the right buffer.
If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer,
otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are
handled by a single request handle.
The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
The converse (a receive size smaller than the send size) is an error, however.
Note: for a given connection, send/receive operations should always match in the order they were
posted. Tags provided for receive operations are only used to assign a given send operation to one
of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
matching on any receive operation posted.
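For instance, a two-buffer multi-receive with tags 42 and 43 will match the next two sends on that
connection by tag, not by the order of the buffers (a sketch; the buffers and `mhandles` are
assumed to be registered already):
```
ncclResult_t examplePostMultiRecv(ncclNet_v6_t* net, void* recvComm,
                                  void* buf0, void* mh0, void* buf1, void* mh1,
                                  void** request) {
  void* data[2] = { buf0, buf1 };      // buffers previously registered with regMr()
  void* mhandles[2] = { mh0, mh1 };
  int sizes[2] = { 4096, 4096 };
  int tags[2] = { 42, 43 };            // matching isend() calls must use tags 42 and 43
  // A single request covers both receives; test() completes them together.
  return net->irecv(recvComm, 2, data, sizes, tags, mhandles, request);
}
```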
`test`
After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles
until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the
real size sent or received, the latter being potentially lower than the size passed to `irecv`.
In the case of a multi-receive, all receives will be considered as done as a single operation (the
goal being to allow aggregation), hence they share a single request and a single `done` status.
However, they can have different sizes, so when `done` is non-zero, the `sizes` array should
contain the `n` sizes corresponding to the buffers passed to `irecv`.
Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never
call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`).
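From the caller's perspective, retiring a request therefore looks like the following loop (a
sketch; a real caller interleaves `test` with other progress work instead of spinning):
```
ncclResult_t exampleWaitRequest(ncclNet_v6_t* net, void* request, int* sizes) {
  int done = 0;
  while (!done) {
    // test() never blocks; it only reports the current completion state.
    ncclResult_t ret = net->test(request, &done, sizes);
    if (ret != ncclSuccess) return ret;
  }
  // done == 1: sizes holds the actual byte count(s) and the request is now released.
  return ncclSuccess;
}
```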
`iflush`
After a receive operation completes, if the operation was targeting GPU memory and received a
non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure
the GPU can read it right after without seeing stale data. This flush operation is decoupled from
the `test` code to improve latency of `LL*` protocols, as those are capable of determining when
data is valid or not.
`iflush` returns a request which needs to be queried with `test` until it completes.

ext-net/dummy/plugin.c

@@ -1,80 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <nccl.h>
#include <nccl_net.h>
#define __hidden __attribute__ ((visibility("hidden")))
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
ncclNet_t NCCL_PLUGIN_SYMBOL = {
"Dummy",
pluginInit,
pluginDevices,
pluginPciPath,
pluginPtrSupport,
pluginListen,
pluginConnect,
pluginAccept,
pluginRegMr,
pluginDeregMr,
pluginIsend,
pluginIrecv,
pluginFlush,
pluginTest,
pluginCloseSend,
pluginCloseRecv,
pluginCloseListen
};
__hidden ncclResult_t pluginCollNetInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginCollNetDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
__hidden ncclResult_t pluginCollNetPciPath(int dev, char** path) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
__hidden ncclResult_t pluginCollNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetFlush(void* collComm, void* data, int size, void* mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetTest(void* request, int* done, int* size) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetCloseColl(void* collComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCollNetCloseListen(void* listenComm) { return ncclInternalError; }
ncclCollNet_t NCCL_COLLNET_PLUGIN_SYMBOL = {
"Dummy",
pluginCollNetInit,
pluginCollNetDevices,
pluginCollNetPciPath,
pluginCollNetPtrSupport,
pluginCollNetListen,
pluginCollNetConnect,
pluginCollNetReduceSupport,
pluginCollNetRegMr,
pluginCollNetDeregMr,
pluginCollNetIallreduce,
pluginCollNetFlush,
pluginCollNetTest,
pluginCollNetCloseColl,
pluginCollNetCloseListen
};

ext-net/example/nccl/err.h Normal file

@@ -0,0 +1,16 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
/* Error type for plugins */
typedef enum { ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;
#endif

ext-net/example/nccl/net.h Normal file

@@ -0,0 +1,33 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_H_
#define NCCL_NET_H_
#include <stdint.h>
#include <stdlib.h>
#include "err.h"
#define NCCL_NET_HANDLE_MAXSIZE 128
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_PTR_DMABUF 0x4
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "net_v6.h"
#include "net_v5.h"
#include "net_v4.h"
#include "net_v3.h"
#include "net_v2.h"
#endif // end include guard

ext-net/example/nccl/net_v2.h Normal file

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_V2_H_
#define NCCL_NET_V2_H_
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Return the device path in /sys. NCCL will call free on this path.
ncclResult_t (*pciPath)(int dev, char** path);
// Return whether this device supports host pointers and/or CUDA pointers
// as data from the current GPU. Supported types should be composed with
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v2_t;
#endif // end include guard

ext-net/example/nccl/net_v3.h Normal file

@@ -0,0 +1,51 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_V3_H_
#define NCCL_NET_V3_H_
#define NCCL_NET_HANDLE_MAXSIZE_V3 64
#define NCCL_NET_MAX_REQUESTS_V3 16
typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v3_t;
#endif // end include guard

ext-net/example/nccl/net_v4.h Normal file

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_V4_H_
#define NCCL_NET_V4_H_
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
int speed; // Port speed in Mbps.
int port; // Port number.
int maxComms; // Maximum number of comms we can create
} ncclNetProperties_v4_t;
// v4 struct for backwards compatibility
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
#endif // end include guard

ext-net/example/nccl/net_v5.h Normal file

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_V5_H_
#define NCCL_NET_V5_H_
typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v5_t;
#endif // end include guard

ext-net/example/nccl/net_v6.h Normal file

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_V6_H_
#define NCCL_NET_V6_H_
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency in microseconds
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
} ncclNetProperties_v6_t;
typedef ncclNetProperties_v6_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
#endif // end include guard

ext-net/example/nccl/types.h Normal file

@@ -0,0 +1,21 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6,
ncclFloat32 = 7, ncclFloat = 7,
ncclFloat64 = 8, ncclDouble = 8,
ncclBfloat16 = 9,
} ncclDataType_t;
#endif

ext-net/example/plugin.c Normal file

@@ -0,0 +1,200 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <nccl/net.h>
#define __hidden __attribute__ ((visibility("hidden")))
int max_requests = NCCL_NET_MAX_REQUESTS;
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) {
//pluginPciPath(dev, &props->pciPath);
//pluginPtrSupport(dev, &props->ptrSupport);
return ncclInternalError;
}
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
#define PLUGIN_NAME "Plugin"
const ncclNet_v6_t ncclNetPlugin_v6 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties,
.listen = pluginListen,
.connect = pluginConnect,
.accept = pluginAccept,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v5 Compat */
const ncclNet_v5_t ncclNetPlugin_v5 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties,
.listen = pluginListen,
.connect = pluginConnect,
.accept = pluginAccept,
.regMr = pluginRegMr,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v4 Compat */
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
ncclNetProperties_v6_t props_v6;
ncclResult_t ret = pluginGetProperties(dev, &props_v6);
if (ret != ncclSuccess) return ret;
props->name = props_v6.name;
props->pciPath = props_v6.pciPath;
props->guid = props_v6.guid;
props->ptrSupport = props_v6.ptrSupport;
props->speed = props_v6.speed;
props->port = props_v6.port;
props->maxComms = props_v6.maxComms;
return ncclSuccess;
}
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
return pluginIsend(sendComm, data, size, 0, mhandle, request);
}
static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
int tag = 0;
return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request);
}
static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
return pluginIflush(recvComm, 1, &data, &size, &mhandle, request);
}
static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
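// v4 connect() was blocking: poll the non-blocking connect until the communicator is ready.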
ncclResult_t ret;
do {
ret = pluginConnect(dev, handle, sendComm);
} while (ret == ncclSuccess && *sendComm == NULL);
return ret;
}
static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
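// Same for v4 accept(): poll until the receiving communicator is ready.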
ncclResult_t ret;
do {
ret = pluginAccept(listenComm, recvComm);
} while (ret == ncclSuccess && *recvComm == NULL);
return ret;
}
const ncclNet_v4_t ncclNetPlugin_v4 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v4,
.listen = pluginListen,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
.iflush = pluginIflush_v4,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v3 Compat */
static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) {
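// v3 flush() was synchronous: post an iflush request and poll test() until it completes.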
void* req;
ncclResult_t ret = pluginIflush_v4(recvComm, data, size, mhandle, &req);
int done = 0;
while (ret == ncclSuccess && done == 0) {
ret = pluginTest(req, &done, NULL);
}
return ret;
}
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
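// v3 and older allowed up to NCCL_NET_MAX_REQUESTS_V3 (16) outstanding requests per comm.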
max_requests = NCCL_NET_MAX_REQUESTS_V3;
return pluginInit(logFunction);
}
#include <string.h>
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
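// v3 handles are only NCCL_NET_HANDLE_MAXSIZE_V3 (64) bytes: listen into a full-size buffer,
// then copy the first 64 bytes out.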
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3);
return ret;
}
static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3);
return pluginConnect_v4(dev, &pluginHandle, sendComm);
}
const ncclNet_v3_t ncclNetPlugin_v3 = {
.name = PLUGIN_NAME,
.init = pluginInit_v3,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v4,
.listen = pluginListen_v3,
.connect = pluginConnect_v3,
.accept = pluginAccept_v4,
.regMr = pluginRegMr,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
.flush = pluginFlush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v2 Compat */
const ncclNet_v2_t ncclNetPlugin_v2 = {
.name = PLUGIN_NAME,
.init = pluginInit_v3,
.devices = pluginDevices,
.pciPath = pluginPciPath,
.ptrSupport = pluginPtrSupport,
.listen = pluginListen,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
.flush = pluginFlush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};