diff --git a/ext-net/README.md b/ext-net/README.md new file mode 100644 index 0000000..5361b44 --- /dev/null +++ b/ext-net/README.md @@ -0,0 +1,352 @@ +# NCCL Net Plugin Documentation + +This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL. + +# Overview + +To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins +implement the NCCL network API, and decouple NCCL binary builds which are built against a +particular version of the GPU stack (i.e. CUDA) from the network code which is built against a +particular version of the networking stack. That way, we can easily integrate any CUDA version +with any network stack version. + +NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library +contains one or more implementations of the NCCL NET API, in the form of versioned structs, +filled with pointers to all required functions. + +# Plugin architecture + +## Plugin name and supporting multiple network plugins + +When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it, +then look for symbols inside the library. + +The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL +will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore +advised to name the library following that pattern, with a symlink pointing `libnccl-net.so` +to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path, +setting `NCCL_NET_PLUGIN` will allow users to select the right plugin. + +## Struct versioning + +Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing +over time. The versioning ensures that the plugin and the NCCL core are compatible. 
+ +Plugins are encouraged to provide multiple of those symbols, implementing multiple versions +of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL +versions. + +Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking +for the latest ncclNet struct version, but also looking for older ones so that older plugins +would still work. + +## In-network collective operations, a.k.a. collNet + +Additionally to the ncclNet structure, network plugins can provide a collNet structure which +implements in-network collective operations, if supported. That can be used by the NCCL collNet +algorithm to accelerate inter-node reductions in allReduce. + +The collNet struct is a different, optional struct provided by the network plugin, but its +versioning is tied to the ncclNet struct and many functions are common between the two to +ease the implementation. + +## Headers management + +To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions +they support to their internal includes. An example is shown in `ext-net/example/` where we keep +all headers in the `nccl/` directory and provide thin layers to implement old versions on top +of newer ones. + +The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions +from old API versions. It also provides error codes in `err.h`. + +# API (v6) + +Below is the main `ncclNet_v6` struct. Each function is explained in later sections. + +``` +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. 
The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; +``` + +## Error codes + +All plugin functions use NCCL error codes as return values. `ncclSuccess` should be returned upon +success. + +Otherwise, plugins can return one of the following: + - `ncclSystemError` is the most common error for network plugins, when a call to the Linux kernel +or a system library fails. This typically includes all network/hardware errors. + - `ncclInternalError` is returned when the NCCL core code is using the network plugin in an +incorrect way, for example allocating more requests than it should, or passing an invalid argument +to calls. + - `ncclInvalidUsage` should be returned when the error is most likely a user error. This can +include misconfiguration, but also size mismatches. + - `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by +the NCCL core layer. + - `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should +not need to rely on CUDA, this should not be common. + +## Operation overview + +NCCL will call the `init` function first, then query the number of network devices with the +`devices` function, getting each network device's properties with `getProperties`. 
+ +To establish a connection between two network devices, NCCL will first call `listen` on the +receiving side, pass the returned handle to the sender side of the connection, and call `connect` +with that handle. Finally, `accept` will be called on the receiving side to finalize the connection +establishment. + +Once the connection is established, communication will be done using the functions `isend`, +`irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on +all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers. + +In certain conditions, `iflush` will be called after a receive call completes to allow the network +plugin to flush data and ensure the GPU will observe the newly written data. + +To close the connections NCCL will call `closeListen` to close the object returned by `listen`, +`closeSend` to close the object returned by `connect` and `closeRecv` to close the object returned +by `accept`. + +## API Functions + +### Initialization +`name` + +The `name` field should point to a character string with the name of the network plugin. This will +be used for all logging, especially when `NCCL_DEBUG=INFO` is set. + +Note: setting `NCCL_NET=<plugin name>` will ensure a specific network implementation is used, with +a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the +`libnccl-net.so` library name to load. + +`init` + +As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function. +This will allow the plugin to discover network devices and make sure they are usable. If the +`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on +internal ones. + +To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging +function to `init`. 
This function is typically used to allow for `INFO` and `WARN` macros within +the plugin code by adding the following definitions: + +``` +#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) +#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) +``` + +`devices` + +Once the plugin is initialized, NCCL will query the number of devices available. It should not +be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init` +function should not return `ncclSuccess`. + +`getProperties` + +Right after getting the number of devices, NCCL will query properties for each available network +device. These properties are critical when multiple adapters are present to ensure NCCL uses each +adapter in the most optimized way. + +The `name` is only used for logging. + +The `pciPath` is the base for all topology detection and should point to the PCI device directory +in /sys. This is typically the directory pointed to by `/sys/class/net/eth0/device` or +`/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should +be `NULL`. + +The `guid` field is used to determine when network adapters are connected to multiple PCI +endpoints. For normal cases, it can be set to the device number. If multiple network devices have +the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence +it will not use the port multiple times. + +The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be +set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin +supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and +provide a `regMrDmaBuf` function. + +The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). 
This is +important to ensure proper optimization of flows within the node. + +The `port` field indicates the port number. This is important again for topology detection and flow +optimization within the node when a NIC with a single PCI connection is connected to the fabric +with multiple ports. + +The `latency` field indicates the network latency in microseconds. This can be useful to improve +the NCCL tuning and make sure NCCL switches from tree to ring at the right size. + +The `maxComms` field indicates the maximum number of connections we can create. + +The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped +receive). + +### Connection establishment + +Connections are used in a unidirectional manner. There is therefore a sender side and a receiver +side. + +`listen` + +To create a connection, NCCL will start by calling `listen` on the receiver side. This function +takes a device number as input argument, and should return a local `listenComm` object, and a +`handle` to pass to the other side, so that the sender side can connect to the receiver. + +The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL. + +This call should never block, but contrary to `connect` and `accept`, `listenComm` should never +be `NULL` if the call succeeds. + +`connect` + +NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call +`connect` on the sender side on a given device index `dev`, providing the `handle`. `connect` +should not block either: if the connection is not yet established, it should set `sendComm` to `NULL` and return `ncclSuccess`. In that +case, NCCL will call `connect` again until it succeeds. + +`accept` + +To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by +the `listen` call previously. If the sender did not connect yet, `accept` should not block. It +should return `ncclSuccess`, setting `recvComm` to `NULL`. 
NCCL will call `accept` again until it +succeeds. + +`closeListen`/`closeSend`/`closeRecv` + +Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call +`closeListen`/`closeSend`/`closeRecv` to free the associated resources. + +### Communication + +Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`. +To support RDMA capabilities, buffer registration and flush functions are provided. + +To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL, +then queried with `test`. Each `sendComm` or `recvComm` must be able to handle +`NCCL_NET_MAX_REQUESTS` requests in parallel. + +Note: That value should be multiplied by the multi-receive capability of the plugin for the sender +side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening +in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each +`sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations. + +`regMr` + +Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for +communication. It will provide a `sendComm` or `recvComm` as the `comm` argument, then the buffer +pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network +supports CUDA pointers. + +The network plugin can use the output argument `mhandle` to keep any reference to that memory +registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and +`deregMr` calls. + +`regMrDmaBuf` + +If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf` +instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`. + + +`deregMr` + +When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin +free resources. 
This function is used to deregister handles returned by both `regMr` and +`regMrDmaBuf`. + +`isend` + +Data will be sent through the connection using `isend`, passing the `sendComm` previously +created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be +used if the network supports multi-receive operations (see `irecv`) to distinguish between +different sends matching the same multi-receive. Otherwise it can be set to 0. + +The `isend` operation returns a handle in the `request` argument for further calls to `test`. If +the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call +`isend` again later. + +`irecv` + +To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument +`n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a +single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles` +arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend` +operations is received into the right buffer. + +If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer, +otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are +handled by a single request handle. + +The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation. +The contrary (receive size being lower than the send size) is an error, however. + +Note: for a given connection, send/receive operations should always match in the order they were +posted. Tags provided for receive operations are only used to assign a given send operation to one +of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag +matching on any receive operation posted. 
+ +`test` + +After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles +until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the +real size sent or received, the latter being potentially lower than the size passed to `irecv`. + +In the case of a multi-receive, all receives will be considered as done as a single operation (the +goal being to allow aggregation), hence they share a single request and a single `done` status. +However, they can have different sizes, so when `done` is non-zero, the `sizes` array should +contain the `n` sizes corresponding to the buffers passed to `irecv`. + +Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never +call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`). + +`iflush` + +After a receive operation completes, if the operation was targeting GPU memory and received a +non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure +the GPU can read it right after without seeing stale data. This flush operation is decoupled from +the `test` code to improve latency of `LL*` protocols, as those are capable of determining when +data is valid or not. + +`iflush` returns a request which needs to be queried with `test` until it completes. diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c deleted file mode 100644 index dcf0a23..0000000 --- a/ext-net/dummy/plugin.c +++ /dev/null @@ -1,80 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include - -#define __hidden __attribute__ ((visibility("hidden"))) - -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } -__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } -__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } -__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } -__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } -__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } -__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } -__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { return ncclInternalError; } -__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } -__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } -__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } -__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } - -ncclNet_t NCCL_PLUGIN_SYMBOL = { - "Dummy", - pluginInit, - pluginDevices, - pluginPciPath, - pluginPtrSupport, - pluginListen, - 
pluginConnect, - pluginAccept, - pluginRegMr, - pluginDeregMr, - pluginIsend, - pluginIrecv, - pluginFlush, - pluginTest, - pluginCloseSend, - pluginCloseRecv, - pluginCloseListen -}; - -__hidden ncclResult_t pluginCollNetInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } -__hidden ncclResult_t pluginCollNetDevices(int* ndev) { *ndev = 0; return ncclSuccess; } -__hidden ncclResult_t pluginCollNetPciPath(int dev, char** path) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginCollNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetFlush(void* collComm, void* data, int size, void* mhandle) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetTest(void* request, int* done, int* size) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetCloseColl(void* collComm) { return ncclInternalError; } -__hidden ncclResult_t pluginCollNetCloseListen(void* listenComm) { return ncclInternalError; } - -ncclCollNet_t NCCL_COLLNET_PLUGIN_SYMBOL = { - "Dummy", - pluginCollNetInit, - pluginCollNetDevices, - 
- pluginCollNetPciPath, - pluginCollNetPtrSupport, - pluginCollNetListen, - pluginCollNetConnect, - pluginCollNetReduceSupport, - pluginCollNetRegMr, - pluginCollNetDeregMr, - pluginCollNetIallreduce, - pluginCollNetFlush, - pluginCollNetTest, - pluginCollNetCloseColl, - pluginCollNetCloseListen -}; diff --git a/ext-net/dummy/Makefile b/ext-net/example/Makefile similarity index 100% rename from ext-net/dummy/Makefile rename to ext-net/example/Makefile diff --git a/ext-net/example/nccl/err.h b/ext-net/example/nccl/err.h new file mode 100644 index 0000000..0a22677 --- /dev/null +++ b/ext-net/example/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins. ncclInvalidUsage (5) is included because the plugin README lists it as a valid return code. */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h new file mode 100644 index 0000000..6b5b62c --- /dev/null +++ b/ext-net/example/nccl/net.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include <stdint.h> +#include <stdlib.h> + +#include "err.h" + +#define NCCL_NET_HANDLE_MAXSIZE 128 + +#define NCCL_PTR_HOST 0x1 +#define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 + +// Maximum number of requests per comm object +#define NCCL_NET_MAX_REQUESTS 8 + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#include "net_v6.h" +#include "net_v5.h" +#include "net_v4.h" +#include "net_v3.h" +#include "net_v2.h" + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v2.h b/ext-net/example/nccl/net_v2.h new file mode 100644 index 0000000..0d9c906 --- /dev/null +++ b/ext-net/example/nccl/net_v2.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V2_H_ +#define NCCL_NET_V2_H_ + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Return the device path in /sys. NCCL will call free on this path. + ncclResult_t (*pciPath)(int dev, char** path); + // Return whether this device supports host pointers and/or CUDA pointers + // as data from the current GPU. Supported types should be composed with + // NCCL_PTR_HOST and NCCL_PTR_CUDA. + ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); + // Create a receiving object and provide a handle to connect to it. 
The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v2_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v3.h b/ext-net/example/nccl/net_v3.h new file mode 100644 index 0000000..26d117c --- /dev/null +++ b/ext-net/example/nccl/net_v3.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V3_H_ +#define NCCL_NET_V3_H_ + +#define NCCL_NET_HANDLE_MAXSIZE_V3 64 +#define NCCL_NET_MAX_REQUESTS_V3 16 + +typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v3_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v4.h b/ext-net/example/nccl/net_v4.h new file mode 100644 index 0000000..b39da14 --- /dev/null +++ b/ext-net/example/nccl/net_v4.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V4_H_ +#define NCCL_NET_V4_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +} ncclNetProperties_v4_t; + +// v4 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v4_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v5.h b/ext-net/example/nccl/net_v5.h new file mode 100644 index 0000000..b96b6fc --- /dev/null +++ b/ext-net/example/nccl/net_v5.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V5_H_ +#define NCCL_NET_V5_H_ + +typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; // v5 reuses the v6 properties struct; this header includes nothing, so ncclNetProperties_v6_t must already be declared when it is parsed. +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL.
+ ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If sizes is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h new file mode 100644 index 0000000..8bc1678 --- /dev/null +++ b/ext-net/example/nccl/net_v6.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V6_H_ +#define NCCL_NET_V6_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual).
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. +}ncclNetProperties_v6_t; + +typedef ncclNetProperties_v6_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support: registration variant taking a size_t size plus an offset and a file descriptor */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If sizes is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/types.h b/ext-net/example/nccl/types.h new file mode 100644 index 0000000..0a5d837 --- /dev/null +++ b/ext-net/example/nccl/types.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */ + +#ifndef NCCL_TYPES_H_ // Guard named after this header; the previous NCCL_ERR_H_ name would hide these types from any translation unit that already defined that macro. +#define NCCL_TYPES_H_ + +/* Data types. Names listed on the same line share one value (e.g. ncclInt8 and ncclChar are both 0). */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, +} ncclDataType_t; + +#endif diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c new file mode 100644 index 0000000..a44ce9e --- /dev/null +++ b/ext-net/example/plugin.c @@ -0,0 +1,200 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include <nccl/net.h> // NOTE(review): the header name was missing from this line; nccl/net.h is assumed - confirm. + +#define __hidden __attribute__ ((visibility("hidden"))) + +int max_requests = NCCL_NET_MAX_REQUESTS; // Replaced by NCCL_NET_MAX_REQUESTS_V3 when pluginInit_v3 runs. + +__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } // Skeleton: reports zero devices. + +__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } +__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } +__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) { + //pluginPciPath(dev, &props->pciPath); + //pluginPtrSupport(dev, &props->ptrSupport); + return ncclInternalError; +} +__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } +__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } +__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } +__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void*
data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } +__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } +__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } +__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } +__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } + +#define PLUGIN_NAME "Plugin" // Used as the .name field of the versioned tables that follow. + +const ncclNet_v6_t ncclNetPlugin_v6 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .devices = pluginDevices, + .getProperties = pluginGetProperties, + .listen = pluginListen, + .connect = pluginConnect, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend, + .irecv = pluginIrecv, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, +}; + +/* v5 Compat: the same functions as the v6 table, without regMrDmaBuf; no wrappers are used. */ +const ncclNet_v5_t ncclNetPlugin_v5 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .devices = pluginDevices, + .getProperties = pluginGetProperties, + .listen = pluginListen, + .connect = pluginConnect, + .accept = pluginAccept, + .regMr = pluginRegMr, + .deregMr = pluginDeregMr, + .isend = pluginIsend, + .irecv = pluginIrecv, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, +
.closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, +}; + +/* v4 Compat: wrappers that expose the v6 entry points through the v4 signatures. */ +static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) { + ncclNetProperties_v6_t props_v6; // Filled by the v6 call, then copied field by field. + ncclResult_t ret = pluginGetProperties(dev, &props_v6); + if (ret != ncclSuccess) return ret; // On failure props is left untouched. + props->name = props_v6.name; + props->pciPath = props_v6.pciPath; + props->guid = props_v6.guid; + props->ptrSupport = props_v6.ptrSupport; + props->speed = props_v6.speed; + props->port = props_v6.port; + props->maxComms = props_v6.maxComms; // latency and maxRecvs are not part of the v4 struct. + return ncclSuccess; +} +static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { + return pluginIsend(sendComm, data, size, 0, mhandle, request); // The v4 signature has no tag; 0 is passed. +} +static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { + int tag = 0; // Same tag value pluginIsend_v4 sends with. + return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); // n == 1; the arrays are the addresses of this function's own parameters and local. NOTE(review): those addresses stop being valid on return - confirm pluginIrecv does not keep them. +} +static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { + return pluginIflush(recvComm, 1, &data, &size, &mhandle, request); // n == 1, built the same way as in pluginIrecv_v4. +} +static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) { + ncclResult_t ret; + do { // Call again for as long as pluginConnect succeeds without producing a comm. + ret = pluginConnect(dev, handle, sendComm); + } while (ret == ncclSuccess && *sendComm == NULL); + return ret; +} +static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) { + ncclResult_t ret; + do { // Same retry loop as pluginConnect_v4, for the receiving side. + ret = pluginAccept(listenComm, recvComm); + } while (ret == ncclSuccess && *recvComm == NULL); + return ret; +} +const ncclNet_v4_t ncclNetPlugin_v4 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v4, + .listen = pluginListen, + .connect = pluginConnect_v4, + .accept = pluginAccept_v4, + .regMr = pluginRegMr, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v4, + .irecv = pluginIrecv_v4, + .iflush = pluginIflush_v4, + .test = pluginTest, + .closeSend = pluginCloseSend,
+ .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, +}; + +/* v3 Compat: flush is a synchronous call and only NCCL_NET_HANDLE_MAXSIZE_V3 bytes of a handle are exchanged. */ +static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { + void* req = NULL; // Stays NULL if the flush hands back no request, instead of holding an indeterminate pointer. + ncclResult_t ret = pluginIflush_v4(recvComm, data, size, mhandle, &req); + int done = 0; + while (ret == ncclSuccess && req != NULL && done == 0) { // Poll until completion or error. NOTE(review): a NULL request is treated as nothing to wait for - confirm against the iflush contract. + ret = pluginTest(req, &done, NULL); + } + return ret; +} +static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) { + max_requests = NCCL_NET_MAX_REQUESTS_V3; // v3 sets the max_requests global to its own constant before the regular init. + return pluginInit(logFunction); +} +#include <string.h> +static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { + char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0}; // Full-size scratch handle; only its first NCCL_NET_HANDLE_MAXSIZE_V3 bytes are copied out. + ncclResult_t ret = pluginListen(dev, pluginHandle, listenComm); + if (ret == ncclSuccess) memcpy(handle, pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3); // A failed listen leaves the caller's handle untouched. + return ret; +} +static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) { + char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0}; // Bytes beyond the v3 handle are zero rather than indeterminate. + memcpy(pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3); + return pluginConnect_v4(dev, pluginHandle, sendComm); +} +const ncclNet_v3_t ncclNetPlugin_v3 = { + .name = PLUGIN_NAME, + .init = pluginInit_v3, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v4, + .listen = pluginListen_v3, + .connect = pluginConnect_v3, + .accept = pluginAccept_v4, + .regMr = pluginRegMr, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v4, + .irecv = pluginIrecv_v4, + .flush = pluginFlush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, +}; + +/* v2 Compat */ +const ncclNet_v2_t ncclNetPlugin_v2 = { + .name = PLUGIN_NAME, + .init = pluginInit_v3, + .devices = pluginDevices, + .pciPath = pluginPciPath, + .ptrSupport = pluginPtrSupport, + .listen = pluginListen, // NOTE(review): unlike the v3 table this bypasses pluginListen_v3/pluginConnect_v3 - confirm the v2 handle buffer holds NCCL_NET_HANDLE_MAXSIZE bytes. + .connect = pluginConnect_v4, + .accept = pluginAccept_v4, + .regMr = pluginRegMr, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v4, + .irecv = pluginIrecv_v4, + .flush = pluginFlush, + .test =
pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, +};