Add documentation for NCCL NET plugins
Also repurpose dummy plugin as example, including headers and compat layers from v6 to v2.
This commit is contained in:
parent
2f4cb874ba
commit
55b1d8ab98
352
ext-net/README.md
Normal file
352
ext-net/README.md
Normal file
@ -0,0 +1,352 @@
|
|||||||
|
# NCCL Net Plugin Documentation
|
||||||
|
|
||||||
|
This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL.
|
||||||
|
|
||||||
|
# Overview
|
||||||
|
|
||||||
|
To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins
|
||||||
|
implement the NCCL network API, and decouple NCCL binary builds which are built against a
|
||||||
|
particular version of the GPU stack (i.e. CUDA) from the network code which is built against a
|
||||||
|
particular version of the networking stack. That way, we can easily integrate any CUDA version
|
||||||
|
with any network stack version.
|
||||||
|
|
||||||
|
NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library
|
||||||
|
contains one or more implementations of the NCCL NET API, in the form of versioned structs,
|
||||||
|
filled with pointers to all required functions.
|
||||||
|
|
||||||
|
# Plugin architecture
|
||||||
|
|
||||||
|
## Plugin name and supporting multiple network plugins
|
||||||
|
|
||||||
|
When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it,
|
||||||
|
then look for symbols inside the library.
|
||||||
|
|
||||||
|
The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
|
||||||
|
will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore
|
||||||
|
advised to name the library following that pattern, with a symlink pointing `libnccl-net.so`
|
||||||
|
to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path,
|
||||||
|
setting `NCCL_NET_PLUGIN` will allow users to select the right plugin.
|
||||||
|
|
||||||
|
## Struct versioning
|
||||||
|
|
||||||
|
Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing
|
||||||
|
over time. The versioning ensures that the plugin and the NCCL core are compatible.
|
||||||
|
|
||||||
|
Plugins are encouraged to provide multiple of those symbols, implementing multiple versions
|
||||||
|
of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL
|
||||||
|
versions.
|
||||||
|
|
||||||
|
Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
|
||||||
|
for the latest ncclNet struct version, but also looking for older ones so that older plugins
|
||||||
|
would still work.
|
||||||
|
|
||||||
|
## In-network collective operations, a.k.a. collNet
|
||||||
|
|
||||||
|
In addition to the ncclNet structure, network plugins can provide a collNet structure which
|
||||||
|
implements in-network collective operations, if supported. That can be used by the NCCL collNet
|
||||||
|
algorithm to accelerate inter-node reductions in allReduce.
|
||||||
|
|
||||||
|
The collNet struct is a different, optional struct provided by the network plugin, but its
|
||||||
|
versioning is tied to the ncclNet struct and many functions are common between the two to
|
||||||
|
ease the implementation.
|
||||||
|
|
||||||
|
## Headers management
|
||||||
|
|
||||||
|
To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions
|
||||||
|
they support to their internal includes. An example is shown in `ext-net/example/` where we keep
|
||||||
|
all headers in the `nccl/` directory and provide thin layers to implement old versions on top
|
||||||
|
of newer ones.
|
||||||
|
|
||||||
|
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
|
||||||
|
from old API versions. It also provides error codes in `err.h`.
|
||||||
|
|
||||||
|
# API (v6)
|
||||||
|
|
||||||
|
Below is the main `ncclNet_v6` struct. Each function is explained in later sections.
|
||||||
|
|
||||||
|
```
|
||||||
|
typedef struct {
|
||||||
|
// Name of the network (mainly for logs)
|
||||||
|
const char* name;
|
||||||
|
// Initialize the network.
|
||||||
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
|
// Return the number of adapters.
|
||||||
|
ncclResult_t (*devices)(int* ndev);
|
||||||
|
// Get various device properties.
|
||||||
|
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||||
|
// Create a receiving object and provide a handle to connect to it. The
|
||||||
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||||
|
// between ranks to create a connection.
|
||||||
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||||
|
// Connect to a handle and return a sending comm object for that peer.
|
||||||
|
// This call must not block for the connection to be established, and instead
|
||||||
|
// should return successfully with sendComm == NULL with the expectation that
|
||||||
|
// it will be called again until sendComm != NULL.
|
||||||
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||||
|
// Finalize connection establishment after remote peer has called connect.
|
||||||
|
// This call must not block for the connection to be established, and instead
|
||||||
|
// should return successfully with recvComm == NULL with the expectation that
|
||||||
|
// it will be called again until recvComm != NULL.
|
||||||
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||||
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
|
/* DMA-BUF support */
|
||||||
|
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||||
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
|
// Asynchronous send to a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||||
|
// Asynchronous recv from a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||||
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
// visible to the GPU
|
||||||
|
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||||
|
// Test whether a request is complete. If sizes is not NULL, it returns the
|
||||||
|
// number of bytes sent/received.
|
||||||
|
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||||
|
// Close and free send/recv comm objects
|
||||||
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
|
} ncclNet_v6_t;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error codes
|
||||||
|
|
||||||
|
All plugin functions use NCCL error codes as return values. `ncclSuccess` should be returned upon
|
||||||
|
success.
|
||||||
|
|
||||||
|
Otherwise, plugins can return one of the following:
|
||||||
|
- `ncclSystemError` is the most common error for network plugins, when a call to the linux kernel
|
||||||
|
or a system library fails. This typically includes all network/hardware errors.
|
||||||
|
- `ncclInternalError` is returned when the NCCL core code is using the network plugin in an
|
||||||
|
incorrect way, for example allocating more requests than it should, or passing an invalid argument
|
||||||
|
to calls.
|
||||||
|
- `ncclInvalidUsage` should be returned when the error is most likely a user error. This can
|
||||||
|
include misconfiguration, but also sizes mismatch.
|
||||||
|
- `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by
|
||||||
|
the NCCL core layer.
|
||||||
|
- `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should
|
||||||
|
not need to rely on CUDA, this should not be common.
|
||||||
|
|
||||||
|
## Operation overview
|
||||||
|
|
||||||
|
NCCL will call the `init` function first, then query the number of network devices with the
|
||||||
|
`devices` function, getting each network device properties with `getProperties`.
|
||||||
|
|
||||||
|
To establish a connection between two network devices, NCCL will first call `listen` on the
|
||||||
|
receiving side, pass the returned handle to the sender side of the connection, and call `connect`
|
||||||
|
with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
|
||||||
|
establishment.
|
||||||
|
|
||||||
|
Once the connection is established, communication will be done using the functions `isend`,
|
||||||
|
`irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
|
||||||
|
all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
|
||||||
|
|
||||||
|
In certain conditions, `iflush` will be called after a receive call completes to allow the network
|
||||||
|
plugin to flush data and ensure the GPU will observe the newly written data.
|
||||||
|
|
||||||
|
To close the connections NCCL will call `closeListen` to close the object returned by `listen`,
|
||||||
|
`closeSend` to close the object returned by `connect` and `closeRecv` to close the object returned
|
||||||
|
by `accept`.
|
||||||
|
|
||||||
|
## API Functions
|
||||||
|
|
||||||
|
### Initialization
|
||||||
|
`name`
|
||||||
|
|
||||||
|
The `name` field should point to a character string with the name of the network plugin. This will
|
||||||
|
be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
|
||||||
|
|
||||||
|
Note: setting `NCCL_NET=<plugin name>` will ensure a specific network implementation is used, with
|
||||||
|
a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the
|
||||||
|
`libnccl-net.so` library name to load.
|
||||||
|
|
||||||
|
`init`
|
||||||
|
|
||||||
|
As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function.
|
||||||
|
This will allow the plugin to discover network devices and make sure they are usable. If the
|
||||||
|
`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
|
||||||
|
internal ones.
|
||||||
|
|
||||||
|
To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
|
||||||
|
function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
|
||||||
|
the plugin code adding the following definitions:
|
||||||
|
|
||||||
|
```
|
||||||
|
#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||||
|
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||||
|
```
|
||||||
|
|
||||||
|
`devices`
|
||||||
|
|
||||||
|
Once the plugin is initialized, NCCL will query the number of devices available. It should not
|
||||||
|
be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init`
|
||||||
|
function should not return `ncclSuccess`.
|
||||||
|
|
||||||
|
`getProperties`
|
||||||
|
|
||||||
|
Right after getting the number of devices, NCCL will query properties for each available network
|
||||||
|
device. These properties are critical when multiple adapters are present to ensure NCCL uses each
|
||||||
|
adapter in the most optimized way.
|
||||||
|
|
||||||
|
The `name` is only used for logging.
|
||||||
|
|
||||||
|
The `pciPath` is the base for all topology detection and should point to the PCI device directory
|
||||||
|
in /sys. This is typically the directory pointed to by `/sys/class/net/eth0/device` or
|
||||||
|
`/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should
|
||||||
|
be `NULL`.
|
||||||
|
|
||||||
|
The `guid` field is used to determine when network adapters are connected to multiple PCI
|
||||||
|
endpoints. For normal cases, it can be set to the device number. If multiple network devices have
|
||||||
|
the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence
|
||||||
|
it will not use the port multiple times.
|
||||||
|
|
||||||
|
The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be
|
||||||
|
set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin
|
||||||
|
supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
|
||||||
|
provide a `regMrDmaBuf` function.
|
||||||
|
|
||||||
|
The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
|
||||||
|
important to ensure proper optimization of flows within the node.
|
||||||
|
|
||||||
|
The `port` field indicates the port number. This is important again for topology detection and flow
|
||||||
|
optimization within the node when a NIC with a single PCI connection is connected to the fabric
|
||||||
|
with multiple ports.
|
||||||
|
|
||||||
|
The `latency` field indicates the network latency in microseconds. This can be useful to improve
|
||||||
|
the NCCL tuning and make sure NCCL switches from tree to ring at the right size.
|
||||||
|
|
||||||
|
The `maxComms` field indicates the maximum number of connections we can create.
|
||||||
|
|
||||||
|
The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
|
||||||
|
receive).
|
||||||
|
|
||||||
|
### Connection establishment
|
||||||
|
|
||||||
|
Connections are used in a unidirectional manner. There is therefore a sender side and a receiver
|
||||||
|
side.
|
||||||
|
|
||||||
|
`listen`
|
||||||
|
|
||||||
|
To create a connection, NCCL will start by calling `listen` on the receiver side. This function
|
||||||
|
takes a device number as input argument, and should return a local `listenComm` object, and a
|
||||||
|
`handle` to pass to the other side, so that the sender side can connect to the receiver.
|
||||||
|
|
||||||
|
The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
|
||||||
|
|
||||||
|
This call should never block, but contrary to `connect` and `accept`, `listenComm` should never
|
||||||
|
be `NULL` if the call succeeds.
|
||||||
|
|
||||||
|
`connect`
|
||||||
|
|
||||||
|
NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call
|
||||||
|
`connect` on the sender side on a given device index `dev`, providing the `handle`. `connect`
|
||||||
|
should not block either, and instead set `sendComm` to `NULL` and return `ncclSuccess`. In that
|
||||||
|
case, NCCL will call `connect` again until it succeeds.
|
||||||
|
|
||||||
|
`accept`
|
||||||
|
|
||||||
|
To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by
|
||||||
|
the `listen` call previously. If the sender did not connect yet, `accept` should not block. It
|
||||||
|
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
|
||||||
|
succeeds.
|
||||||
|
|
||||||
|
`closeListen`/`closeSend`/`closeRecv`
|
||||||
|
|
||||||
|
Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
|
||||||
|
`closeListen`/`closeSend`/`closeRecv` to free the associated resources.
|
||||||
|
|
||||||
|
### Communication
|
||||||
|
|
||||||
|
Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`.
|
||||||
|
To support RDMA capabilities, buffer registration and flush functions are provided.
|
||||||
|
|
||||||
|
To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL,
|
||||||
|
then queried with `test`. Each `sendComm` or `recvComm` must be able to handle
|
||||||
|
`NCCL_NET_MAX_REQUESTS` requests in parallel.
|
||||||
|
|
||||||
|
Note: That value should be multiplied by the multi-receive capability of the plugin for the sender
|
||||||
|
side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening
|
||||||
|
in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each
|
||||||
|
`sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.
|
||||||
|
|
||||||
|
`regMr`
|
||||||
|
|
||||||
|
Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for
|
||||||
|
communication. It will provide a `sendComm` or `recvComm` as `comm` argument, then the buffer
|
||||||
|
pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network
|
||||||
|
supports CUDA pointers.
|
||||||
|
|
||||||
|
The network plugin can use the output argument `mhandle` to keep any reference to that memory
|
||||||
|
registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and
|
||||||
|
`deregMr` calls.
|
||||||
|
|
||||||
|
`regMrDmaBuf`
|
||||||
|
|
||||||
|
If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf`
|
||||||
|
instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.
|
||||||
|
|
||||||
|
|
||||||
|
`deregMr`
|
||||||
|
|
||||||
|
When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin
|
||||||
|
free resources. This function is used to deregister handles returned by both `regMr` and
|
||||||
|
`regMrDmaBuf`.
|
||||||
|
|
||||||
|
`isend`
|
||||||
|
|
||||||
|
Data will be sent through the connection using `isend`, passing the `sendComm` previously
|
||||||
|
created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be
|
||||||
|
used if the network supports multi-receive operations (see `irecv`) to distinguish between
|
||||||
|
different sends matching the same multi-receive. Otherwise it can be set to 0.
|
||||||
|
|
||||||
|
The `isend` operation returns a handle in the `request` argument for further calls to `test`. If
|
||||||
|
the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
|
||||||
|
`isend` again later.
|
||||||
|
|
||||||
|
`irecv`
|
||||||
|
|
||||||
|
To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
|
||||||
|
`n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a
|
||||||
|
single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles`
|
||||||
|
arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend`
|
||||||
|
operations is received into the right buffer.
|
||||||
|
|
||||||
|
If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer,
|
||||||
|
otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are
|
||||||
|
handled by a single request handle.
|
||||||
|
|
||||||
|
The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
|
||||||
|
The contrary (receive size being lower than the send size) is an error, however.
|
||||||
|
|
||||||
|
Note: for a given connection, send/receive operations should always match in the order they were
|
||||||
|
posted. Tags provided for receive operations are only used to assign a given send operation to one
|
||||||
|
of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
|
||||||
|
matching on any receive operation posted.
|
||||||
|
|
||||||
|
`test`
|
||||||
|
|
||||||
|
After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles
|
||||||
|
until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the
|
||||||
|
real size sent or received, the latter being potentially lower than the size passed to `irecv`.
|
||||||
|
|
||||||
|
In the case of a multi-receive, all receives will be considered as done as a single operation (the
|
||||||
|
goal being to allow aggregation), hence they share a single request and a single `done` status.
|
||||||
|
However, they can have different sizes, so when `done` is non-zero, the `sizes` array should
|
||||||
|
contain the `n` sizes corresponding to the buffers passed to `irecv`.
|
||||||
|
|
||||||
|
Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never
|
||||||
|
call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`).
|
||||||
|
|
||||||
|
`iflush`
|
||||||
|
|
||||||
|
After a receive operation completes, if the operation was targeting GPU memory and received a
|
||||||
|
non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure
|
||||||
|
the GPU can read it right after without seeing stale data. This flush operation is decoupled from
|
||||||
|
the `test` code to improve latency of `LL*` protocols, as those are capable of determining when
|
||||||
|
data is valid or not.
|
||||||
|
|
||||||
|
`iflush` returns a request which needs to be queried with `test` until it completes.
|
@ -1,80 +0,0 @@
|
|||||||
/*************************************************************************
|
|
||||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
|
||||||
*
|
|
||||||
* See LICENSE.txt for license information
|
|
||||||
************************************************************************/
|
|
||||||
|
|
||||||
#include <nccl.h>
|
|
||||||
#include <nccl_net.h>
|
|
||||||
|
|
||||||
#define __hidden __attribute__ ((visibility("hidden")))
|
|
||||||
|
|
||||||
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
|
|
||||||
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
|
|
||||||
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
|
|
||||||
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
|
|
||||||
|
|
||||||
ncclNet_t NCCL_PLUGIN_SYMBOL = {
|
|
||||||
"Dummy",
|
|
||||||
pluginInit,
|
|
||||||
pluginDevices,
|
|
||||||
pluginPciPath,
|
|
||||||
pluginPtrSupport,
|
|
||||||
pluginListen,
|
|
||||||
pluginConnect,
|
|
||||||
pluginAccept,
|
|
||||||
pluginRegMr,
|
|
||||||
pluginDeregMr,
|
|
||||||
pluginIsend,
|
|
||||||
pluginIrecv,
|
|
||||||
pluginFlush,
|
|
||||||
pluginTest,
|
|
||||||
pluginCloseSend,
|
|
||||||
pluginCloseRecv,
|
|
||||||
pluginCloseListen
|
|
||||||
};
|
|
||||||
|
|
||||||
__hidden ncclResult_t pluginCollNetInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
|
|
||||||
__hidden ncclResult_t pluginCollNetDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
|
|
||||||
__hidden ncclResult_t pluginCollNetPciPath(int dev, char** path) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
|
|
||||||
__hidden ncclResult_t pluginCollNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetFlush(void* collComm, void* data, int size, void* mhandle) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetTest(void* request, int* done, int* size) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetCloseColl(void* collComm) { return ncclInternalError; }
|
|
||||||
__hidden ncclResult_t pluginCollNetCloseListen(void* listenComm) { return ncclInternalError; }
|
|
||||||
|
|
||||||
ncclCollNet_t NCCL_COLLNET_PLUGIN_SYMBOL = {
|
|
||||||
"Dummy",
|
|
||||||
pluginCollNetInit,
|
|
||||||
pluginCollNetDevices,
|
|
||||||
pluginCollNetPciPath,
|
|
||||||
pluginCollNetPtrSupport,
|
|
||||||
pluginCollNetListen,
|
|
||||||
pluginCollNetConnect,
|
|
||||||
pluginCollNetReduceSupport,
|
|
||||||
pluginCollNetRegMr,
|
|
||||||
pluginCollNetDeregMr,
|
|
||||||
pluginCollNetIallreduce,
|
|
||||||
pluginCollNetFlush,
|
|
||||||
pluginCollNetTest,
|
|
||||||
pluginCollNetCloseColl,
|
|
||||||
pluginCollNetCloseListen
|
|
||||||
};
|
|
16
ext-net/example/nccl/err.h
Normal file
16
ext-net/example/nccl/err.h
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_ERR_H_
|
||||||
|
#define NCCL_ERR_H_
|
||||||
|
|
||||||
|
/* Error type for plugins */
|
||||||
|
/*
 * Error type for plugins: the result code returned by every plugin entry
 * point. Each value is pinned explicitly so the numbering does not shift
 * when codes are added.
 *
 *   ncclSuccess            - the call succeeded.
 *   ncclUnhandledCudaError - an error came from CUDA.
 *   ncclSystemError        - a kernel or system library call failed
 *                            (typically network/hardware errors).
 *   ncclInternalError      - the caller used the plugin incorrectly.
 *   ncclInvalidArgument    - an argument was invalid.
 *   ncclInvalidUsage       - most likely a user error (misconfiguration,
 *                            size mismatch). The plugin documentation lists
 *                            this code as a valid return value, so it has to
 *                            be declared here; it fills the previously unused
 *                            value 5.
 *   ncclRemoteError        - NOTE(review): presumably an error reported by
 *                            or about the remote peer; confirm against nccl.h.
 */
typedef enum { ncclSuccess            = 0,
               ncclUnhandledCudaError = 1,
               ncclSystemError        = 2,
               ncclInternalError      = 3,
               ncclInvalidArgument    = 4,
               ncclInvalidUsage       = 5,
               ncclRemoteError        = 6 } ncclResult_t;
|
||||||
|
|
||||||
|
#endif
|
33
ext-net/example/nccl/net.h
Normal file
33
ext-net/example/nccl/net.h
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_NET_H_
|
||||||
|
#define NCCL_NET_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "err.h"
|
||||||
|
|
||||||
|
#define NCCL_NET_HANDLE_MAXSIZE 128
|
||||||
|
|
||||||
|
#define NCCL_PTR_HOST 0x1
|
||||||
|
#define NCCL_PTR_CUDA 0x2
|
||||||
|
#define NCCL_PTR_DMABUF 0x4
|
||||||
|
|
||||||
|
// Maximum number of requests per comm object
|
||||||
|
#define NCCL_NET_MAX_REQUESTS 8
|
||||||
|
|
||||||
|
/* Log severity levels, numbered consecutively from 0. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;
|
||||||
|
/* Log subsystem tags. Each tag is a distinct bit; NCCL_ALL (~0) has every bit set. */
typedef enum {
  NCCL_INIT   = 1 << 0,
  NCCL_COLL   = 1 << 1,
  NCCL_P2P    = 1 << 2,
  NCCL_SHM    = 1 << 3,
  NCCL_NET    = 1 << 4,
  NCCL_GRAPH  = 1 << 5,
  NCCL_TUNING = 1 << 6,
  NCCL_ENV    = 1 << 7,
  NCCL_ALLOC  = 1 << 8,
  NCCL_CALL   = 1 << 9,
  NCCL_ALL    = ~0
} ncclDebugLogSubSys;
|
||||||
|
|
||||||
|
// Logging callback type. Arguments, in order: severity level, a flags word
// (presumably a mask of ncclDebugLogSubSys values -- TODO confirm), a source
// location as a string plus a line number, then a format string followed by
// its variadic arguments.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||||
|
|
||||||
|
#include "net_v6.h"
|
||||||
|
#include "net_v5.h"
|
||||||
|
#include "net_v4.h"
|
||||||
|
#include "net_v3.h"
|
||||||
|
#include "net_v2.h"
|
||||||
|
|
||||||
|
#endif // end include guard
|
50
ext-net/example/nccl/net_v2.h
Normal file
50
ext-net/example/nccl/net_v2.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_NET_V2_H_
|
||||||
|
#define NCCL_NET_V2_H_
|
||||||
|
|
||||||
|
// Version 2 of the NCCL network plugin interface: a name plus a table of
// function pointers. Member order is part of the binary interface.
typedef struct {
|
||||||
|
// Name of the network (mainly for logs)
|
||||||
|
const char* name;
|
||||||
|
// Initialize the network.
|
||||||
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
|
// Return the number of adapters.
|
||||||
|
ncclResult_t (*devices)(int* ndev);
|
||||||
|
// Return the device path in /sys. NCCL will call free on this path.
|
||||||
|
ncclResult_t (*pciPath)(int dev, char** path);
|
||||||
|
// Return whether this device supports host pointers and/or CUDA pointers
|
||||||
|
// as data from the current GPU. Supported types should be composed with
|
||||||
|
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
|
||||||
|
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
|
||||||
|
// Create a receiving object and provide a handle to connect to it. The
|
||||||
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||||
|
// between ranks to create a connection.
|
||||||
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||||
|
// Connect to a handle and return a sending comm object for that peer.
|
||||||
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||||
|
// Finalize connection establishment after remote peer has called connect
|
||||||
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
|
// Asynchronous send to a peer. No pointer type is passed here; the type is given to regMr, whose mhandle is passed in.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Asynchronous recv from a peer. No pointer type is passed here; the type is given to regMr, whose mhandle is passed in.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
// visible to the GPU. Unlike isend/irecv, this call takes no request argument.
|
||||||
|
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
|
||||||
|
// Test whether a request is complete. If size is not NULL, it returns the
|
||||||
|
// number of bytes sent/received.
|
||||||
|
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||||
|
// Close and free send/recv/listen comm objects
|
||||||
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
|
} ncclNet_v2_t;
|
||||||
|
|
||||||
|
#endif // end include guard
|
51
ext-net/example/nccl/net_v3.h
Normal file
51
ext-net/example/nccl/net_v3.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_NET_V3_H_
|
||||||
|
#define NCCL_NET_V3_H_
|
||||||
|
|
||||||
|
#define NCCL_NET_HANDLE_MAXSIZE_V3 64
|
||||||
|
#define NCCL_NET_MAX_REQUESTS_V3 16
|
||||||
|
|
||||||
|
typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
|
||||||
|
typedef struct {
|
||||||
|
// Name of the network (mainly for logs)
|
||||||
|
const char* name;
|
||||||
|
// Initialize the network.
|
||||||
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
|
// Return the number of adapters.
|
||||||
|
ncclResult_t (*devices)(int* ndev);
|
||||||
|
// Get various device properties.
|
||||||
|
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
|
||||||
|
// Create a receiving object and provide a handle to connect to it. The
|
||||||
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||||
|
// between ranks to create a connection.
|
||||||
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||||
|
// Connect to a handle and return a sending comm object for that peer.
|
||||||
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||||
|
// Finalize connection establishment after remote peer has called connectHandle
|
||||||
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||||
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
|
// Asynchronous send to a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Asynchronous recv from a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
// visible to the GPU
|
||||||
|
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
|
||||||
|
// Test whether a request is complete. If size is not NULL, it returns the
|
||||||
|
// number of bytes sent/received.
|
||||||
|
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||||
|
// Close and free send/recv comm objects
|
||||||
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
|
} ncclNet_v3_t;
|
||||||
|
|
||||||
|
#endif // end include guard
|
59
ext-net/example/nccl/net_v4.h
Normal file
59
ext-net/example/nccl/net_v4.h
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_NET_V4_H_
|
||||||
|
#define NCCL_NET_V4_H_
|
||||||
|
|
||||||
|
// Per-device properties reported by getProperties() in API versions 3 and 4.
typedef struct {
  // Device name; used mostly for logging.
  char *name;
  // Path to the PCI device in /sys.
  char *pciPath;
  // Unique identifier for the NIC chip. Important for cards exposing
  // multiple PCI functions (physical or virtual).
  uint64_t guid;
  // Supported pointer types: NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA.
  int ptrSupport;
  // Port speed in Mbps.
  int speed;
  // Port number.
  int port;
  // Maximum number of comms that can be created.
  int maxComms;
} ncclNetProperties_v4_t;
|
||||||
|
|
||||||
|
// v4 struct for backwards compatibility
|
||||||
|
typedef struct {
|
||||||
|
// Name of the network (mainly for logs)
|
||||||
|
const char* name;
|
||||||
|
// Initialize the network.
|
||||||
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
|
// Return the number of adapters.
|
||||||
|
ncclResult_t (*devices)(int* ndev);
|
||||||
|
// Get various device properties.
|
||||||
|
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
|
||||||
|
// Create a receiving object and provide a handle to connect to it. The
|
||||||
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||||
|
// between ranks to create a connection.
|
||||||
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||||
|
// Connect to a handle and return a sending comm object for that peer.
|
||||||
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||||
|
// Finalize connection establishment after remote peer has called connectHandle
|
||||||
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||||
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
|
// Asynchronous send to a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Asynchronous recv from a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
// visible to the GPU
|
||||||
|
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||||
|
// Test whether a request is complete. If size is not NULL, it returns the
|
||||||
|
// number of bytes sent/received.
|
||||||
|
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||||
|
// Close and free send/recv comm objects
|
||||||
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
|
} ncclNet_v4_t;
|
||||||
|
|
||||||
|
#endif // end include guard
|
54
ext-net/example/nccl/net_v5.h
Normal file
54
ext-net/example/nccl/net_v5.h
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_NET_V5_H_
|
||||||
|
#define NCCL_NET_V5_H_
|
||||||
|
|
||||||
|
typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
|
||||||
|
typedef struct {
|
||||||
|
// Name of the network (mainly for logs)
|
||||||
|
const char* name;
|
||||||
|
// Initialize the network.
|
||||||
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
|
// Return the number of adapters.
|
||||||
|
ncclResult_t (*devices)(int* ndev);
|
||||||
|
// Get various device properties.
|
||||||
|
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
|
||||||
|
// Create a receiving object and provide a handle to connect to it. The
|
||||||
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||||
|
// between ranks to create a connection.
|
||||||
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||||
|
// Connect to a handle and return a sending comm object for that peer.
|
||||||
|
// This call must not block for the connection to be established, and instead
|
||||||
|
// should return successfully with sendComm == NULL with the expectation that
|
||||||
|
// it will be called again until sendComm != NULL.
|
||||||
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||||
|
// Finalize connection establishment after remote peer has called connect.
|
||||||
|
// This call must not block for the connection to be established, and instead
|
||||||
|
// should return successfully with recvComm == NULL with the expectation that
|
||||||
|
// it will be called again until recvComm != NULL.
|
||||||
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||||
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
|
// Asynchronous send to a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||||
|
// Asynchronous recv from a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||||
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
// visible to the GPU
|
||||||
|
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||||
|
// Test whether a request is complete. If size is not NULL, it returns the
|
||||||
|
// number of bytes sent/received.
|
||||||
|
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||||
|
// Close and free send/recv comm objects
|
||||||
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
|
} ncclNet_v5_t;
|
||||||
|
|
||||||
|
#endif // end include guard
|
70
ext-net/example/nccl/net_v6.h
Normal file
70
ext-net/example/nccl/net_v6.h
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_NET_V6_H_
|
||||||
|
#define NCCL_NET_V6_H_
|
||||||
|
|
||||||
|
// Per-device properties reported by getProperties() in API versions 5 and 6.
typedef struct {
  // Device name; used mostly for logging.
  char *name;
  // Path to the PCI device in /sys.
  char *pciPath;
  // Unique identifier for the NIC chip. Important for cards exposing
  // multiple PCI functions (physical or virtual).
  uint64_t guid;
  // Supported pointer types: [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF].
  int ptrSupport;
  // Port speed in Mbps.
  int speed;
  // Port number.
  int port;
  // Network latency.
  float latency;
  // Maximum number of comms that can be created.
  int maxComms;
  // Maximum number of grouped receives.
  int maxRecvs;
} ncclNetProperties_v6_t;
|
||||||
|
|
||||||
|
typedef ncclNetProperties_v6_t ncclNetProperties_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
// Name of the network (mainly for logs)
|
||||||
|
const char* name;
|
||||||
|
// Initialize the network.
|
||||||
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
|
// Return the number of adapters.
|
||||||
|
ncclResult_t (*devices)(int* ndev);
|
||||||
|
// Get various device properties.
|
||||||
|
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||||
|
// Create a receiving object and provide a handle to connect to it. The
|
||||||
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||||
|
// between ranks to create a connection.
|
||||||
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||||
|
// Connect to a handle and return a sending comm object for that peer.
|
||||||
|
// This call must not block for the connection to be established, and instead
|
||||||
|
// should return successfully with sendComm == NULL with the expectation that
|
||||||
|
// it will be called again until sendComm != NULL.
|
||||||
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||||
|
// Finalize connection establishment after remote peer has called connect.
|
||||||
|
// This call must not block for the connection to be established, and instead
|
||||||
|
// should return successfully with recvComm == NULL with the expectation that
|
||||||
|
// it will be called again until recvComm != NULL.
|
||||||
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||||
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
|
/* DMA-BUF support */
|
||||||
|
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||||
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
|
// Asynchronous send to a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||||
|
// Asynchronous recv from a peer.
|
||||||
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
|
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||||
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
// visible to the GPU
|
||||||
|
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||||
|
// Test whether a request is complete. If size is not NULL, it returns the
|
||||||
|
// number of bytes sent/received.
|
||||||
|
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||||
|
// Close and free send/recv comm objects
|
||||||
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
|
} ncclNet_v6_t;
|
||||||
|
|
||||||
|
#endif // end include guard
|
21
ext-net/example/nccl/types.h
Normal file
21
ext-net/example/nccl/types.h
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Include guard named after this header (types.h). The previous guard,
// NCCL_ERR_H_, did not match the file and would make this header expand to
// nothing whenever another header guarding itself with NCCL_ERR_H_
// (presumably err.h -- TODO confirm) is included first.
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_

/* Data types */
// Element types; several enumerators are aliases sharing the same value.
typedef enum {
  ncclInt8     = 0, ncclChar   = 0,
  ncclUint8    = 1,
  ncclInt32    = 2, ncclInt    = 2,
  ncclUint32   = 3,
  ncclInt64    = 4,
  ncclUint64   = 5,
  ncclFloat16  = 6, ncclHalf   = 6,
  ncclFloat32  = 7, ncclFloat  = 7,
  ncclFloat64  = 8, ncclDouble = 8,
  ncclBfloat16 = 9,
} ncclDataType_t;

#endif // NCCL_TYPES_H_
|
200
ext-net/example/plugin.c
Normal file
200
ext-net/example/plugin.c
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#include <nccl/net.h>
|
||||||
|
|
||||||
|
#define __hidden __attribute__ ((visibility("hidden")))
|
||||||
|
|
||||||
|
int max_requests = NCCL_NET_MAX_REQUESTS;
|
||||||
|
|
||||||
|
// Example init hook: does nothing with the logger and reports success.
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) {
  (void)logFunction;
  return ncclSuccess;
}
|
||||||
|
// Example device enumeration: reports zero adapters through *ndev and
// succeeds.
__hidden ncclResult_t pluginDevices(int* ndev) {
  *ndev = 0;
  return ncclSuccess;
}
|
||||||
|
|
||||||
|
// Placeholder for the v2 pciPath entry point: leaves *path untouched and
// always fails with ncclInternalError.
__hidden ncclResult_t pluginPciPath(int dev, char** path) {
  (void)dev;
  (void)path;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for the v2 ptrSupport entry point: leaves *supportedTypes
// untouched and always fails with ncclInternalError.
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
  (void)dev;
  (void)supportedTypes;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for getProperties: writes nothing to *props and always fails
// with ncclInternalError.
//
// Hint for a real implementation: fill *props here, for instance by reusing
// the legacy helpers, e.g. pluginPciPath(dev, &props->pciPath) and
// pluginPtrSupport(dev, &props->ptrSupport).
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) {
  (void)dev;
  (void)props;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for listen: writes neither the handle nor *listenComm and
// always fails with ncclInternalError.
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) {
  (void)dev;
  (void)handle;
  (void)listenComm;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for connect: leaves *sendComm untouched and always fails with
// ncclInternalError.
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) {
  (void)dev;
  (void)handle;
  (void)sendComm;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for accept: leaves *recvComm untouched and always fails with
// ncclInternalError.
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) {
  (void)listenComm;
  (void)recvComm;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for regMr: registers nothing, leaves *mhandle untouched and
// always fails with ncclInternalError.
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) {
  (void)collComm;
  (void)data;
  (void)size;
  (void)type;
  (void)mhandle;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for regMrDmaBuf (DMA-BUF registration): registers nothing,
// leaves *mhandle untouched and always fails with ncclInternalError.
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
  (void)collComm;
  (void)data;
  (void)size;
  (void)type;
  (void)offset;
  (void)fd;
  (void)mhandle;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for deregMr: always fails with ncclInternalError.
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) {
  (void)collComm;
  (void)mhandle;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for isend: posts nothing, leaves *request untouched and always
// fails with ncclInternalError.
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
  (void)sendComm;
  (void)data;
  (void)size;
  (void)tag;
  (void)mhandle;
  (void)request;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for irecv: posts nothing, leaves *request untouched and always
// fails with ncclInternalError.
__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
  (void)recvComm;
  (void)n;
  (void)data;
  (void)sizes;
  (void)tags;
  (void)mhandles;
  (void)request;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for iflush: flushes nothing, leaves *request untouched and
// always fails with ncclInternalError.
__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
  (void)recvComm;
  (void)n;
  (void)data;
  (void)sizes;
  (void)mhandles;
  (void)request;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for test: writes neither *done nor *size and always fails with
// ncclInternalError.
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) {
  (void)request;
  (void)done;
  (void)size;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for closeSend: always fails with ncclInternalError.
__hidden ncclResult_t pluginCloseSend(void* sendComm) {
  (void)sendComm;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for closeRecv: always fails with ncclInternalError.
__hidden ncclResult_t pluginCloseRecv(void* recvComm) {
  (void)recvComm;
  return ncclInternalError;
}
|
||||||
|
// Placeholder for closeListen: always fails with ncclInternalError.
__hidden ncclResult_t pluginCloseListen(void* listenComm) {
  (void)listenComm;
  return ncclInternalError;
}
|
||||||
|
|
||||||
|
#define PLUGIN_NAME "Plugin"
|
||||||
|
|
||||||
|
const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||||
|
.name = PLUGIN_NAME,
|
||||||
|
.init = pluginInit,
|
||||||
|
.devices = pluginDevices,
|
||||||
|
.getProperties = pluginGetProperties,
|
||||||
|
.listen = pluginListen,
|
||||||
|
.connect = pluginConnect,
|
||||||
|
.accept = pluginAccept,
|
||||||
|
.regMr = pluginRegMr,
|
||||||
|
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||||
|
.deregMr = pluginDeregMr,
|
||||||
|
.isend = pluginIsend,
|
||||||
|
.irecv = pluginIrecv,
|
||||||
|
.iflush = pluginIflush,
|
||||||
|
.test = pluginTest,
|
||||||
|
.closeSend = pluginCloseSend,
|
||||||
|
.closeRecv = pluginCloseRecv,
|
||||||
|
.closeListen = pluginCloseListen,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* v5 Compat */
|
||||||
|
const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||||
|
.name = PLUGIN_NAME,
|
||||||
|
.init = pluginInit,
|
||||||
|
.devices = pluginDevices,
|
||||||
|
.getProperties = pluginGetProperties,
|
||||||
|
.listen = pluginListen,
|
||||||
|
.connect = pluginConnect,
|
||||||
|
.accept = pluginAccept,
|
||||||
|
.regMr = pluginRegMr,
|
||||||
|
.deregMr = pluginDeregMr,
|
||||||
|
.isend = pluginIsend,
|
||||||
|
.irecv = pluginIrecv,
|
||||||
|
.iflush = pluginIflush,
|
||||||
|
.test = pluginTest,
|
||||||
|
.closeSend = pluginCloseSend,
|
||||||
|
.closeRecv = pluginCloseRecv,
|
||||||
|
.closeListen = pluginCloseListen,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* v4 Compat */
|
||||||
|
// v4 adapter for getProperties: queries the native (v6) properties and copies
// the fields that exist in the v4 layout. latency and maxRecvs have no v4
// counterpart and are dropped. On failure *props is left untouched and the
// native error code is returned.
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
  ncclNetProperties_v6_t native;
  const ncclResult_t status = pluginGetProperties(dev, &native);

  if (status == ncclSuccess) {
    props->name       = native.name;
    props->pciPath    = native.pciPath;
    props->guid       = native.guid;
    props->ptrSupport = native.ptrSupport;
    props->speed      = native.speed;
    props->port       = native.port;
    props->maxComms   = native.maxComms;
  }
  return status;
}
|
||||||
|
// v4 adapter for isend: the v4 signature carries no tag, so the native call
// is made with tag 0.
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
  const int untagged = 0;
  return pluginIsend(sendComm, data, size, untagged, mhandle, request);
}
|
||||||
|
// v4 adapter for irecv: wraps the single buffer into one-element arrays and
// issues a native receive with n == 1 and tag 0.
static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
  void* buffers[1] = { data };
  int   lengths[1] = { size };
  int   tags[1]    = { 0 };
  void* handles[1] = { mhandle };

  return pluginIrecv(recvComm, 1, buffers, lengths, tags, handles, request);
}
|
||||||
|
// v4 adapter for iflush: wraps the single buffer into one-element arrays and
// issues a native flush with n == 1.
static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
  void* buffers[1] = { data };
  int   lengths[1] = { size };
  void* handles[1] = { mhandle };

  return pluginIflush(recvComm, 1, buffers, lengths, handles, request);
}
|
||||||
|
static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
|
||||||
|
ncclResult_t ret;
|
||||||
|
do {
|
||||||
|
ret = pluginConnect(dev, handle, sendComm);
|
||||||
|
} while (ret == ncclSuccess && *sendComm == NULL);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
|
||||||
|
ncclResult_t ret;
|
||||||
|
do {
|
||||||
|
ret = pluginAccept(listenComm, recvComm);
|
||||||
|
} while (ret == ncclSuccess && *recvComm == NULL);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
const ncclNet_v4_t ncclNetPlugin_v4 = {
|
||||||
|
.name = PLUGIN_NAME,
|
||||||
|
.init = pluginInit,
|
||||||
|
.devices = pluginDevices,
|
||||||
|
.getProperties = pluginGetProperties_v4,
|
||||||
|
.listen = pluginListen,
|
||||||
|
.connect = pluginConnect_v4,
|
||||||
|
.accept = pluginAccept_v4,
|
||||||
|
.regMr = pluginRegMr,
|
||||||
|
.deregMr = pluginDeregMr,
|
||||||
|
.isend = pluginIsend_v4,
|
||||||
|
.irecv = pluginIrecv_v4,
|
||||||
|
.iflush = pluginIflush_v4,
|
||||||
|
.test = pluginTest,
|
||||||
|
.closeSend = pluginCloseSend,
|
||||||
|
.closeRecv = pluginCloseRecv,
|
||||||
|
.closeListen = pluginCloseListen,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* v3 Compat */
|
||||||
|
static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) {
|
||||||
|
void* req;
|
||||||
|
ncclResult_t ret = pluginIflush_v4(recvComm, data, size, mhandle, &req);
|
||||||
|
int done = 0;
|
||||||
|
while (ret == ncclSuccess && done == 0) {
|
||||||
|
ret = pluginTest(req, &done, NULL);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
// v3/v2 adapter for init: lowers the request limit to the v3 value before
// running the regular initialization, whose status is returned.
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
  ncclResult_t status;

  max_requests = NCCL_NET_MAX_REQUESTS_V3;
  status = pluginInit(logFunction);
  return status;
}
|
||||||
|
#include <string.h>
|
||||||
|
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
|
||||||
|
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
|
||||||
|
ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
|
||||||
|
memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
|
||||||
|
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
|
||||||
|
memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3);
|
||||||
|
return pluginConnect_v4(dev, &pluginHandle, sendComm);
|
||||||
|
}
|
||||||
|
const ncclNet_v3_t ncclNetPlugin_v3 = {
|
||||||
|
.name = PLUGIN_NAME,
|
||||||
|
.init = pluginInit_v3,
|
||||||
|
.devices = pluginDevices,
|
||||||
|
.getProperties = pluginGetProperties_v4,
|
||||||
|
.listen = pluginListen_v3,
|
||||||
|
.connect = pluginConnect_v3,
|
||||||
|
.accept = pluginAccept_v4,
|
||||||
|
.regMr = pluginRegMr,
|
||||||
|
.deregMr = pluginDeregMr,
|
||||||
|
.isend = pluginIsend_v4,
|
||||||
|
.irecv = pluginIrecv_v4,
|
||||||
|
.flush = pluginFlush,
|
||||||
|
.test = pluginTest,
|
||||||
|
.closeSend = pluginCloseSend,
|
||||||
|
.closeRecv = pluginCloseRecv,
|
||||||
|
.closeListen = pluginCloseListen,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* v2 Compat */
|
||||||
|
const ncclNet_v2_t ncclNetPlugin_v2 = {
|
||||||
|
.name = PLUGIN_NAME,
|
||||||
|
.init = pluginInit_v3,
|
||||||
|
.devices = pluginDevices,
|
||||||
|
.pciPath = pluginPciPath,
|
||||||
|
.ptrSupport = pluginPtrSupport,
|
||||||
|
.listen = pluginListen,
|
||||||
|
.connect = pluginConnect_v4,
|
||||||
|
.accept = pluginAccept_v4,
|
||||||
|
.regMr = pluginRegMr,
|
||||||
|
.deregMr = pluginDeregMr,
|
||||||
|
.isend = pluginIsend_v4,
|
||||||
|
.irecv = pluginIrecv_v4,
|
||||||
|
.flush = pluginFlush,
|
||||||
|
.test = pluginTest,
|
||||||
|
.closeSend = pluginCloseSend,
|
||||||
|
.closeRecv = pluginCloseRecv,
|
||||||
|
.closeListen = pluginCloseListen,
|
||||||
|
};
|
Loading…
x
Reference in New Issue
Block a user