diff --git a/ext-net/README.md b/ext-net/README.md index aa1a394..90fe89b 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,20 +60,20 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v9) +# API (v10) -Below is the main `ncclNet_v9` struct. Each function is explained in later sections. +Below is the main `ncclNet_v10` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,13 +83,13 @@ typedef struct { // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -98,10 +98,10 @@ typedef struct { ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -200,6 +200,9 @@ the plugin code adding the following definitions: #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) ``` +The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and +record its own events with the NCCL profiler plugin. + `devices` Once the plugin is initialized, NCCL will query the number of devices available. It should not @@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. +The `connect` API takes a `ncclNetCommConfig_t`, which contains a `trafficClass` field. +This field can be used by the network plugin to specify the QoS level of the connection. By default, +`trafficClass` is set to -1 but can be configured by the application during communicator initialization +to select a plugin-supported QoS level. + `closeListen`/`closeSend`/`closeRecv` Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call @@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later. +The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin +to support network-defined events. + `irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument @@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to completions on such irecvs (for example, complete the request immediately). The plugin is still expected to set a valid request pointer on return which NCCL can poll to check for completion. +The `pHandle` argument allows NCCL to pass an array of opaque handles that can be used by the +network plugin to support network-defined events. + Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 112967a..85ea79e 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -2,14 +2,15 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/ -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ +#ifndef NET_H_ +#define NET_H_ #include <stdint.h> #include <stdlib.h> #include "common.h" #include "err.h" +#include "net_device.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -22,6 +23,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + +#include "net_v10.h" #include "net_v9.h" #include "net_v8.h" #include "net_v7.h" @@ -31,4 +35,9 @@ #include "net_v3.h" #include "net_v2.h" +typedef ncclNet_v10_t ncclNet_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index 874fb59..d693101 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h new file mode 100644 index 0000000..809e7c0 --- /dev/null +++ b/ext-net/example/nccl/net_v10.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v2.h b/ext-net/example/nccl/net_v2.h index 0d9c906..dd9f39b 100644 --- a/ext-net/example/nccl/net_v2.h +++ b/ext-net/example/nccl/net_v2.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ -#ifndef NCCL_NET_V2_H_ -#define NCCL_NET_V2_H_ +#ifndef NET_V2_H_ +#define NET_V2_H_ typedef struct { // Name of the network (mainly for logs) diff --git a/ext-net/example/nccl/net_v3.h b/ext-net/example/nccl/net_v3.h index db1287b..9002165 100644 --- a/ext-net/example/nccl/net_v3.h +++ b/ext-net/example/nccl/net_v3.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V3_H_ -#define NCCL_NET_V3_H_ +#ifndef NET_V3_H_ +#define NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 diff --git a/ext-net/example/nccl/net_v4.h b/ext-net/example/nccl/net_v4.h index efe4824..41cef56 100644 --- a/ext-net/example/nccl/net_v4.h +++ b/ext-net/example/nccl/net_v4.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V4_H_ -#define NCCL_NET_V4_H_ +#ifndef NET_V4_H_ +#define NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 diff --git a/ext-net/example/nccl/net_v5.h b/ext-net/example/nccl/net_v5.h index b96b6fc..47f446c 100644 --- a/ext-net/example/nccl/net_v5.h +++ b/ext-net/example/nccl/net_v5.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V5_H_ -#define NCCL_NET_V5_H_ +#ifndef NET_V5_H_ +#define NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h index fffaf8c..de90f29 100644 --- a/ext-net/example/nccl/net_v6.h +++ b/ext-net/example/nccl/net_v6.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V6_H_ -#define NCCL_NET_V6_H_ - -#define NCCL_NET_MAX_REQUESTS_V6 8 +#ifndef NET_V6_H_ +#define NET_V6_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h index d607095..3802a3d 100644 --- a/ext-net/example/nccl/net_v7.h +++ b/ext-net/example/nccl/net_v7.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V7_H_ -#define NCCL_NET_V7_H_ - -#include "net_device.h" +#ifndef NET_V7_H_ +#define NET_V7_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 54a61f6..74eb72d 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V8_H_ -#define NCCL_NET_V8_H_ - -#include "net_device.h" +#ifndef NET_V8_H_ +#define NET_V8_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index 61035ec..ca60ad6 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -2,18 +2,14 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V9_H_ -#define NCCL_NET_V9_H_ - -#include "net_device.h" +#ifndef NET_V9_H_ +#define NET_V9_H_ #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 typedef struct { int ndevs; int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; } ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; typedef struct { char* name; // Used mostly for logging. 
@@ -35,8 +31,6 @@ typedef struct { size_t maxCollBytes; // Max transfer size for collective operations } ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; @@ -93,7 +87,7 @@ typedef struct { // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller // what index this new vNIC exists at - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); } ncclNet_v9_t; #endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 2852242..97a2987 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { #define PLUGIN_NAME "Plugin" -ncclNet_v9_t ncclNetPlugin_v9 = { +const ncclNet_v10_t 
ncclNetPlugin_v10 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = { .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { + return pluginInit(logFunction, NULL); +} + +__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { + return pluginGetProperties(dev, (ncclNetProperties_t*)props); +} + +__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ + return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); +} + +__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request); +} + +__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); +} + +__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; } + +const ncclNet_v9_t ncclNetPlugin_v9 = { + .name = PLUGIN_NAME, + .init = pluginInit_v9, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v9, + .listen = pluginListen, + .connect = pluginConnect_v9, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v9, + .irecv = pluginIrecv_v9, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v9, +}; + __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); @@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr } __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { - return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); + return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request); } __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; for (int i=0; i<n; i++) sizesOut[i] = sizes[i]; return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, NULL, request); } static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { diff --git a/makefiles/common.mk b/makefiles/common.mk index 1b1bb86..545203a 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -16,6 +16,7 @@ WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 +NET_PROFILER ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -137,3 +138,7 @@ endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif + +ifneq ($(NET_PROFILER), 0) +CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index b02cf90..df3ee5c 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 25 -NCCL_PATCH := 1 +NCCL_MINOR := 26 +NCCL_PATCH := 2 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b66ebef..65da630 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,11 +10,15 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ $(wildcard register/*.cc) \ + $(wildcard plugin/*.cc) \ + $(wildcard plugin/net/*.cc) \ + $(wildcard plugin/tuner/*.cc) \ + $(wildcard plugin/profiler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -49,6 +53,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl +INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest @@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 675bcfc..9e24faa 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -153,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -167,8 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - size_t size64 = size; - NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -484,7 +484,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { if (devOOB < 0) { pthread_mutex_lock(&bootstrapNetLock); if (devOOB < 0) { - char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME"); + const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv); bool searchNot = userIfEnv && userIfEnv[0] == '^'; @@ -540,7 +540,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -736,6 +736,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { rasRanks[rank].pid = getpid(); rasRanks[rank].cudaDev = comm->cudaDev; rasRanks[rank].nvmlDev = comm->nvmlDev; + rasRanks[rank].hostHash = getHostHash(); + 
rasRanks[rank].pidHash = getPidHash(); if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); // We should still participate in the ringAllInfo below as the peers will be waiting for us. @@ -967,7 +969,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s NCCLCHECK(socketAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size); NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail); - NCCLCHECK(ncclSocketClose(&sock)); + NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail); return ret; fail: (void)ncclSocketClose(&sock); @@ -1062,7 +1064,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manber, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ - int data[1]; + int data[1] = {0}; for (int mask = 1; mask < nranks; mask <<= 1) { int src = (rank - mask + nranks) % nranks; int dst = (rank + mask) % nranks; diff --git a/src/channel.cc b/src/channel.cc index b3a8f29..bc48986 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; - - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + cudaStream_t deviceStream; + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e.
network) @@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { - NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ - NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream)); ncclCommPushCudaFree(comm, channel->devPeers); NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers)); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr; } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); /* guarantee addr has been copied into channel->devPeers */ + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); - return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; + cudaStream_t deviceStream; if (channel->nvlsPeers != NULL) return ncclSuccess; @@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); int nvlsRanks = comm->localRanks; @@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks)); - NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream)); for (int r 
= 0; r < nvlsRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } @@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; + cudaStream_t deviceStream; if (channel->collnetPeers != NULL) return ncclSuccess; @@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); - NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } diff --git a/src/debug.cc b/src/debug.cc index 2ea6eab..2eb8d77 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -6,6 +6,7 @@ #include "core.h" #include "nccl_net.h" +#include #include #include #include @@ -16,6 +17,11 @@ #include "param.h" int ncclDebugLevel = -1; +static uint32_t ncclDebugTimestampLevels = 
0; // bitmaps of levels that have timestamps turned on +static char ncclDebugTimestampFormat[256]; // with space for subseconds +static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts +static uint64_t ncclDebugTimestampMaxSubseconds; // Max number of subseconds plus 1, used in duration ratio +static int ncclDebugTimestampSubsecondDigits; // Number of digits to display static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; @@ -112,6 +118,84 @@ static void ncclDebugInit() { ncclWarnSetDebugInfo = value; } + // Determine which debug levels will have timestamps. + const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + if (timestamps == nullptr) { + ncclDebugTimestampLevels = (1< sizeof(ncclDebugTimestampFormat) - 1) { + // Won't fit; fall back on the default. + break; + } + ncclDebugTimestampSubsecondsStart = i; + ncclDebugTimestampMaxSubseconds = 1; + + memcpy(ncclDebugTimestampFormat, tsFormat, i); + for (int j=0; j>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", - hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); } + len = std::min(len, sizeof(buffer)-1); // prevent overflows + // Add the message as given by the call site. va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len >= sizeof(buffer)) len = sizeof(buffer)-1; - if (len) { - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + // Rewind len so that we can replace the final \0 by "\n" + len = std::min(len, sizeof(buffer)-1); // prevent overflows + + // Add a newline and write it to the debug file. No terminating null is + // necessary since we write bytes instead of the string. + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } NCCL_API(void, ncclResetDebugInit); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 5d79d73..854ebbf 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -67,7 +67,7 @@ namespace { offset = dataOffset + rankDest * count; // Final wait/copy. 
- prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else if (inputBuf != outputBuf + ringRanks[0] * count) { inputBuf = inputBuf + partOffset; @@ -111,25 +111,63 @@ struct RunWorkColl struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; ipatSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); - PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend); + if (tid == nworkers) { // Algo computation thread + PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatAg); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patCopy(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step 
+= nGroups; + } } +#endif } }; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 2161597..81da554 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -78,7 +78,7 @@ namespace { offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } @@ -132,7 +132,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -215,7 +215,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -710,7 +710,7 @@ struct RunWorkCollchannels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } diff --git a/src/device/primitives.h b/src/device/primitives.h index 73c10c2..3b9f169 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -12,7 +12,7 @@ #include "common_kernel.h" #include "common.h" -#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 +#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primitives class instead of integral @@ -115,7 +115,7 @@ struct PrimitivesWithoutDirect { __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } - __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) { + __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { @@ -139,6 +139,18 @@ struct PrimitivesWithoutDirect { } }; +__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) { + if (abortCache & abortValue) return 1; + if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0; + spins = 0; + int abort = *ncclShmem.comm.abortFlag; + if (abort) { + ncclShmem.aborted = abort; + abortCache |= abortValue; + } + return abort; +} + #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 3e00f3b..2a0f556 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -51,23 +51,14 @@ class Primitives: } } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ?
stepLines*sizeof(union ncclLLFifoLine) : nbytes; @@ -102,7 +93,7 @@ class Primitives: int spins = 0; do { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; @@ -126,7 +117,7 @@ class Primitives: int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 617b7ac..6985e67 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -53,23 +53,14 @@ class Primitives: barrier_sync(15-group, nthreads); } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int i, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, wid, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes; @@ -201,7 +192,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, 0, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll @@ -248,7 +239,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, i, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 0051019..cf3ba9b 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -52,7 +52,7 @@ class Primitives< uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; - uint64_t accSize; // Accumulated size. 
Used by PAT operations + uint64_t accSize; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -70,6 +70,11 @@ class Primitives< } } + // PAT uses a single barrier across all groups + __device__ void patBarrier() { + barrier_sync(15, NCCL_PAT_NWORKERS); + } + __device__ bool barrierAny(int vote) { if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); @@ -87,18 +92,6 @@ class Primitives< } } - inline __device__ bool checkAbort(int &spins) { - spins++; - if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - flags |= Aborted; - ncclShmem.aborted = 1; - } - spins = 0; - } - return flags & Aborted; - } - inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { @@ -121,7 +114,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } @@ -338,13 +331,8 @@ public: peerPtr->recv[connIndex].step += steps; st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step); while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) { - if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - ncclShmem.aborted = 1; - break; - } - spins = 0; - } + int abort = 0; + if (checkAbort(abort, 1, spins)) break; } } @@ -359,7 +347,7 @@ public: int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; } void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts : ncclShmem.groups[group].srcs; @@ -601,13 +589,13 @@ private: tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { - // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); - int peer = -1; flags = 0; index = -1; if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers + // For send operations, we need an extra warp to overlap the threadfence and the copy + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); + int nrecv=0, nsend=0; // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] @@ -637,68 +625,84 @@ private: if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + + // Coverity thinks that index could be -1 here but that's not actually the case. + // coverity[negative_returns:FALSE] + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? 
p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); + + if (barrierAny(flags & NetDeviceUnpack)) { + flags |= AnyNetDeviceUnpack; + // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers + // have NetDeviceUnpack. + uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); + if (tid == 0) { + ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + } + } + + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n flags |= PatMode; - accSize = 0; + const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput }; + if (tid < 5) flags |= roles[tid]; + int nranks = ncclShmem.comm.nRanks; - int rank = ncclShmem.comm.rank; - // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer. - index = tid % 32; - uint32_t delta = 1 << index; - const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv}; - int block = tid / 32; - if (block < 4 && delta < nranks) { - int role = roles[block]; - if (mode == primsModePatRs) { - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks; - } else if (mode == primsModePatAg) { - if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks; - } - flags |= role; - } else if (tid == 128) { - flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation + if (tid < 32 && ((1UL<conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv; + peer->step = conn->step; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->tailPtr = conn->tail); + peer->headPtr = conn->head; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); + // Load send peer + int sendPeer = mode == primsModePatAg ? 
(rank - delta + nranks) % nranks : (rank + delta) % nranks; + peer = ((struct ncclPatPeer*)sendPeers)+tid; + conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend; + peer->step = conn->step; + peer->connFifo = conn->connFifo; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->headPtr = conn->head); + peer->tailPtr = conn->tail; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); } - } - - // Coverity thinks that index could be -1 here but that's not actually the case. - // coverity[negative_returns:FALSE] - int sendIpcReg; - int recvIpcReg; - int sendNetReg; - int recvNetReg; - if (P2p) { - sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; - recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; - sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; - recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; - } else { - recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; - recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; - } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); - - if (barrierAny(flags & NetDeviceUnpack)) { - flags |= AnyNetDeviceUnpack; - // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers - // have NetDeviceUnpack. - uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); - if (tid == 0) { - ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + if (tid==0) { + ncclShmem.groups[group].userInput = (void*)inputBuf; + ncclShmem.groups[group].userOutput = (void*)outputBuf; + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } + patBarrier(); } - - // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case - // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); - // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { + if (flags&PatMode) return; // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) conn->step = step; if ((flags & NetRegMode) && (flags & RoleWaitSend)) { @@ -708,7 +712,7 @@ private: uint64_t prevStep = step - StepPerSlice; volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size); int spins = 0; - while (*ptr != -1) if (checkAbort(spins)) break; + while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break; } if (flags & NetDeviceUnpack) { @@ -726,7 +730,7 @@ private: int spins = 0; volatile uint64_t* tail = conn->tail; volatile uint64_t* head = conn->head; - while (*tail > *head) if (checkAbort(spins)) break; + while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break; } } @@ -749,7 +753,7 @@ private: if (slot) { T* exchgPtr; directBuff = (T*)outputBuf; - while (*slot != nullptr && !checkAbort(spins)); + while (*slot != nullptr && !checkAbort(flags, Aborted, spins)); if (P2p) { exchgPtr = (T*)outputBuf; } else { @@ -766,7 +770,7 @@ private: void* ptr; while 
(slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot) { @@ -785,7 +789,7 @@ private: // Wait for consumer to consume previous value before trampling it. if (slot && argSlot0 && argSlot1) { T* exchgPtr; - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins)); + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; @@ -815,7 +819,7 @@ private: void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot && argSlot0 && argSlot1) { @@ -826,7 +830,7 @@ private: while (true) { arg0 = *argSlot0; arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break; } ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } @@ -866,8 +870,8 @@ private: __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp); + __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); @@ -945,54 +949,65 @@ private: ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } - __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset; - int spins = 0; - while (connStepCache < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - if (postRecv) step += StepPerSlice; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { - int spins = 0; - while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset; - if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { - // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize; - if (flags & ConnFifoEnabled) - connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - } else { - // There is already data in there, accumulate instead of writing to it. - ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; - } - if (postSend) step += StepPerSlice; + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; } - if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer - ncclShmem.groups[group].dsts[0] = userOutput + outIx; - if (accSize < outIx + nelem) { + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; + int spins = 0; + while (peer->stepCache < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + } + if (send && (flags & RoleWaitSend)) { + int spins = 0; + while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = outIx + nelem; + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; } else { // There is already data in there, accumulate instead of writing to it. ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } } - barrier(); + long long int localAccSize = shmem->localAccSize; + if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer + ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx; + if (localAccSize < ps->outIx + nelem) { + // New data, add our own data to it. 
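// (Sketch of the accSize bookkeeping used here, with an illustrative name
// "firstTouch" that does not appear in this file: accSize records the highest
// offset already produced into this destination during the current operation,
// so
//   bool firstTouch = accSize < offset + nelem;
//   srcs[1] = firstTouch ? userInput + inpIx  // seed the chunk with our data
//                        : dsts[0];           // otherwise reduce into it
// the first writer of a chunk contributes fresh input; later writers
// accumulate into what is already there.)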
+ ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; + localAccSize = ps->outIx + nelem; + } else { + // There is already data in there, accumulate instead of writing to it. + ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; + } + } + patBarrier(); int nSrcs = 2; void** srcs = ncclShmem.groups[group].srcs; - if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source + if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1000,59 +1015,92 @@ private: (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } - __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset; - int spins = 0; - while (connStepCache < step + recvStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { - // New data, copy to our output buffer. 
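// (Flow-control sketch for the wait loops in patCopy/patReduce above: the
// connection FIFO has NCCL_STEPS slots, so a sender may run at most
// NCCL_STEPS steps ahead of the receiver's head pointer, while a receiver
// waits for the producer's tail to pass the step it wants to read:
//   send: while (headCache + NCCL_STEPS < step + StepPerSlice) headCache = *headPtr;
//   recv: while (tailCache < step + StepPerSlice) tailCache = *tailPtr;
// "headCache"/"tailCache" are illustrative names for peer->stepCache; the
// cache avoids re-reading the volatile location once the condition holds.)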
- ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize; - } else { - ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done - } - if (postRecv) step += StepPerSlice; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { - int spins = 0; - while (connStepCache + NCCL_STEPS < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset; - if (postSend) { - if (flags & ConnFifoEnabled) - connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - step += StepPerSlice; - } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; } - if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer - ncclShmem.groups[group].srcs[0] = userInput + inpIx; - if (accSize < inpIx + nelem) { + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; + int spins = 0; + while (peer->stepCache < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, copy to our output buffer. - ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = inpIx + nelem; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; } else { ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done } } - barrier(); + if (send && (flags & RoleWaitSend)) { + int spins = 0; + while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; + } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + } + long long int localAccSize = shmem->localAccSize; + if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer + ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx; + if (localAccSize < ps->inpIx + nelem) { + // New data, copy to our output buffer. + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; + localAccSize = ps->inpIx + nelem; + } else { + // Already done + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; + } + } + patBarrier(); int nDsts = 2; void** dsts = ncclShmem.groups[group].dsts; - if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest + if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done. int workSize = ncclShmem.aborted ? 
0 : nelem;
@@ -1061,9 +1109,32 @@ private:
       (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, 1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
-    barrier();
-    if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
-    if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
+    // Store conn step here inside the two barriers to make sure next reload will see the update.
+    if (postSend && (flags & RolePostSend)) {
+      if (peer->connFifo) {
+        peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T);
+      }
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op
+    }
+
+    // Update accSize
+    if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize);
+    if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize);
+
+    patBarrier();
+
+    if (postSend && (flags & RolePostSend)) {
+      if (nelem > 0 || peer->connFifo) fence_acq_rel_sys();
+      st_relaxed_sys_global(peer->tailPtr, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      st_relaxed_sys_global(peer->headPtr, step);
+    }
   }
 };
diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h
index 70538b1..5d8de28 100644
--- a/src/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -80,29 +80,66 @@ struct RunWorkColl
 struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+#if __CUDA_ARCH__ >= 600
     using Proto = ProtoSimple<1, 1>;
     const int nranks = ncclShmem.comm.nRanks;
     const int rank = ncclShmem.comm.rank;
     size_t count, channelOffset, channelCount, chunkCount;
     ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
-    T *inputBuf = (T*)work->sendbuff;
-    T *outputBuf = (T*)work->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs);
+    static constexpr int nworkers = NCCL_PAT_NWORKERS;
+    struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
+    uint64_t pollCount = 0;
+    __syncthreads(); // Don't start using shared mem until everyone arrives
+    for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
+    if (tid == 0) shmem->localAccSize = 0;
+    if (tid == nworkers) shmem->parallelFactor = 0;
+    __syncthreads();
-    PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
-    int last = 0;
-    while (!last) {
-      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
-      size_t inpIx, outIx;
-      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
-      prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
+    if (tid == nworkers) { // Algo computation thread
+      PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
+      int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      int step = 0;
+      while (1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        patAlgo.getNextOp(ps);
+        int last = ps->last;
+        step++;
+        if (last == 2) break;
+      }
+    } else if (tid < nworkers) { // Worker threads
+      T *inputBuf = (T*)work->sendbuff;
+      T *outputBuf = (T*)work->recvbuff;
+      int parallelFactor = 0;
+      volatile int* pfPtr = &shmem->parallelFactor;
+      while (parallelFactor == 0) parallelFactor = *pfPtr;
+
+      int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE;
+      int group = tid / groupSize;
+      int nGroups = nworkers / groupSize;
+      int tidInGroup = tid - group*groupSize;
+      // We don't use recvPeers/sendPeers so let's pass shmem structs instead
+      Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+        (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs);
+
+      int step = group;
+      while(1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread
+        int last = ps->last;
+        prims.patReduce(ps, shmem);
+        if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread
+        if (last) break;
+        step += nGroups;
+      }
+    }
+#endif
   }
 };
-
 template
 struct RunWorkColl {
   __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h
index fe3b9ca..f36a511 100644
--- a/src/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -41,7 +41,7 @@ struct RunWorkBatch (maxSharedMem-attr.sharedSizeBytes)) {
-    if (print++ == 0)
-      INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
-        sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
-    // Reduce requested MaxDynamicSharedMemorySize attribute
-    sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
+    WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
+      cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
+    return ncclSystemError;
   }
   CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
@@ -388,6 +385,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     struct ncclTaskColl* next = aggBeg->next;
     aggBeg->algorithm = agg.algorithm;
     aggBeg->protocol = agg.protocol;
+    if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4;
     aggBeg->nMaxChannels = agg.nMaxChannels;
     aggBeg->nWarps = agg.nWarps;
     aggBeg->devFuncId = agg.devFuncId;
@@ -478,6 +476,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
   return ncclSuccess;
 }

+static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) {
+  int tmp = op->pattern;
+  op->pattern = ncclPatternProfiler;
+  ncclResult_t ret = addProxyOpIfNeeded(comm, plan, op);
+  op->pattern = tmp;
+  return ret;
+}
+
 static ncclResult_t scheduleCollTasksToPlan(
   struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget
 ) {
@@ -550,11 +556,16 @@ static ncclResult_t scheduleCollTasksToPlan(
       proxyOp.opCount = proxyOpId;
       proxyOp.task.coll = task;
       proxyOp.rank = comm->rank;
+      proxyOp.eActivationMask = task->eActivationMask;
+      proxyOp.workCounter = ++comm->profiler.workCounter[c];
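// (What the two profiler fields above enable, assuming a profiler plugin is
// attached: eActivationMask carries the event types the profiler enabled, and
// workCounter is a per-channel, monotonically increasing id shared between a
// kernel work item and its proxy op, so events from both sides can be joined
// on the (channel, workCounter) pair. Sketch of the invariant:
//   proxyOp.workCounter == comm->profiler.workCounter[c]  // right after ++
// how these values are consumed lives in the profiler code, not in this patch.)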
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4; size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); @@ -669,11 +680,14 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->ringAlgo->incRefCount(); } + proxyOp->eActivationMask = task->eActivationMask; + proxyOp->workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. // coverity[uninit_use_in_call:FALSE] NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp)); } } @@ -797,7 +811,8 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); for (int part = 0; part < nChannelsMax; part++) { @@ -888,6 +903,7 @@ static ncclResult_t addP2pToPlan( op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; + op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } @@ -898,7 +914,6 @@ static ncclResult_t addP2pToPlan( plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; @@ -935,9 +950,12 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } + comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1157,22 +1175,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; + cudaStream_t deviceStream; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); - // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the - // user's graph will be launched later, and it also acquires the deviceStream, - // it will observe this upload. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); + // Acquire deviceStream. 
Since the user's graph will be launched later and it also + // acquires the deviceStream, it will observe this upload. + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail); cudaEvent_t memcpyDone; CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail); NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; @@ -1180,7 +1199,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla cleanup->hostBuf = fifoBufHost; ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: @@ -1254,14 +1273,15 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { - comm->persistentRefs -= 1; + comm->sharedRes->persistentRefs -= 1; + comm->localPersistentRefs -= 1; if (plan->workStorageType == ncclDevWorkStorageTypePersistent) { cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -1317,6 +1337,28 @@ static void persistentDestructor(void* plans_) { } } +NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0); + +namespace { + enum ncclImplicitOrder { + ncclImplicitOrderNone, + ncclImplicitOrderSerial, + ncclImplicitOrderLaunch + }; +} + +static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { + if (ncclParamLaunchOrderImplicit()) { + // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs + if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } + if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + *mode = 12030 <= std::min(CUDART_VERSION, driver) ? 
ncclImplicitOrderLaunch : ncclImplicitOrderSerial; + return ncclSuccess; + } + *mode = ncclImplicitOrderNone; + return ncclSuccess; +} + ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1364,58 +1406,60 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (nPlans == 0) return ncclSuccess; - // Semantically we want these dependencies for the kernels launched: - // 1. Launch host task on hostStream. - // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} - // 3. {deviceStream, userStream[i]...} depend on kernel. - // We achieve this by: - // 1. userStream[0] waits on deviceStream - // 2. deviceStream waits on each of userStream[1...] - // 3. host task launch on hostStream - // 4. userStream[0] waits on hostStream - // 5. kernel launch on userStream[0] - // 6. deviceStream waits on userStream[0] - // 7. userStream[1...] each waits on deviceStream - // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires - // at least one of the two streams to be strong-stream. cudaStream_t launchStream = planner->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t deviceStream, launchOrder; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure); - // Create dependency for device stream on user streams. First from extra user - // streams to deviceStream. Then deviceStream to first user stream. + // userStream[0] waits on each userStream[i]... for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure); + CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); + // userStream[0] waits on deviceStream + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); + + if (implicitOrder != ncclImplicitOrderNone) { + // userStream[0] waits on per-device (context) launchOrder. Concurrent strong stream access is + // required if this is a graph capture, non-captured cannot be concurrent because that would violate + // deterministic program order of launches. + bool concurrent = capturing; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); + } + + if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. 
We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; + cudaStream_t hostStream; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure); } } if (persistent) { - comm->persistentRefs += nPlans; + comm->sharedRes->persistentRefs += nPlans; + comm->localPersistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - failure: return result; } @@ -1434,6 +1478,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; int nChannels = countOneBits(plan->channelMask); void* sym = plan->kernelFn; @@ -1447,18 +1492,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan CU_LAUNCH_PARAM_END }; - CUfunction fn; - CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); - - #if CUDART_VERSION >= 11080 int driverVersion; - NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); - if (driverVersion >= 11080) { + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return); + + CUfunction fn; + CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return); + + if (CUDART_VERSION >= 11080 && driverVersion >= 11080) { + #if CUDART_VERSION >= 11080 int compCap = comm->compCap; unsigned int clusterSize = (compCap >= 90) ? 
comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[3]; + CUlaunchAttribute launchAttrs[4] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1485,6 +1531,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif + #if CUDART_VERSION >= 12030 + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + if (implicitOrder == ncclImplicitOrderLaunch) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; + launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; + launchAttrs[attrs].value.launchCompletionEvent.flags = 0; + attrs++; + } + #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; launchConfig.gridDimZ = grid.z; @@ -1496,15 +1553,15 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); - CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); - return ncclSuccess; - } + CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif - // Standard kernel launch - CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); - //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); - return ncclSuccess; + } else { + // Standard kernel launch + CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return); + } + +do_return: + return ret; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { @@ -1524,34 +1581,39 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { - ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch - // Create dependency for deviceStream on launchStream. We know that deviceStream - // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), - // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); - resume1: - // Create dependency for other user streams (skip launch stream) on deviceStream. - // Again, the user streams haven't been touched since deviceStream waited on them - // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = planner->streams->next; - planner->streams = nullptr; // Reset comm->planner.streams to empty. 
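// (The event-based pattern used below replaces per-edge strong-stream waits:
//   CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream));
//   CUDACHECK(cudaStreamWaitEvent(otherStream, comm->sharedRes->scratchEvent, 0));
// one event record plus one wait per stream rebuilds the same dependency
// fan-out; sketch only, with the surrounding error handling elided.)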
- while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); - resume2: - sl = sl->next; + cudaStream_t deviceStream, launchOrder; + CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + // deviceStream waits on userStream[0] + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); + CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // Each userStream[i] waits on userStream[0] + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); } - // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); - resume3:; + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECK(getImplicitOrder(&implicitOrder, capturing)); + if (implicitOrder != ncclImplicitOrderNone) { + // As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured. + bool concurrent = capturing; + // Incorporate launch event into per-device (context) launch order. + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); + // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + // Release launchOrder as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); + } + // Release deviceStream as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false)); } - return result; + return ncclSuccess; } /*****************************************************************************/ @@ -1655,11 +1717,11 @@ static ncclResult_t topoGetAlgoInfo( if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { char ncclAlgoEnvStr[1024] = ""; char ncclProtoEnvStr[1024] = ""; - char* algoEnv = getenv("NCCL_ALGO"); + const char* algoEnv = ncclGetEnv("NCCL_ALGO"); if (algoEnv) { snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - char* protoEnv = getenv("NCCL_PROTO"); + const char* protoEnv = ncclGetEnv("NCCL_PROTO"); if (protoEnv) { snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); } @@ -2007,7 +2069,7 @@ static ncclResult_t hostToDevRedOp( uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); bool datatype_signed = false; - + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2097,6 +2159,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { p2p->datatype = info->datatype; p2p->root = info->root; p2p->bytes = nBytes; + p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); ncclIntruQueueEnqueue( isSendNotRecv ? 
&planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); @@ -2105,6 +2168,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Mark channels that need pre-connect if (comm->rank != peer) { if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; int round = 0; while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank @@ -2115,12 +2179,17 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. + comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; comm->connectRecv[peer] |= (1UL<opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 64fc1c5..76b508c 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -390,7 +390,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { + if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 587a8b2..ace4476 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -376,9 +376,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; +const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { - *useGdr = 0; +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { + *gdrMode = ncclTopoGdrModeDisable; // Get GPU and NET int n, g; @@ -418,25 +421,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead - int proxyRank, g; + int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, 
gpu->gpu.rank, netId, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); - struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; - distance = proxyGpu->paths[NET][n].type; + gpu = system->nodes[GPU].nodes+g; + distance = gpu->paths[NET][n].type; } + + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + distance = PATH_C2C; + } + if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } - *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + // Force PCIe mapping if path goes through PCI on a C2C system + if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; + else *gdrMode = ncclTopoGdrModeDefault; + + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { int netNum = system->nodes[NET].count; - int useGdr = 0; + enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable; *avail = false; for (int n = 0; n < netNum; n++) { int64_t netId = system->nodes[NET].nodes[n].id; @@ -469,6 +484,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; + // On C2C platforms, data could go through a PCI switch while completions and + // flags would go through C2C. In that case, force a flush. + int c, n; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { + *flush = 1; + } return ncclSuccess; } @@ -538,7 +561,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (comm && ncclNetVersion(comm) == 4) { + if (comm && comm->ncclNetVer == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -561,9 +584,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; - int useGdr; + enum ncclTopoGdrMode useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); - if (useGdr == 0) continue; + if (useGdr == ncclTopoGdrModeDisable) continue; int found = 0; for (int r=0; rpaths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. - int gdr; + enum ncclTopoGdrMode gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU @@ -862,3 +885,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink *allNvLink = maxPath >= PATH_PIX ? 
0 : 1;
   return ncclSuccess;
 }
+
+// Check whether we are in a split NVLink situation, with two NVLink domains, not
+// connected through NVLink (e.g. QPI).
+ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) {
+  ncclResult_t res = ncclSuccess;
+  int nvlDomains = 0;
+  int *nvlDomain = NULL, *nvlDomainCount = NULL;
+  // Compute NVLink domains
+  NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit);
+  for (int g=0; g<system->nodes[GPU].count; g++) nvlDomain[g] = g;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    int domain = nvlDomain[g];
+    for (int p=g+1; p<system->nodes[GPU].count; p++) {
+      if (gpu->paths[GPU][p].type == PATH_NVL) {
+        nvlDomain[p] = domain;
+      }
+    }
+  }
+  // Compute number of GPUs per NVLink domain.
+  NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit);
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    nvlDomainCount[nvlDomain[g]]++;
+  }
+  // Count the number of NVLink domains
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (nvlDomainCount[g] > 1) nvlDomains++;
+  }
+  *splitNvLink = nvlDomains == 2 ? 1 : 0;
+
+exit:
+  if(nvlDomain) free(nvlDomain);
+  if(nvlDomainCount) free(nvlDomainCount);
+  return res;
+}
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 0185b3f..15a0124 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -446,12 +446,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 // 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
 // 2. add other NETs satisfying typeInter but not already in the list.
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
   ncclResult_t ret = ncclSuccess;
   int netCount = 0;
   int localNetCount;
-  int* localNets;
-  NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));
+  int localNets[MAXCHANNELS];

   // First add the preferred NICs
   for (int g=0; g<system->nodes[GPU].count; g++) {
@@ -460,8 +459,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
     for (int c = 0; c<MAXCHANNELS; c++) {
       int64_t netId;
-      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
-      NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
+      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
       if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
       localNetCount++;
     }
@@ -469,7 +468,7 @@
 for (int i=0; iintra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
   int g = gpu - system->nodes[GPU].nodes;
-  int* nets = NULL;
+  int nets[NCCL_TOPO_MAX_NODES];
   if (step == backToNet) {
     // first get back to NIC
     if (system->nodes[NET].count) {
       int startNetIndex;
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
       int netCount;
-      NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-      NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
       for (int i=0;
inodes[NET].nodes+n; @@ -555,14 +548,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->bwInter /= 2; } - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } @@ -601,21 +594,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } -exit: - if (nets) free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - ncclResult_t ret = ncclSuccess; const int bw = graph->bwInter; - int* nets; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + int nets[NCCL_TOPO_MAX_NODES]; int netCount; int graphFound = 0; - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; @@ -639,7 +626,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo // NVLS search only tries to find NIC:GPU combinations to compute the heads. 
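// (Context for the head selection below: an NVLS "head" is the GPU fronting
// one NIC; when a single GPU connects to several NICs, the duplicate check
// keeps that GPU from being recorded as a head more than once.)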
if (graph->nChannels < netCount) { int gpu; - NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail); + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -650,7 +637,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } if (!duplicate) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); graphFound = 1; } } @@ -659,14 +646,14 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels > 0) { // Try to replay the last channel int g; - NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail); - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } @@ -686,7 +673,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } @@ -700,11 +687,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } -exit: - free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } /* Search Patterns @@ -999,6 +982,15 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->minChannels = graph->maxChannels; } + int splitNvLink; + NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink)); + if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) { + // We have two sockets with NVLink and a slower link in between (typically QPI). + // Tree is likely going to work better but it needs at least 2 channels. + // Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels. 
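// (Example: two CPU sockets, each an NVLink island, joined only by QPI/UPI.
// ncclTopoSplitNvLink then reports exactly two NVLink domains, and the line
// below raises Ring's minChannels from 1 to 2 so that Tree, which must use
// the same channel count as Ring, still gets the two channels it requires.)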
+ if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2; + } + struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index ba82caf..9499f39 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -22,8 +22,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -45,7 +45,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) return ncclSuccess; } -static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { +static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) { *cpu = NULL; if (node->type == CPU) { *cpu = node; @@ -54,9 +54,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* for (int l=0; lnlinks; l++) { // Go up the PCI tree to find the CPU. Follow only PCI switches. if (node->links[l].type == LINK_PCI + && node->links[l].remNode != from && (node->links[l].remNode->type == PCI || node->links[l].remNode->type == CPU)) { - NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node)); } if (*cpu != NULL) return ncclSuccess; } @@ -77,13 +78,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; + *bw = + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW : + BDW_QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { *bw = AMD_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_MODEL_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + cpu->cpu.model = + (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP : + (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP : + (familyId == 6 && modelId >= 0x55) ? 
@@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
     int familyId, modelId;
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
-    cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+    cpu->cpu.model =
+      (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP :
+      (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP :
+      (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_MODEL_INTEL_SKL :
+      NCCL_TOPO_CPU_MODEL_INTEL_BDW;
   } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
     int familyId, modelId;
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
     NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
-    if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
+    if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG;
   }
 }
 for (int s=0; s<xmlCpu->nSubs; s++) {
@@ -565,7 +574,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
       NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
     } else if (targetType == CPU) {
       // NVL connection to the local CPU
-      NCCLCHECK(findLocalCpu(gpu, &remote));
+      NCCLCHECK(findLocalCpu(gpu, &remote, NULL));
     } else {
       if (system->nodes[NVS].count == 0) {
         NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
@@ -642,10 +651,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
     NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
     double c2cBw = (bw*count)/1000.0;
     struct ncclTopoNode* cpu = NULL;
-    NCCLCHECK(findLocalCpu(gpu, &cpu));
+    NCCLCHECK(findLocalCpu(gpu, &cpu, NULL));
     if (cpu == NULL) return ncclSuccess;
-    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
-    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw));
   } else {
     if (strcmp(node->name, "cpu") == 0) {
       NCCLCHECK(ncclGetSystemId(system, node, &systemId));
@@ -961,26 +970,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*
   // Trigger the merge, then get the new device's properties
   int vDevIndex = 0;
   ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
-  if (ret == ncclInvalidUsage) {
-    WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC");
-    NCCLCHECK(ret);
+  if (ret != ncclSuccess) {
+    INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
+      vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
+    return ret;
   }

   INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
   return ncclSuccess;
 }
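The `ncclTopoForceMerge` rework that follows also fixes a subtle bug: the old signature took a mutable `char* str` and ran `strtok_r` directly on it, but that string comes from the `NCCL_NET_FORCE_MERGE` environment variable, and `strtok_r` writes NUL terminators into whatever buffer it scans. The new code therefore tokenizes a heap copy (`ncStr`, built with `ncclCalloc`/`strcpy`) and frees it on exit. A minimal illustration of the same pattern, with a hypothetical helper name:

```
// Tokenizing a private copy leaves the original string (e.g. the process
// environment) untouched; strtok_r mutates the buffer it scans.
#include <stdlib.h>
#include <string.h>

static void parseGroups(const char* env) {
  char* copy = strdup(env);            // writable private copy
  if (copy == NULL) return;
  char* state;
  for (char* tok = strtok_r(copy, ";", &state); tok != NULL;
       tok = strtok_r(NULL, ";", &state)) {
    // handle one comma-delimited NIC group here, e.g. "eth0,eth1"
  }
  free(copy);
}
```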
-ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  ncclResult_t ret = ncclSuccess;
   INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
+  char* ncStr;
+  NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1));
+  strcpy(ncStr, str);
   char* semi_token;
-  char* semi = strtok_r(str, ";", &semi_token);
+  char* semi = strtok_r(ncStr, ";", &semi_token);
   while (semi) {
     TRACE(NCCL_NET, "Fusing %s", semi);
     struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
     int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
     if (nUserIfs == 0) {
       INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.",
-        str, semi);
+        ncStr, semi);
       continue;
     }

@@ -994,26 +1008,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str,
     if (vProps.ndevs != nUserIfs) {
       WARN("TOPO/NET : Only matched %d devices, %d requested from %s",
         vProps.ndevs, nUserIfs, semi);
-      return ncclInvalidUsage;
+      ret = ncclInvalidUsage;
+      goto fail;
     }

     if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
       WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
-      return ncclInvalidUsage;
+      ret = ncclInvalidUsage;
+      goto fail;
     }

     struct ncclXmlNode* netNode;
-    NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice));
-
-    // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
-    for (int i = 0; i < vProps.ndevs; i++) {
-      placedDevs[vProps.devs[i]] = 1;
+    ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
+    if (ret == ncclSuccess) {
+      // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
+      for (int i = 0; i < vProps.ndevs; i++) {
+        placedDevs[vProps.devs[i]] = 1;
+      }
+    } else {
+      WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi);
+      ret = ncclInvalidUsage;
+      goto fail;
     }

     semi = strtok_r(NULL, ";", &semi_token);;
   }

-  return ncclSuccess;
+exit:
+  free(ncStr);
+  return ret;
+fail:
+  goto exit;
 }

 ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
@@ -1061,7 +1086,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe
     }

     struct ncclXmlNode* netNode;
-    NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out);
+    ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
+
+    // Merging failed.
+ // Mark all as unplaced and increase their distance to disconnected (PATH_DIS) + // Set i to 0 to restart the automatic merging process and ensure all are placed + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search."); + placedDevs[i] = 0; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i); + for (int k = 1; k < vProps.ndevs; k++) { + int dev = vProps.devs[k]; + placedDevs[dev] = 0; + paths[i*nPhysDevs + dev] = PATH_DIS; + paths[dev*nPhysDevs + i] = PATH_DIS; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i); + } + i = 0; + } } } @@ -1125,16 +1167,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ // By default, don't merge any devices int mergeLevel; mergeLevel = PATH_PORT; - char* mergeLevelEnv; - mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge; - forceMerge = getenv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); + { // Avoids warnings related to jumping to "out" + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } } NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); diff --git a/src/graph/topo.h b/src/graph/topo.h index 2be029b..921a7f5 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,9 +18,11 @@ #define SM86_NVLINK_BW 12.0 #define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 -#define QPI_BW 6.0 #define AMD_BW 16.0 +#define BDW_QPI_BW 6.0 #define SKL_QPI_BW 10.0 +#define SRP_QPI_BW 22.0 +#define ERP_QPI_BW 40.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 @@ -44,12 +46,13 @@ extern const char* topoNodeTypeStr[]; #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB -#define LINK_PCI 3 -// Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PXN -// Skipping 6 for PATH_PHB -#define LINK_SYS 7 -#define LINK_NET 8 +#define LINK_C2C 3 +#define LINK_PCI 4 +// Skipping 5 for PATH_PXB +// Skipping 6 for PATH_PXN +// Skipping 7 for PATH_PHB +#define LINK_SYS 8 +#define LINK_NET 9 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -61,29 +64,32 @@ extern const char* topoLinkTypeStr[]; // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 +// Connection through C2C +#define PATH_C2C 3 + // Connection traversing at most a single PCIe bridge -#define PATH_PIX 3 +#define PATH_PIX 4 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) -#define PATH_PXB 4 +#define PATH_PXB 5 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 
-#define PATH_PXN 5 +#define PATH_PXN 6 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 6 +#define PATH_PHB 7 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 7 +#define PATH_SYS 8 // Connection through the network -#define PATH_NET 8 +#define PATH_NET 9 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 9 +#define PATH_DIS 10 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -103,9 +109,6 @@ struct ncclTopoLinkList { int type; }; -#define NCCL_TOPO_CPU_INTEL_BDW 1 -#define NCCL_TOPO_CPU_INTEL_SKL 2 - #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff @@ -176,6 +179,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8da4aeb..68085b8 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -177,6 +177,7 @@ static const double perChMaxTreeBws[][3] = { NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); static int ncclPatEnable(struct ncclComm* comm) { int patEnable = ncclParamPatEnable(); + if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics if (patEnable != 2) return patEnable; if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload @@ -257,7 +258,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
-        if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
+        if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
         if (a == NCCL_ALGO_PAT) busBw *= .75;
         if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
diff --git a/src/group.cc b/src/group.cc
index e387db7..c48c0de 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -193,7 +193,6 @@ fail:

 static ncclResult_t doLaunches(struct ncclComm* head) {
   ncclResult_t result = ncclSuccess;
-  struct ncclComm* cliqueComm0 = head->intraComm0;
   struct ncclComm* cliqueHead = head;
   struct ncclComm* cliqueNextHead;
   bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
@@ -209,7 +208,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
       NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
       if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
       comm = comm->groupNext;
-    } while (comm != nullptr && comm->intraComm0 == cliqueComm0);
+    } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
     cliqueNextHead = comm;

     if (capturingYes && capturingNo) {
@@ -424,38 +423,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf

     /* Connect channels at runtime if cumem is supported */
     if (groupCommHeadMain != nullptr) {
-      struct ncclComm* comm = groupCommHeadMain;
+      struct ncclComm* cliqueHead = groupCommHeadMain;
+      struct ncclComm* comm = NULL;
       struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
       ncclIntruQueueConstruct(&asyncCollJobs);
       do {
-        bool needConnect = false;
-        bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
-        memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+        // We need to preconnect connections for collectives clique by clique to avoid
+        // race condition for split shared comms which can connect the same connections
+        // at the same time.
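The comment above is the heart of this group.cc change: preconnect jobs used to be queued for every comm in the group at once, so two comms from different cliques that share transport resources (split shared comms) could race while connecting the same connections. The new code walks the group one clique at a time, launching and draining one clique's jobs before starting the next. A stripped-down sketch of that traversal, assuming comms of a clique (same `intraComm0`) sit adjacent in the list; the names mirror the diff but the helper is hypothetical:

```
// Hypothetical, simplified clique-by-clique walk over the group list.
struct Comm { Comm* groupNext; Comm* intraComm0; };

void forEachClique(Comm* head, void (*process)(Comm*)) {
  Comm* cliqueHead = head;
  while (cliqueHead != nullptr) {
    Comm* comm = cliqueHead;
    do {                         // queue work for one clique
      process(comm);
      comm = comm->groupNext;
    } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
    // Launch and drain this clique's jobs here, before touching the next
    // clique, so shared connections are never set up concurrently.
    cliqueHead = comm;           // first comm of the next clique (or null)
  }
}
```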
+        comm = cliqueHead;
+        do {
+          bool needConnect = false;
+          bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
+          memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);

-        CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-        NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
+          CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
+          NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);

-        if (comm->cuMemSupport && needConnect) {
-          struct ncclPreconnectJob* job;
-          NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
-          job->base.func = ncclCollPreconnectFunc;
-          job->base.undo = nullptr;
-          job->base.destructor = free;
-          job->base.state = ncclGroupJobRunning;
-          job->base.abortFlag = comm->abortFlag;
-          job->base.abortFlagDev = comm->abortFlagDev;
-          job->comm = comm;
-          NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
-          memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
-          ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
+          if (comm->cuMemSupport && needConnect) {
+            struct ncclPreconnectJob* job;
+            NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
+            job->base.func = ncclCollPreconnectFunc;
+            job->base.undo = nullptr;
+            job->base.destructor = free;
+            job->base.state = ncclGroupJobRunning;
+            job->base.abortFlag = comm->abortFlag;
+            job->base.abortFlagDev = comm->abortFlagDev;
+            job->comm = comm;
+            NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
+            memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+            ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
+          }
+          comm = comm->groupNext;
+        } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
+        // connect
+        NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
+        while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
+          struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
+          if (job->destructor) job->destructor((void*)job);
         }
-        comm = comm->groupNext;
-      } while (comm);
-      NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
-      while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
-        struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
-        if (job->destructor) job->destructor((void*)job);
-      }
+        cliqueHead = comm;
+      } while (cliqueHead != nullptr);

       // done with all buffer allocation, start registration and enqueue
       comm = groupCommHeadMain;
diff --git a/src/include/bitops.h b/src/include/bitops.h
index a650aa7..dcf0e2e 100644
--- a/src/include/bitops.h
+++ b/src/include/bitops.h
@@ -8,6 +8,7 @@
 #define NCCL_BITOPS_H_

 #include <stdint.h>
+#include <string.h>

 #if !__NVCC__
 #ifndef __host__
@@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
   return u32fpDecode(x, 3);
 }

-inline __host__ __device__ uint64_t getHash(const char* string, int n) {
-  // Based on DJB2a, result = result * 33 ^ char
-  uint64_t result = 5381;
-  for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) ^ string[c];
+// The hash isn't just a function of the bytes but also where the bytes are split
+// into different calls to eatHash().
+inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
+  char const* ptr = (char const*)bytes;
+  acc[0] ^= size;
+  while (size != 0) {
+    // Mix the accumulator bits.
+    acc[0] += acc[1];
+    acc[1] ^= acc[0];
+    acc[0] ^= acc[0] >> 31;
+    acc[0] *= 0x9de62bbc8cef3ce3;
+    acc[1] ^= acc[1] >> 32;
+    acc[1] *= 0x485cd6311b599e79;
+    // Read in a chunk of input.
+    size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t);
+    uint64_t x = 0;
+    memcpy(&x, ptr, chunkSize);
+    ptr += chunkSize;
+    size -= chunkSize;
+    // Add to accumulator.
+    acc[0] += x;
   }
-  return result;
+}
+
+template<typename T>
+inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
+  eatHash(acc, (const void*)bytes, sizeof(T));
+}
+
+inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
+  uint64_t h = acc[0];
+  h ^= h >> 31;
+  h *= 0xbac3bd562846de6b;
+  h += acc[1];
+  h ^= h >> 32;
+  h *= 0x995a187a14e7b445;
+  return h;
+}
+
+inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
+  uint64_t acc[2] = {1, 1};
+  eatHash(acc, bytes, size);
+  return digestHash(acc);
+}
+template<typename T>
+inline __host__ __device__ uint64_t getHash(const T* bytes) {
+  return getHash((const void*)bytes, sizeof(T));
 }

 #endif
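The new bitops.h hashing above replaces the old DJB2a string hash with a streaming accumulate/digest pair. Because each `eatHash` call folds the chunk size into the accumulator (`acc[0] ^= size`), hashing "ab" and then "c" yields a different digest than hashing "abc" in one call, exactly as the comment warns. A hypothetical caller-side sketch (not part of the diff) that streams several fields and then digests; `PeerKey` and `hashPeerKey` are made up for illustration:

```
// Hypothetical usage sketch of the eatHash/digestHash API defined above.
#include <string.h>

struct PeerKey { uint64_t busId; int rank; };

uint64_t hashPeerKey(const struct PeerKey* key, const char* hostname) {
  uint64_t acc[2] = {1, 1};                  // same seed getHash() uses
  eatHash(acc, &key->busId);                 // templated overload, sizeof(T) bytes
  eatHash(acc, &key->rank);
  eatHash(acc, hostname, strlen(hostname));  // raw byte-range overload
  return digestHash(acc);                    // final 64-bit mix
}
```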
diff --git a/src/include/collectives.h b/src/include/collectives.h
index c82ebce..c68b041 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -10,6 +10,7 @@
 #include "nccl.h"
 #include "nccl_common.h"
 #include "device.h"
+
 #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 // CHUNKSIZE must be a multiple of SLICESIZE
@@ -382,6 +383,42 @@ public:
   ~RingBCAlgorithm() {}
 };

+#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#include <cuda/atomic>
+#endif
+
+// Need a power of two to ensure it divides by parallelFactor (which is also a power of two)
+#define NCCL_PAT_NWORKERS 512
+
+static constexpr int PatUsed = 0x1,
+                     PatSkipped = 0x2;
+
+struct ncclPatStep {
+  int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags;
+  size_t inpIx, outIx;
+};
+
+struct ncclPatPeer {
+  uint64_t step;
+  struct ncclConnInfo* conn;
+  struct ncclConnFifo* connFifo;
+  void* buff;
+  uint64_t *headPtr;
+  uint64_t *tailPtr;
+  uint64_t stepCache;
+  long long int accSize;
+  int connStepSize;
+};
+
+#define NCCL_SHMEM_PAT_STEPS 32
+struct ncclPatShmem {
+  struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS];
+  int parallelFactor;
+  long long int localAccSize;
+  struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks
+  struct ncclPatPeer recvDims[32];
+};
+
 template<typename T>
 class PatRSAlgorithm{
   size_t offset;
@@ -394,18 +431,17 @@ class PatRSAlgorithm{
   int nrPow2;
   int postFreq;
   int lastA;
-
+  int parallelFactor;
   int aggFactor;
   int as; // aggregated steps
   int a; // step inside aggregated step
   int sendSkipped; // number of skipped steps during aggregation
-  int recvSkipped; // number of skipped steps during aggregation
-  int phase2recv; // receive offset for phase 2
+  int stepOffset;
   int aggDelta;
   int scale;
   int phase;

-  __device__ __host__ int min(int a, int b) {
+  __device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
     return (a<b) ? a : b;
   }
@@ ... @@ class PatRSAlgorithm{
   __device__ __host__ void resetA() {
     a = 0;
-    sendSkipped = recvSkipped = 0;
+    sendSkipped = 0;
     lastA = aggFactor;
     if (phase >= 2) lastA /= 2*scale;
+    if (phase == 4) lastA = 1;
   }

   __device__ __host__ void reset() {
     nelem = getNelem();
     phase = 0;
     scale = 1;
-    phase2recv = 0;
     as = aggDelta - 1;
     resetA();
   }
@@ -465,8 +501,9 @@ class PatRSAlgorithm{
   }

 public:
-  __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
+  __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
     offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
+    parallelFactor = maxParallelFactor;
     aggDelta = nrPow2 = (1<<log2Up(nranks));
@@ ... @@
     while (d > 1 && aggFactor < nranks/2) {
       d /= 2;
@@ -486,160
+524,151 @@ public: reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - last = 0; - nelemOut = nelem; - outIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->outIx = offset; + ps->stepOffset = stepOffset; int skip = 0; - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; int sendDataRank = (rank + s) % nranks; - inpIx = sendDataRank * count + offset; - recvDim = -1; - sendDim = 0; - outIx = 0; - recvOffset = -1; - sendOffset = ((a - sendSkipped)%postFreq) * nelem; - sendStepOffset = 0; - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postSend = 1; + ps->inpIx = sendDataRank * count + offset; + ps->recvDim = -1; + ps->sendDim = 0; + ps->outIx = 0; + ps->recvOffset = -1; + ps->sendOffset = (a%postFreq) * nelem; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postSend = 1; } else { - postSend = 0; + ps->postSend = 0; } - postRecv = 0; - if (skip) sendSkipped++; - if (++a == lastA) { - phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2 - resetA(); - } - if (skip == 0) return; + ps->postRecv = 0; } else if (phase == 1) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - sendOffset = ((a - sendSkipped)%postFreq)*nelem; - recvOffset = ((a - recvSkipped)%postFreq)*nelem; - postSend = 0; - if (recvDim == 0) { - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1; - sendStepOffset = 0; + ps->recvDim = firstBitSet(s, nrPow2); + ps->sendOffset = (a%postFreq)*nelem; + ps->recvOffset = (a%postFreq)*nelem; + ps->postSend = 0; + if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postRecv = 1; } else { - sendStepOffset = (a - sendSkipped)/postFreq; + ps->postRecv = 0; } - if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postRecv = 1; - } else { - postRecv = 0; - } - s -= (1<recvDim); int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - if (sendDim == -1) { - sendOffset = -1; - sendStepOffset = 0; - } else if (as - (1<inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->sendDim == -1) { + ps->sendOffset = -1; + } else if (as - (1<recvDim) == 0) { + if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } int foffset = a - sendSkipped; - sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq; - sendOffset = (foffset%postFreq)*nelem; + ps->sendOffset = (foffset%postFreq)*nelem; } + int recvDim = ps->recvDim; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - as--; - phase = as % 2 == 1 ? 
0 : 1; - resetA(); - } - if (skip == 0) return; + if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++; } else if (phase == 2) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1; - postRecv = 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - recvDim = 0; - postSend = a == lastA-1 ? 1 : 0; + ps->recvDim = 0; + ps->postSend = a == lastA-1 ? 1 : 0; s -= 1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; + ps->recvDim = -1; + ps->recvOffset = -1; skip = 0; } else if (!skip) { - int foffset = phase2recv; - phase2recv++; - postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - recvOffset = (foffset%postFreq) * nelem; + int foffset = a + aggFactor - aggFactor/scale; + ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; } int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - int foffset = a - sendSkipped; - postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - sendStepOffset = 0; - sendOffset = (foffset%postFreq) * nelem; - if (skip || sendDim == -1) sendSkipped++; - if (++a == lastA) { - phase = 3; - resetA(); - } - if (skip == 0) return; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + int foffset = a; + ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->sendOffset = (foffset%postFreq) * nelem; } else if (phase == 3) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta; - postRecv = a == lastA-1 ? 1 : 0; + ps->postRecv = a == lastA-1 ? 1 : 0; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - postSend = 0; - s -= (1<recvDim = firstBitSet(s, nrPow2); + ps->postSend = 0; + s -= (1<recvDim); + int foffset = a; + ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a; + if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } foffset = a - sendSkipped; - sendStepOffset = foffset / postFreq; // Accumulate on next steps - sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1; - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - scale *= 2; - phase = scale < aggFactor ? 2 : 4; + if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++; + ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1; + } else if (phase == 4) { + ps->recvDim = 0; + ps->sendDim = -1; + ps->inpIx = rank * count + offset; + ps->recvOffset = ((aggFactor-1)%postFreq) * nelem; + ps->sendOffset = -1; + ps->postRecv = 1; + ps->postSend = 0; + offset += chunkCount; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 1) as--; + if (p == 3) scale *= 2; + phase = + p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + p == 2 ? 3 : + p == 3 ? scale < aggFactor ? 
2 : 4 : + 5; + if (p == 4) { + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; - } else if (phase == 4) { - recvDim = 0; - sendDim = -1; - inpIx = rank * count + offset; - recvOffset = (phase2recv%postFreq) * nelem; - sendStepOffset = 0; - sendOffset = -1; - postRecv = 1; - postSend = 0; - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; + } else if (phase == 4 && offset >= end) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; @@ -655,14 +684,12 @@ class PatAGAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int aggDelta; - int scale; - int phase; // AS computation @@ -671,7 +698,7 @@ class PatAGAlgorithm{ int bitCount[32]; int bitZeroStep[32]; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a 1 && aggFactor < nranks/2) { d /= 2; aggFactor *= 2; aggDelta /= 2; } - //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta); asDim = log2Up(aggDelta); reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - last = 0; - nelemOut = nelem; - inpIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->inpIx = offset; int skip = 0; - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0; int recvDataRank = (rank + s) % nranks; - outIx = recvDataRank * count + offset; - sendDim = -1; - recvDim = 0; - inpIx = 0; - sendOffset = -1; - recvOffset = (a % postFreq) * nelem; - recvStepOffset = 0; - postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postSend = 0; - a++; - if (nextSkip) { - as = nextAs(); - if (as == aggDelta/2) { - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; - } - phase = 1; - resetA(); - } - if (skip == 0) return; + ps->outIx = recvDataRank * count + offset; + ps->sendDim = -1; + ps->recvDim = 0; + ps->inpIx = 0; + ps->sendOffset = -1; + ps->recvOffset = (a % postFreq) * nelem; + ps->stepOffset = 0; + ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postSend = 0; } else if (phase == 1) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - sendOffset = recvOffset = (a % postFreq) * nelem; - postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 
0 : a/postFreq; - if (recvDim == -1) { - recvOffset = -1; - postRecv = 0; - } else if (as - (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<= nranks) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem; + ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq; + if (ps->recvDim == -1) { + ps->recvOffset = -1; + ps->postRecv = 0; + } else if (as - (1<sendDim) == 0) { + int foffset = (a*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<recvDim) >= nranks) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq; } - if (s < nranks && sendDim == 0 && skip) { + if (s < nranks && ps->sendDim == 0 && skip) { // Don't forget to receive at least once even if we don't send afterwards - sendDim = -1; - sendOffset = -1; - postSend = 0; + ps->sendDim = -1; + ps->sendOffset = -1; + ps->postSend = 0; skip = 0; } - if (++a == lastA) { - if (as % 2 == 1) { - phase = 0; - } else { - as = nextAs(); - } - resetA(); - } - if (skip == 0) return; } else if (phase == 2) { int s = (2*a+1)*scale*aggDelta; - postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; - postRecv = 0; + ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); + ps->sendOffset = (a%postFreq) * nelem; + ps->stepOffset = a / postFreq; int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - if (recvDim == -1) { - recvOffset = -1; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->recvDim == -1) { + ps->recvOffset = -1; } else { - s -= (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - recvStepOffset = foffset / postFreq; + s -= (1<recvDim); + int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->stepOffset = foffset / postFreq; } - if (++a == lastA) { - scale /= 2; - phase = scale ? 2 : 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 2) scale /= 2; + phase = + p == 2 ? scale ? 2 : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + 1; + if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs(); + if (p == 0 && as == aggDelta/2) { + offset += chunkCount; + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; + } else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? 
PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; #endif diff --git a/src/include/comm.h b/src/include/comm.h index c3f4eb4..4095187 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -131,6 +131,9 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int persistentRefs; + cudaEvent_t launchEvent, scratchEvent; /* proxy related shared res */ struct ncclProxyState* proxyState; @@ -407,6 +410,7 @@ struct ncclComm { // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; + struct ncclCudaContext* context; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; @@ -419,6 +423,7 @@ struct ncclComm { int netPluginLoaded; ncclNet_t* ncclNet; + int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; @@ -426,6 +431,7 @@ struct ncclComm { uint64_t* connectSend; uint64_t* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported bool directMode; @@ -565,8 +571,7 @@ struct ncclComm { struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int localPersistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -603,6 +608,7 @@ struct ncclComm { // Profiler plugin void* profilerContext; uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; + struct ncclProfilerProxy profiler; // buffer registration cache struct ncclRegCache regCache; diff --git a/src/include/device.h b/src/include/device.h index 3f918ab..0763a57 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -133,6 +133,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; + int hasSeen; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -374,6 +375,7 @@ struct alignas(16) ncclDevChannel { struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed + uint64_t workCounter; }; struct ncclDevComm { @@ -396,6 +398,10 @@ struct ncclDevComm { // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; int* rankToLocalRank; + + // Profiler counters + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -468,7 +474,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; + return cudaArch >= 800 ? (cudaArch == 1200 ? 
6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } diff --git a/src/include/graph.h b/src/include/graph.h index a22b62b..b779773 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -36,7 +36,13 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +enum ncclTopoGdrMode { + ncclTopoGdrModeDisable = 0, + ncclTopoGdrModeDefault = 1, + ncclTopoGdrModePci = 2, + ncclTopoGdrModeNum = 3 +}; +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); @@ -55,9 +61,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_VENDOR_MIXED 4 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2 +#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3 +#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4 +#define NCCL_TOPO_CPU_MODEL_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); diff --git a/src/include/group.h b/src/include/group.h index 91bc190..c06d1ef 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -112,6 +112,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; + + // didn't find its clique, we need to insert it with ascending order based on commHash + if (*pp == nullptr) { + pp = &ncclGroupCommHead; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + } comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h deleted file mode 100644 index f165aa1..0000000 --- a/src/include/nccl_net.h +++ /dev/null @@ -1,604 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 -//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties -#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) -#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -// Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 - -typedef struct { - int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; -} ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int forceFlush; // Force a flush on receives - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - ncclNetVDeviceProps_v9_t vProps; - size_t maxP2pBytes; // Max transfer size for point-to-point operations - size_t maxCollBytes; // Max transfer size for collective operations -} ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclNet_v9_t; - -typedef ncclNet_v9_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 - -typedef struct { - void* mhandle; - void* address; - size_t size; -} ncclNetSGE_v9_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclCollNet_v9_t; - -typedef ncclCollNet_v9_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v8_t; - -typedef struct { - void* mhandle; - void* address; - uint32_t size; -} ncclNetSGE_v8_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. 
handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v8_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. 
rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h deleted file mode 100644 index a8164d0..0000000 --- a/src/include/nccl_profiler.h +++ /dev/null @@ -1,235 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include - -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - const char* func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - const char* datatype; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - const char* algo; - const char* proto; - } coll; - - struct { - const char* name; - uint64_t commHash; - const char* func; - void* buff; - const char* datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v2_t; - -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - 
// - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; - -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - uint8_t func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - uint8_t datatype; - uint32_t op; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; - } coll; - - struct { - const char* name; - uint64_t commHash; - uint8_t func; - void* buff; - uint8_t datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v1_t; - -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - // - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v1_t; - -#endif diff --git a/src/include/nccl_tuner.h 
b/src/include/nccl_tuner.h deleted file mode 100644 index 6e61118..0000000 --- a/src/include/nccl_tuner.h +++ /dev/null @@ -1,149 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - regBuff: can register user buffer - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int regBuff, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; - -typedef ncclTuner_v4_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
- // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this time - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the give collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int* algorithm, int* protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. 
- // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; - -#endif diff --git a/src/include/net.h b/src/include/net.h index d1926cc..afc2d16 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index 5fae9b5..c3a79e3 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 5d00f07..2c18b36 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,9 +31,10 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h new file mode 100644 index 0000000..d57aad5 --- /dev/null +++ b/src/include/plugin/nccl_net.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 + +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
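+// Illustrative note: NCCL core falls back to caps such as MAX_NET_SIZE above
+// and MAX_COLLNET_SIZE below for plugins that predate dynamically queried
+// limits; a v10 plugin reports its own limits from getProperties(), e.g.
+// (hypothetical values):
+//   props->maxP2pBytes  = 1ULL << 30;  // 1 GiB
+//   props->maxCollBytes = 1ULL << 29;  // 512 MiB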
+#define MAX_COLLNET_SIZE (512*1024*1024L) // Set for initial collnet plugins when size was not dynamically queried
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 32
+
+// Max number of ncclNet objects which can live in the same process
+#define NCCL_NET_MAX_PLUGINS 3
+
+// NCCL core profiler callback for network defined events instrumentation
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
+
+#include "net/net_v10.h"
+#include "net/net_v9.h"
+#include "net/net_v8.h"
+#include "net/net_v7.h"
+#include "net/net_v6.h"
+
+typedef ncclNet_v10_t ncclNet_t;
+typedef ncclCollNet_v10_t ncclCollNet_t;
+typedef ncclNetSGE_v10_t ncclNetSGE_t;
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10
+
+#endif // end include guard
diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h
new file mode 100644
index 0000000..34cf9a9
--- /dev/null
+++ b/src/include/plugin/nccl_profiler.h
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+enum {
+  ncclProfileGroup = (1 << 0),     // group event type
+  ncclProfileColl = (1 << 1),      // host collective call event type
+  ncclProfileP2p = (1 << 2),       // host point-to-point call event type
+  ncclProfileProxyOp = (1 << 3),   // proxy operation event type
+  ncclProfileProxyStep = (1 << 4), // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5), // proxy control event type
+  ncclProfileKernelCh = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin = (1 << 7), // network plugin-defined events
+};
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_t;
+
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+
+#include 
+#include "profiler/profiler_v3.h"
+#include "profiler/profiler_v2.h"
+#include "profiler/profiler_v1.h"
+
+typedef ncclProfiler_v3_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
+
+#define NCCL_PROFILER_NET_VER_BITS (16)
+#define 
NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h new file mode 100644 index 0000000..f240189 --- /dev/null +++ b/src/include/plugin/nccl_tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +#include "tuner/tuner_v4.h" +#include "tuner/tuner_v3.h" +#include "tuner/tuner_v2.h" + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h new file mode 100644 index 0000000..ada6d48 --- /dev/null +++ b/src/include/plugin/net/net_v10.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
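+  // For reference, a minimal caller-side sketch (illustrative only; NCCL core
+  // interleaves these retries with other progress work rather than spinning):
+  //   void* sendComm = NULL;
+  //   ncclNetDeviceHandle_v10_t* sendDevComm = NULL;
+  //   do {
+  //     NCCLCHECK(net->connect(dev, &config, handle, &sendComm, &sendDevComm));
+  //   } while (sendComm == NULL);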
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v10_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclCollNet_v10_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v6.h b/src/include/plugin/net/net_v6.h new file mode 100644 index 0000000..99445ce --- /dev/null +++ b/src/include/plugin/net/net_v6.h @@ -0,0 +1,113 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V6_H_ +#define NET_V6_H_ + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. +} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +#endif diff --git a/src/include/plugin/net/net_v7.h b/src/include/plugin/net/net_v7.h new file mode 100644 index 0000000..e9b19de --- /dev/null +++ b/src/include/plugin/net/net_v7.h @@ -0,0 +1,120 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V7_H_ +#define NET_V7_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
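+  // Illustrative sketch (hypothetical values): a simple single-port plugin
+  // might fill the v7 properties as
+  //   props->name = "example0";
+  //   props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA;
+  //   props->speed = 100000;  // 100 Gbps port, expressed in Mbps
+  //   props->maxComms = 1024; props->maxRecvs = 1;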
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +#endif diff --git a/src/include/plugin/net/net_v8.h b/src/include/plugin/net/net_v8.h new file mode 100644 index 0000000..a178132 --- /dev/null +++ b/src/include/plugin/net/net_v8.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V8_H_ +#define NET_V8_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  uint32_t size;
+} ncclNetSGE_v8_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+ // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v8_t; + +#endif diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h new file mode 100644 index 0000000..ce9d917 --- /dev/null +++ b/src/include/plugin/net/net_v9.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V9_H_ +#define NET_V9_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // at which index the new vNIC exists
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
+} ncclNet_v9_t;
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  size_t size;
+} ncclNetSGE_v9_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
+      size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+      void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
+      size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+      ncclDataType_t dataType, ncclRedOp_t redOp,
+      void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete.
If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclCollNet_v9_t; + +#endif // end include guard diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h new file mode 100644 index 0000000..7336c34 --- /dev/null +++ b/src/include/plugin/plugin.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PLUGIN_H_ +#define NCCL_PLUGIN_H_ + +#include "nccl.h" + +void* ncclOpenNetPluginLib(const char* name); +void* ncclOpenTunerPluginLib(const char* name); +void* ncclOpenProfilerPluginLib(const char* name); +void* ncclGetNetPluginLib(void); +ncclResult_t ncclClosePluginLib(void* handle); + +#endif diff --git a/src/include/plugin/profiler/net_ib.h b/src/include/plugin/profiler/net_ib.h new file mode 100644 index 0000000..2ac6d5c --- /dev/null +++ b/src/include/plugin/profiler/net_ib.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_H_ +#define NET_IB_H_ + +#include "nccl_profiler.h" +#include "net_ib_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_ib_v1.h b/src/include/plugin/profiler/net_ib_v1.h new file mode 100644 index 0000000..f142de5 --- /dev/null +++ b/src/include/plugin/profiler/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/net_socket.h b/src/include/plugin/profiler/net_socket.h new file mode 100644 index 0000000..9f57496 --- /dev/null +++ b/src/include/plugin/profiler/net_socket.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_H_ +#define NET_SOCKET_H_ + +#include "nccl_profiler.h" +#include "net_socket_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_socket_v1.h b/src/include/plugin/profiler/net_socket_v1.h new file mode 100644 index 0000000..0cb664f --- /dev/null +++ b/src/include/plugin/profiler/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v1.h b/src/include/plugin/profiler/profiler_v1.h new file mode 100644 index 0000000..3b67102 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v1.h @@ -0,0 +1,107 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+  void* parentObj;  // pointer to the profiler parent object (for coll this is the group)
+  int rank;         // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;          // pid of the originating process
+      uint8_t channelId;  // channel id for this proxy operation
+      int peer;           // remote rank for send/recv
+      int nSteps;         // number of steps for this proxy operation
+      int chunkSize;      // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
diff --git a/src/include/plugin/profiler/profiler_v2.h b/src/include/plugin/profiler/profiler_v2.h
new file mode 100644
index 0000000..146152a
--- /dev/null
+++ b/src/include/plugin/profiler/profiler_v2.h
@@ -0,0 +1,104 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+typedef struct {
+  uint8_t type;  // event type descriptor: ncclProfileColl, ...
+  void* parentObj;  // pointer to the profiler parent object (for coll this is the group)
+  int rank;         // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;          // pid of the originating process
+      uint8_t channelId;  // channel id for this proxy operation
+      int peer;           // remote rank for send/recv
+      int nSteps;         // number of steps for this proxy operation
+      int chunkSize;      // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
diff --git a/src/include/plugin/profiler/profiler_v3.h b/src/include/plugin/profiler/profiler_v3.h
new file mode 100644
index 0000000..10c5059
--- /dev/null
+++ b/src/include/plugin/profiler/profiler_v3.h
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+typedef struct {
+  uint8_t type;  // event type descriptor: ncclProfileColl, ...
+  void* parentObj;  // pointer to the profiler parent object (for coll this is the group)
+  int rank;         // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;          // pid of the originating process
+      uint8_t channelId;  // channel id for this proxy operation
+      int peer;           // remote rank for send/recv
+      int nSteps;         // number of steps for this proxy operation
+      int chunkSize;      // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v2.h b/src/include/plugin/tuner/tuner_v2.h
new file mode 100644
index 0000000..ec96f60
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v2.h
@@ -0,0 +1,53 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V2_H_
+#define TUNER_V2_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator.
Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - collNetTypeSupport: whether collnet supports this type
+  //   - nvlsTypeSupport: whether nvlink sharp supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set the
+  // algorithm and protocol together, but not only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int* algorithm, int* protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v2_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v3.h b/src/include/plugin/tuner/tuner_v3.h
new file mode 100644
index 0000000..4fa10e8
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v3.h
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V3_H_
+#define TUNER_V3_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
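+  //
+  //                    As an illustrative sketch only (not part of the API contract): a tuner
+  //                    that wants to force ring/simple whenever NCCL core considers that entry
+  //                    valid could rewrite the table as below, using the NCCL_ALGO_RING,
+  //                    NCCL_PROTO_SIMPLE and NCCL_NUM_PROTOCOLS constants from NCCL core:
+  //
+  //                      float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  //                      if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE)
+  //                        table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; // lowest cost wins
+  //
+  //                    Entries left untouched keep the cost estimated by NCCL core.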
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set the
+  // algorithm and protocol together, but not only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v4.h b/src/include/plugin/tuner/tuner_v4.h
new file mode 100644
index 0000000..a4b38a0
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v4.h
@@ -0,0 +1,56 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V4_H_
+#define TUNER_V4_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: whether the user buffer can be registered
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to leave all outputs unset, or to set the
+  // algorithm and protocol together, but not only one of the two.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2b7efe0..8d41079 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -17,6 +17,18 @@ struct ncclTaskP2p; struct ncclInfo; struct ncclComm; struct ncclProxyOp; +struct ncclProxyConnector; + +struct ncclProfilerProxy { + bool initialized; + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; + uint64_t workCounter[MAXCHANNELS]; // host work counter + struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; + struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; +}; + +extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); @@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); +// Kernel Channel Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); + // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); @@ -51,5 +67,9 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); + +// Profiler callback for network plugin +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c97a4d7..225acb2 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,7 +32,8 @@ typedef enum : uint8_t { ncclPatternPatUp, ncclPatternPatDown, ncclPatternSend, - ncclPatternRecv + ncclPatternRecv, + ncclPatternProfiler, } ncclPattern_t; enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; @@ -93,6 +94,7 @@ struct ncclProxyOp { int peer; pid_t pid; void* profilerContext; + uint64_t workCounter; struct ncclProxyOp *enqNext; }; @@ -129,12 +131,15 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; + uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; + void* kernelEventHandle; void* stepEventHandles[NCCL_STEPS]; size_t transSize; + uint64_t workCounter; void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; diff --git a/src/include/ras.h b/src/include/ras.h index 7909b3d..d27a543 100644 --- a/src/include/ras.h +++ b/src/include/ras.h @@ -15,6 +15,8 @@ struct rasRankInit { pid_t pid; int cudaDev; int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; }; ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); diff --git a/src/include/register.h b/src/include/register.h index 740a645..143f41b 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -42,7 +42,7 @@ struct ncclReg { uintptr_t baseAddr; size_t baseSize; CUdeviceptr regAddr; - size_t regSize; + size_t regUCSize, regMCSize; int dev; 
 CUmemGenericAllocationHandle mcHandle;
 uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
diff --git a/src/include/shm.h b/src/include/shm.h
index b519e5d..223d873 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -14,7 +14,6 @@ struct shmCuIpc {
     CUmemFabricHandle handle;
     CUmemGenericAllocationHandle data;
   };
-  int tpProxyRank;
   void *ptr;
   size_t size;
 };
@@ -30,8 +29,8 @@ struct shmIpcDesc {
 
 typedef struct shmIpcDesc ncclShmIpcDesc_t;
 
-ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
-ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
+ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
 
 ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
 
 #endif
diff --git a/src/include/socket.h b/src/include/socket.h
index f0a3237..ffa1480 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
 ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
-ncclResult_t ncclSocketClose(struct ncclSocket* sock);
+ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
 #endif
diff --git a/src/include/strongstream.h b/src/include/strongstream.h
index 0984dfe..c56d5ac 100644
--- a/src/include/strongstream.h
+++ b/src/include/strongstream.h
@@ -10,13 +10,24 @@
 
 #include "nccl.h"
 #include "checks.h"
+#include <cuda.h>
+#include <pthread.h>
 #include <cuda_runtime.h>
 
+// ncclCudaContext: wraps a CUDA context with per-context state.
+struct ncclCudaContext;
+
+// Get a ncclCudaContext to track the currently active CUDA context.
+ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out);
+// Drop reference.
+void ncclCudaContextDrop(struct ncclCudaContext* cxt);
+
 /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
  * easily. */
 struct ncclCudaGraph {
 #if CUDART_VERSION >= 11030
+  cudaStream_t origin;
   cudaGraph_t graph;
   unsigned long long graphId;
 #endif
@@ -25,6 +36,7 @@ struct ncclCudaGraph {
 inline struct ncclCudaGraph ncclCudaGraphNone() {
   struct ncclCudaGraph tmp;
 #if CUDART_VERSION >= 11030
+  tmp.origin = nullptr;
   tmp.graph = nullptr;
   tmp.graphId = ULLONG_MAX;
 #endif
@@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() {
 inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
 #if CUDART_VERSION >= 11030
-  return graph.graph != nullptr;
+  return graph.graphId != ULLONG_MAX;
 #else
   return false;
 #endif
@@ -57,60 +69,37 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
 * streams unfit for the use of serializing access to a persistent resource.
 * Strong streams have been introduced to address this need.
 *
- * - All updates to a strong stream must be enclosed by a Acquire/Release pair.
+ * All updates to a strong stream must be enclosed by an Acquire/Release pair.
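+ *
+ * A minimal usage sketch (illustrative only; it assumes a constructed strong
+ * stream `ss` with a single user, so `concurrent` is false, and relies on the
+ * Acquire/Release declarations below):
+ *
+ *   cudaStream_t work;
+ *   NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &ss, false, &work));
+ *   ... enqueue kernels/copies on `work` ...
+ *   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &ss, false));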
 *
- * - The Acquire, Release, and all updates take a ncclCudaGraph parameter
- *   indicating the currently capturing graph (or none). This parameter must be
- *   the same for the entire sequence of {Acquire; ...; Release}.
+ * Acquire retrieves a "work" stream (cudaStream_t) which may be used to add
+ * work.
 *
- * - An {Acquire; ...; Release} sequence must not be concurrent with any
- *   other operations against the strong stream including graph launches which
- *   reference this stream.
+ * Release publishes the work stream's work into the strong stream. The Release
+ * must be issued by the same thread that did the Acquire.
 */
 struct ncclStrongStream;
 
 ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
 ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
 
-// Acquire-fence the strong stream.
+// Acquire the strong stream. Upon return `*workStream` will be usable to add work.
+// `concurrent` indicates if other threads may be using the strong stream.
 ncclResult_t ncclStrongStreamAcquire(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );
 
-// Acquire-fence the strong stream assuming no graph is capturing. This permits
-// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA
-// calls. Strong stream still must be released via:
-//   ncclStrongStreamRelease(ncclCudaGraphNone(), ss);
-ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
-
-// Release-fence of the strong stream.
-ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
-
-// Add a host launch to the stream.
-ncclResult_t ncclStrongStreamLaunchHost(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  cudaHostFn_t fn, void* arg
-);
-// Add a kernel launch to the stream.
-ncclResult_t ncclStrongStreamLaunchKernel(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
+// Get the workStream for an already acquired strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamAcquiredWorkStream(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );
 
-// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
-// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
-// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
-// implementation to induce few graph dependencies.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
-);
-// `b` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
-);
-// `a` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
+// Release of the strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent);
+
+ncclResult_t ncclStreamWaitStream(
+  cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent
 );
 
 // Synchronization does not need the strong stream to be acquired.
@@ -118,23 +107,28 @@ ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);
 
 ////////////////////////////////////////////////////////////////////////////////
 
-struct ncclStrongStreamGraph; // internal to ncclStrongStream
+struct ncclStrongStreamCapture; // internal to ncclStrongStream
 
 struct ncclStrongStream {
-  // Used when not graph capturing.
-  cudaStream_t cudaStream;
+  // The stream to use for non-captured work.
+  cudaStream_t liveStream;
+  void* liveAcquiredBy;
 #if CUDART_VERSION >= 11030
+  // This stream ever appeared in a graph capture.
+  bool everCaptured;
+  pthread_mutex_t lock;
+  struct ncclStrongStreamCapture* captureHead;
   // The event used to establish order between graphs and streams. During acquire
   // this event is waited on, during release it is recorded to.
   cudaEvent_t serialEvent;
-  // This stream ever appeared in a graph capture.
-  bool everCaptured;
-  // Tracks whether serialEvent needs to be recorded to upon Release().
-  bool serialEventNeedsRecord;
-  struct ncclStrongStreamGraph* graphHead;
-#else
-  cudaEvent_t scratchEvent;
 #endif
 };
 
+struct ncclCudaContext {
+  struct ncclCudaContext* next;
+  CUcontext hcontext;
+  int refCount;
+  struct ncclStrongStream launchOrder;
+};
+
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 37187f6..c563fbb 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -18,6 +18,7 @@
 #define TRANSPORT_SHM 1
 #define TRANSPORT_NET 2
 #define TRANSPORT_COLLNET 3
+#define TRANSPORT_PROFILER 4
 
 #include "proxy.h"
 #include "comm.h"
@@ -26,6 +27,7 @@ extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
 extern struct ncclTransport netTransport;
 extern struct ncclTransport collNetTransport;
+extern struct ncclTransport profilerTransport;
 
 extern struct ncclTransport* ncclTransports[];
 
 // Forward declarations
@@ -65,8 +67,10 @@ struct ncclNvlsSharedRes {
   CUmulticastObjectProp signalProp;
   CUmemAccessDesc accessDesc;
   int dev;
-  size_t buffSize;
-  size_t creditSize;
+  size_t creditUCSize;
+  size_t creditMCSize;
+  size_t buffUCSize;
+  size_t buffMCSize;
   CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
   CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
   char* mcBuff; // Multicast NVLS buffer address
@@ -123,7 +127,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
 ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
 ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts);
 ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/init.cc b/src/init.cc index 3e218ab..46b02e6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -51,17 +51,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); -static uint64_t hashUniqueId(ncclUniqueId const &id) { - char const *bytes = (char const*)&id; - uint64_t h = 0xdeadbeef; - for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -111,7 +100,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { memset(out, 0, sizeof(*out)); // copy to avoid alignment mismatch memcpy(out, &handle, sizeof(handle)); - TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES)); return ncclSuccess; } @@ -232,6 +221,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } @@ -268,6 +259,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); + + ncclCudaContextDrop(comm->context); + free(comm); return ncclSuccess; @@ -309,17 +303,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); - if (ret != ncclSuccess) { - /* if ret is not ncclInProgress, we just keep it. */ + if (ret == ncclInProgress) { WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); - if (ret == ncclInProgress) ret = ncclInvalidArgument; + ret = ncclInvalidArgument; goto exit; } - /* if there is linked group job, we should complete it. */ - if (comm->groupJob) { - NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); - comm->groupJob = NULL; - } + /* if ret is not ncclInProgress, we just keep it. */ } exit: @@ -357,6 +346,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // the device we're on (failure cause #1) , better know it early. 
CUDACHECK(cudaGetDevice(&comm->cudaDev)); + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; @@ -396,6 +387,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { @@ -437,13 +430,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; + cudaStream_t deviceStream; - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; @@ -494,10 +488,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoConsumedLeast = 0; tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + // Alloc profiler counters for the kernel + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail); + tmpCommAndChans.comm.workStarted = comm->profiler.workStarted; + tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted; + ncclCommPushCudaHostFree(comm, comm->profiler.workStarted); + ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted); + if (comm->collNetDenseToUserRank != nullptr) { - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail); } for (int 
c=0; c < MAXCHANNELS; c++) { @@ -510,14 +512,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; if (comm->channels[c].ring.userRanks != nullptr) { - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail); } } - NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail); exit: + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; @@ -1000,6 +1002,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic); } + comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; @@ -1376,12 +1379,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), - // add unique split counter and the color - ncclUniqueId tmpId; - memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); - comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); + // child hash obtained from (parent hash, split count, color) + uint64_t hacc[2] = {1, 1}; + eatHash(hacc, &job->parent->commHash); + eatHash(hacc, &job->splitCount); + eatHash(hacc, &job->color); + comm->commHash = digestHash(hacc); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1394,8 +1397,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // obtain a unique hash using the first commId - comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); - commIdHash = hashUniqueId(job->commId[0]); + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, 
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1610,6 +1612,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1618,6 +1621,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; + comm->config.trafficClass = internalConfigPtr->trafficClass; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); @@ -1642,6 +1646,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId const char* commIdEnv = NULL; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob* job = NULL; + bool launchedJob = false; // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); @@ -1695,12 +1700,13 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // start the bootstrap root before bootstrapping, use only the first handle NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail); } + launchedJob = true; NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: - if (job) ncclCommInitJobFree(job); + if (job && !launchedJob) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1896,7 +1902,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. - while (comm->persistentRefs != 0) { + while (comm->localPersistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) { @@ -1964,7 +1970,6 @@ exit: } return ret; fail: - free(job); if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -2215,6 +2220,11 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); + /* if there is a linked group job, we should complete it.
*/ + if (*asyncError == ncclSuccess && comm->groupJob) { + NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); + comm->groupJob = NULL; + } return ncclSuccess; } @@ -2265,16 +2275,13 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { #if CUDART_VERSION >= 12010 size_t memGran = 0; - size_t mcGran = 0; CUdevice currentDev; CUmemAllocationProp memprop = {}; - CUmulticastObjectProp mcprop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag; int dcnt; - int mcSupport = 0; if (ptr == NULL || size == 0) goto fallback; @@ -2284,6 +2291,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGet(&currentDev, cudaDev)); if (ncclCuMemEnable()) { + size_t handleSize = size; int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; @@ -2299,40 +2307,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CUDACHECK(cudaGetDeviceCount(&dcnt)); - - if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { - /* mc property */ - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); - } else { - ALIGN_SIZE(size, memGran); - } + ALIGN_SIZE(handleSize, memGran); if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } } else { /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ for (int i = 0; i < dcnt; ++i) { int p2p = 0; @@ -2340,7 +2333,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = i; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); } if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } diff --git
a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 23746b3..3e9dfcd 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, } control_un; struct cmsghdr *cmptr; - char dummy_buffer[1]; + char dummy_buffer[1] = {'\0'}; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to @@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); if (sendFd != -1) { + memset(&control_un, '\0', sizeof(control_un)); msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); diff --git a/src/misc/param.cc b/src/misc/param.cc index eb50cfe..d7c324f 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) { size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { + if (line[0] == '#') continue; if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index dfb4e68..731dbce 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -171,6 +171,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memset(addrs+found, '\0', sizeof(*addrs)); memcpy(addrs+found, interface->ifa_addr, salen); found++; } @@ -905,9 +906,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { if (sock != NULL) { if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { + if (wait) { + char data; + int closed = 0; + do { + int offset = 0; + if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break; + } while (closed == 0); + } /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 61b0e4b..e6cce98 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,28 +9,61 @@ #include "checks.h" #include "param.h" -// Tracks the chain of graph nodes for a given graph captured identified by -// its graph id. This state has to live for as long as captured work is being -// submitted. CUDA doesn't have mechanism to inform us when the user ends capture -// so the best we can do is get notified when the graph is destroyed. -struct ncclStrongStreamGraph { - struct ncclStrongStreamGraph* next; - // Atomically exchanged to false by both the main thread or the graph destructor - // callback. The last to arrive deletes the node. - bool alive; +// Tracks the work captured into a given graph, identified by its graph id. +struct ncclStrongStreamCapture { + struct ncclStrongStreamCapture* next; + cudaGraph_t graph; unsigned long long graphId; - // For each graph we track the "tip" of the chain of graph nodes.
A linear - // chain would always have just one node at its tip, but since we have to merge - // in chains from other streams (via ncclStrongStreamWaitStream) some spots - // in the chain can be wider than a single node and thus need a list, so we - // maintain a dynamically sized array of tip nodes. - int tipCount, tipCapacity; - cudaGraphNode_t* tipNodes; + cudaStream_t captureStream; + cudaGraphNode_t lastRecord; + void* acquiredBy; }; -static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { - free(g->tipNodes); - free(g); +//////////////////////////////////////////////////////////////////////////////// + +static ncclCudaContext* cxtListHead = nullptr; +static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { + ncclResult_t result = ncclSuccess; + CUcontext hcontext; + CUCHECK(cuCtxGetCurrent(&hcontext)); + + pthread_mutex_lock(&cxtListLock); + struct ncclCudaContext* p = cxtListHead; + while (1) { + if (p == nullptr) { + p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext)); + p->refCount = 1; + p->hcontext = hcontext; + p->next = cxtListHead; + cxtListHead = p; + NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave); + break; + } + if (p->hcontext == hcontext) { + p->refCount += 1; + break; + } + p = p->next; + } +leave: + pthread_mutex_unlock(&cxtListLock); + *out = p; + return ncclSuccess; +} + +void ncclCudaContextDrop(struct ncclCudaContext* cxt) { + pthread_mutex_lock(&cxtListLock); + if (0 == --cxt->refCount) { + struct ncclCudaContext** pp = &cxtListHead; + while (*pp != cxt) pp = &(*pp)->next; + *pp = cxt->next; // remove from list + // Destroy resources held in cxt + ncclStrongStreamDestruct(&cxt->launchOrder); + free(cxt); + } + pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// @@ -43,9 +76,9 @@ ncclResult_t ncclCudaGetCapturingGraph( NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, nullptr)); #if CUDART_VERSION >= 11030 + graph->origin = nullptr; graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif @@ -56,13 +89,14 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { + graph->origin = nullptr; graph->graph = nullptr; - gid = ULLONG_MAX; + graph->graphId = ULLONG_MAX; + } else { + graph->origin = stream; } - graph->graphId = gid; #endif } #endif @@ -86,315 +120,218 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; - ss->serialEventNeedsRecord = false; - 
ss->graphHead = nullptr; - #else - CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); + ss->captureHead = nullptr; + pthread_mutex_init(&ss->lock, nullptr); + CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } -static void graphDestructor(void* arg) { - struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } -} - ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamDestroy(ss->cudaStream)); + CUDACHECK(cudaStreamDestroy(ss->liveStream)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventDestroy(ss->serialEvent)); - // Delete list of per-graph chains. - struct ncclStrongStreamGraph* g = ss->graphHead; - while (g != nullptr) { - struct ncclStrongStreamGraph* next = g->next; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } - g = next; + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap) { + struct ncclStrongStreamCapture* next = cap->next; + CUDACHECK(cudaStreamDestroy(cap->captureStream)); + free(cap); + cap = next; } - #else - CUDACHECK(cudaEventDestroy(ss->scratchEvent)); + CUDACHECK(cudaEventDestroy(ss->serialEvent)); + pthread_mutex_destroy(&ss->lock); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) +NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1); +constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device."; -static void ensureTips(struct ncclStrongStreamGraph* g, int n) { - if (g->tipCapacity < n) { - g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); - g->tipCapacity = n; - } -} +static __thread char threadIdMarker; +static void* localThreadId() { return &threadIdMarker; } ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (graph.graph == nullptr) { - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + ss->liveAcquiredBy = localThreadId(); + if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); } } else { - ss->everCaptured = true; - // Find the current graph in our list of graphs if it exists. - struct ncclStrongStreamGraph** pg = &ss->graphHead; - struct ncclStrongStreamGraph* g; - while (*pg != nullptr) { - g = *pg; - if (g->graphId == graph.graphId) { - // Move to front of list so that operations after acquire don't have to search the list. - *pg = g->next; - g->next = ss->graphHead; - ss->graphHead = g; + bool firstCapture = !ss->everCaptured; + __atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED); + + ncclResult_t ret = ncclSuccess; + if (concurrent) pthread_mutex_lock(&ss->lock); + + // Look for capture in our list of active captures. 
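The comment above introduces the list walk that follows. Condensed to its essentials, the lookup-or-recycle logic looks like the sketch below; Capture is a simplified stand-in for ncclStrongStreamCapture, and locking plus error handling are elided.

```
#include <cuda_runtime.h>
#include <cstdlib>

// Simplified stand-in for ncclStrongStreamCapture.
struct Capture {
  Capture* next;
  unsigned long long graphId;
  cudaStream_t captureStream;
};

// Walk the list: reuse a live entry for this graph id, unlink entries
// whose capture has ended (keeping one as a spare for reuse), and let
// the caller allocate or recycle *spare when nothing matches.
Capture* findOrRecycle(Capture** phead, unsigned long long graphId, Capture** spare) {
  Capture** pcap = phead;
  while (*pcap != nullptr) {
    Capture* cap = *pcap;
    if (cap->graphId == graphId) return cap;          // live match
    cudaStreamCaptureStatus st;
    cudaStreamIsCapturing(cap->captureStream, &st);
    if (st == cudaStreamCaptureStatusActive) {
      pcap = &cap->next;                              // unrelated live capture
    } else {
      *pcap = cap->next;                              // capture ended: unlink
      if (*spare == nullptr) { *spare = cap; }
      else { cudaStreamDestroy(cap->captureStream); free(cap); }
    }
  }
  return nullptr;
}
```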
+ struct ncclStrongStreamCapture** pcap = &ss->captureHead; + struct ncclStrongStreamCapture* cap; + struct ncclStrongStreamCapture* spare = nullptr; + while (*pcap != nullptr) { + cap = *pcap; + if (cap->graphId == graph.graphId) { // Capture node already exists. + *workStream = cap->captureStream; + cap->acquiredBy = localThreadId(); + if (concurrent) pthread_mutex_unlock(&ss->lock); return ncclSuccess; - } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { - // Unrelated graph that has been destroyed. Remove and delete. - *pg = g->next; - ncclStrongStreamGraphDelete(g); } else { - pg = &g->next; + cudaStreamCaptureStatus status; + CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock); + if (status == cudaStreamCaptureStatusActive) { + pcap = &cap->next; // Active capture doesn't match, on to next. + } else { // Capture no longer active + *pcap = cap->next; // Remove from current list + if (spare == nullptr) { // Keep one spare to reuse below. + spare = cap; + } else { + cudaStreamDestroy(cap->captureStream); + free(cap); + } + } } } - - // This is a new graph so add to the list. - g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); - g->graphId = graph.graphId; - g->tipNodes = nullptr; - g->tipCapacity = 0; - g->tipCount = 0; - g->next = ss->graphHead; - ss->graphHead = g; - g->alive = true; - NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); - - if (mixing && ss->serialEventNeedsRecord) { - // Can only be here if previous release was for uncaptured work that - // elided updating the event because no capture had yet occurred. - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); + // No matching capture, need a new entry. + cap = spare; + if (cap == nullptr) { + cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } - ss->serialEventNeedsRecord = false; + cap->graphId = graph.graphId; + cap->lastRecord = nullptr; + cap->acquiredBy = localThreadId(); + // Push to capturing list. + cap->next = ss->captureHead; + ss->captureHead = cap; - // First node in the chain must be a wait on the serialEvent. + do_unlock: + if (concurrent) pthread_mutex_unlock(&ss->lock); + if (ret != ncclSuccess) return ret; + + *workStream = cap->captureStream; + + // Bring captureStream into the graph but without any dependencies. 
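Before the capture-join code resumes below, the race detection threaded through this path is worth a note: localThreadId(), defined earlier in this diff, returns the address of a thread-local marker byte, giving a cheap, allocation-free thread identity that acquire can stash and release can compare. A minimal sketch of the idiom:

```
// The address of a thread-local is unique per live thread, so it can
// serve as an identity token without any allocation or syscall.
static __thread char threadIdMarker;
static void* localThreadId() { return &threadIdMarker; }

// Stash at acquire time...
void* recordOwner() { return localThreadId(); }

// ...and compare at release time to detect a second racing thread.
bool sameThread(void* owner) { return owner == localThreadId(); }
```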
+ cudaEvent_t scratch; + CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming)); + CUDACHECK(cudaEventRecord(scratch, graph.origin)); + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); + CUDACHECK(cudaEventDestroy(scratch)); + CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); + + if (mixing && firstCapture) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } if (mixing) { - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); - g->tipCount = 1; - } else { - g->tipCount = 0; + // First dependency is to wait on serialEvent + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, cudaEventWaitExternal)); } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream + ) { #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + } else { + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + *workStream = cap->captureStream; + if (concurrent) pthread_mutex_unlock(&ss->lock); } - ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. + #else + *workStream = ss->liveStream; #endif return ncclSuccess; } -static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { - if (g == nullptr || g->graphId != id) { - WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); - return ncclInternalError; - } - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { +ncclResult_t ncclStrongStreamRelease( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent + ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->serialEventNeedsRecord) { - if (graph.graph == nullptr) { - if (ss->everCaptured) { - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); - ss->serialEventNeedsRecord = false; + if (mixing) { + if (graph.graphId == ULLONG_MAX) { + if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; } } else { - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); - g->tipCount = 1; - ss->serialEventNeedsRecord = false; + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + if (concurrent) pthread_mutex_unlock(&ss->lock); + + // Add event record node with dependencies added further down.
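The Acquire hunk above joins the capture with a throwaway event and then clears the inherited dependency set. Isolated into a standalone helper, the idiom looks like the sketch below (joinCaptureUnordered is an illustrative name; it assumes origin is actively capturing):

```
#include <cuda_runtime.h>

// Recording an event on the capturing origin stream and waiting on it
// pulls worker into the same capture; clearing the dependency set then
// drops the inherited edge so later work on worker starts unordered.
void joinCaptureUnordered(cudaStream_t origin, cudaStream_t worker) {
  cudaEvent_t scratch;
  cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming);
  cudaEventRecord(scratch, origin);
  cudaStreamWaitEvent(worker, scratch, 0); // worker now captures into origin's graph
  cudaEventDestroy(scratch);
  cudaStreamUpdateCaptureDependencies(worker, nullptr, 0, cudaStreamSetCaptureDependencies);
}
```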
+ cudaGraphNode_t recordNode; + CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); + + // Make this record order after previous record on this stream. + if (cap->lastRecord != nullptr) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + } + cap->lastRecord = recordNode; + + // Get current nodes from work stream so we can add them as dependencies. + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. + cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count)); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1)); + } + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + } + } + + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); - } else { - cudaHostNodeParams p; - p.fn = fn; - p.userData = arg; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - } else { - cudaKernelNodeParams p; - p.func = fn; - p.gridDim = grid; - p.blockDim = block; - p.kernelParams = args; - p.sharedMemBytes = sharedMemBytes; - p.extra = nullptr; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - #endif - return ncclSuccess; -} - -// Merge node list `b` into list `a` but don't add duplicates. 
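The Release path above also copes with CUDA 12.3 edge annotations: cudaStreamGetCaptureInfo_v2 fails with cudaErrorLossyQuery when the capture's dependency edges carry annotation data the older query cannot report, so the code retries with the _v3 variant. A condensed sketch of that fallback, with error handling reduced to the essentials:

```
#include <cuda_runtime.h>

// Fetch the capture's current leaf nodes and make them dependencies of
// recordNode, preserving edge annotations when the runtime reports them.
cudaError_t addDepsOnCaptureLeaves(cudaStream_t stream, cudaGraph_t graph, cudaGraphNode_t recordNode) {
  cudaStreamCaptureStatus status;
  const cudaGraphNode_t* nodes;
  size_t count = 0;
  cudaError_t res = cudaStreamGetCaptureInfo_v2(stream, &status, nullptr, nullptr, &nodes, &count);
#if CUDART_VERSION >= 12030
  if (res == cudaErrorLossyQuery) { // edges are annotated: use the lossless query
    const cudaGraphEdgeData* edges;
    cudaStreamGetCaptureInfo_v3(stream, &status, nullptr, nullptr, &nodes, &edges, &count);
    for (size_t i = 0; i < count; i++)
      cudaGraphAddDependencies_v2(graph, &nodes[i], &recordNode, &edges[i], 1);
    return cudaSuccess;
  }
#endif
  if (res != cudaSuccess) return res;
  for (size_t i = 0; i < count; i++)
    cudaGraphAddDependencies(graph, &nodes[i], &recordNode, 1);
  return cudaSuccess;
}
```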
-static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { - int an = a->tipCount; - ensureTips(a, an + bn); - for (int bi=0; bi < bn; bi++) { - for (int ai=0; ai < an; ai++) { - if (a->tipNodes[ai] == bNodes[bi]) goto next_b; - } - a->tipNodes[a->tipCount++] = bNodes[bi]; - next_b:; - } -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bg->tipNodes, bg->tipCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - // It is ok to use a->serialEvent to record b since we'll be setting - // a->serialEventNeedsRecord so the event won't be considered accurate - // until re-recorded. - CUDACHECK(cudaEventRecord(a->serialEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); - } else { - cudaStreamCaptureStatus status; - unsigned long long bGraphId; - cudaGraphNode_t const* bNodes; - size_t bCount = 0; - CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); - if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { - WARN("Stream is not being captured by the expected graph."); - return ncclInvalidUsage; - } - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bNodes, bCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(a->scratchEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); - #endif - return ncclSuccess; -} - -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, - b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies - )); - } - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); - #endif +ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent) { + CUDACHECK(cudaEventRecord(scratchEvent, b)); + CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0)); return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); #endif - CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); + CUDACHECK(cudaStreamSynchronize(ss->liveStream)); return ncclSuccess; } diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc deleted file mode 100644 index 267e12a..0000000 --- a/src/misc/tuner.cc +++ /dev/null @@ -1,267 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include -#include - -#include "checks.h" -#include "debug.h" -#include "tuner.h" - -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int tunerPluginRefCount; -static void* tunerPluginLib = nullptr; -static ncclTuner_v4_t* tunerSymbol = nullptr; -static ncclTuner_v3_t* ncclTuner_v3 = nullptr; -static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v4_t ncclTuner_v2_as_v4; -static ncclTuner_v4_t ncclTuner_v3_as_v4; - -static int hasNvlsSupport(float** collCostTable) { - // Requirements for support of different algorithms: - // - // - NVLS intra-node: nvlsSupport - // - NVLS intra+inter-node: collNetSupport - // - NVLSTree intra-node: always disabled - // - NVLSTree inter-node: nvlsSupport - // - Collnet* inter-node: collNetSupport - // - // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; -} - -static int hasCollNetSupport(float** collCostTable) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; -} - -static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { - NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; - ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { - int algorithm = NCCL_ALGO_UNDEF; - int protocol = NCCL_PROTO_UNDEF; - int nvlsSupport = hasNvlsSupport(collCostTable); - int collNetSupport = hasCollNetSupport(collCostTable); - NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); - // set time to 0 below to make sure this algorithm/protocol is selected later on - if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; - } - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; - ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; - return ncclSuccess; -} - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(const char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
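The deleted loader above bridges older tuner ABIs by hiding a vN vtable behind vN+1-shaped shims; ncclTuner_v2_as_v4_getCollInfo, for instance, synthesizes arguments the old entry point never had. Stripped of NCCL specifics, the pattern reduces to this hypothetical sketch (ApiV1 and ApiV2 are illustrative types, not NCCL symbols):

```
// An old v1 vtable is wrapped so callers only ever see the v2 shape.
typedef struct { int (*query)(int dev, int* speed); } ApiV1;
typedef struct { int (*query)(int dev, int* speed, int* latency); } ApiV2;

static ApiV1* v1;      // resolved via dlsym() in a real loader
static ApiV2 v1_as_v2; // adapter instance handed to new callers

static int v1_as_v2_query(int dev, int* speed, int* latency) {
  *latency = 0;                 // default for a field v1 cannot report
  return v1->query(dev, speed); // forward the rest unchanged
}

static void wireAdapter(void) { v1_as_v2.query = v1_as_v2_query; }
```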
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openTunerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char tunerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); - if (envTunerPluginName && strlen(envTunerPluginName)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); - snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - // Users are allowed to pack tuner into the net plugin - snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - tunerPluginLibName[0] = '\0'; - return nullptr; -} - -enum { - tunerPluginLoadFailed = -1, - tunerPluginLoadReady = 0, - tunerPluginLoadSuccess = 1, -}; - -#define MAX_PLUGIN_LOAD 4 - -static int status = tunerPluginLoadReady; - -ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { - // Initialize to nullptr by default 
if plugin tuner cannot be loaded. - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - comm->tuner = nullptr; - if (tunerPluginLoadFailed == status) { - return ncclSuccess; - } - - pthread_mutex_lock(&tunerPluginLock); - if (tunerPluginLoadFailed == status) { - goto exit; - } - - if (tunerPluginLoadSuccess == status) { - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - goto exit; - } - - tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (nullptr == tunerPluginLib) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); - } - goto fail; - } - - tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); - if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); - ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); - if (ncclTuner_v3 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; - } else { - ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v4; - } - } else { - ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - tunerSymbol = &ncclTuner_v3_as_v4; - } - } - - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - status = tunerPluginLoadSuccess; - comm->tunerPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -fail: - tunerPluginLib = nullptr; - status = tunerPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); - if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - dlclose(tunerPluginLib); - tunerPluginLib = nullptr; - tunerSymbol = nullptr; - comm->tuner = nullptr; - status = tunerPluginLoadReady; - comm->tunerPluginLoaded = 0; - } - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 8a6f94e..f3ab534 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -66,6 +66,7 @@ typedef struct ncclConfig_v21700 { int maxCTAs; const char *netName; int splitShare; + int trafficClass; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -79,7 +80,8 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. 
*/ diff --git a/src/net.cc b/src/net.cc deleted file mode 100644 index 13e8c2b..0000000 --- a/src/net.cc +++ /dev/null @@ -1,1033 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "net.h" -#include "bootstrap.h" -#include "checks.h" - -#include -#include -#include -//#include -//#include -//#include - -static ncclNet_v9_t ncclNet_v5_as_v9; -static ncclNet_v9_t ncclNet_v6_as_v9; -static ncclNet_v9_t ncclNet_v7_as_v9; -static ncclNet_v9_t ncclNet_v8_as_v9; -static ncclNet_v5_t *ncclNet_v5; -static ncclNet_v6_t *ncclNet_v6; -static ncclNet_v7_t *ncclNet_v7; -static ncclNet_v8_t *ncclNet_v8; -static ncclCollNet_v9_t ncclCollNet_v5_as_v9; -static ncclCollNet_v9_t ncclCollNet_v6_as_v9; -static ncclCollNet_v9_t ncclCollNet_v7_as_v9; -static ncclCollNet_v9_t ncclCollNet_v8_as_v9; -static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclCollNet_v6_t *ncclCollNet_v6; -static ncclCollNet_v7_t *ncclCollNet_v7; -static ncclCollNet_v8_t *ncclCollNet_v8; - -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. -#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried - -static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = p8.netDeviceType; - props->netDeviceVersion = p8.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v8->init(logfn)); - ncclNet_v8_as_v9.name = ncclNet_v8->name; - ncclNet_v8_as_v9.devices = ncclNet_v8->devices; - ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; - ncclNet_v8_as_v9.listen = ncclNet_v8->listen; - ncclNet_v8_as_v9.connect = ncclNet_v8->connect; - ncclNet_v8_as_v9.accept = ncclNet_v8->accept; - ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; -
ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; - ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; - ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; - ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; - ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; - ncclNet_v8_as_v9.test = ncclNet_v8->test; - ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; - ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; - ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; - ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; - ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; - ncclNet_v8_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = p7.netDeviceType; - props->netDeviceVersion = p7.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v9.name = ncclNet_v7->name; - ncclNet_v7_as_v9.devices = ncclNet_v7->devices; - ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v9.listen = ncclNet_v7->listen; - ncclNet_v7_as_v9.connect = ncclNet_v7->connect; - ncclNet_v7_as_v9.accept = ncclNet_v7->accept; - ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; - ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; - ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; - ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v9.test = ncclNet_v7->test; - ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; -
ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; - ncclNet_v7_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v6->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v6->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v9.name = ncclNet_v6->name; - ncclNet_v6_as_v9.devices = ncclNet_v6->devices; - ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; - ncclNet_v6_as_v9.listen = ncclNet_v6->listen; - ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; - ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; - ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; - ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; - ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; - ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v9.test = ncclNet_v6->test; - ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v9.getDeviceMr = NULL; - ncclNet_v6_as_v9.irecvConsumed = NULL; - ncclNet_v6_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t*
props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v5->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v5->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i<n; i++) { - if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand.
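The comment above names a subtle ordering constraint: an old plugin may only populate its exported struct, name included, once its own init() has run, so each adapter wires its vtable after calling through. A hypothetical, condensed form of the pattern (PluginV1 and initV1AsV2 are illustrative names):

```
// Copy the old plugin's members only after its init() has filled them in.
typedef int (*logFn)(const char* msg);
typedef struct { const char* name; int (*init)(logFn log); int (*devices)(int* n); } PluginV1;

static PluginV1* v1plugin; // resolved via dlsym() in the real loader
static struct { const char* name; int (*devices)(int* n); } v1_as_v2;

static int initV1AsV2(logFn log) {
  int rc = v1plugin->init(log); // the old init may set name/devices here
  if (rc != 0) return rc;
  v1_as_v2.name = v1plugin->name;       // safe to copy only now
  v1_as_v2.devices = v1plugin->devices;
  return 0;
}
```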
-static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v9.name = ncclNet_v5->name; - ncclNet_v5_as_v9.devices = ncclNet_v5->devices; - ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; - ncclNet_v5_as_v9.listen = ncclNet_v5->listen; - ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; - ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; - ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; - ncclNet_v5_as_v9.regMrDmaBuf = NULL; - ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; - ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; - ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v9.test = ncclNet_v5->test; - ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v9.getDeviceMr = NULL; - ncclNet_v5_as_v9.irecvConsumed = NULL; - ncclNet_v5_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; - ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; - ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; - ncclCollNet_v5_as_v9.iallgather = nullptr; - ncclCollNet_v5_as_v9.ireducescatter = nullptr; - ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v6 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; - ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; - ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; - ncclCollNet_v6_as_v9.iallgather = nullptr; - ncclCollNet_v6_as_v9.ireducescatter = nullptr; - ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v7 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; - ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; - ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; - ncclCollNet_v7_as_v9.iallgather = nullptr; - ncclCollNet_v7_as_v9.ireducescatter = nullptr; - ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request) { - ncclNetSGE_v8_t recvPartsInt; - if (nRecvParts > 1) return ncclInternalError; - if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - recvPartsInt.mhandle = recvParts->mhandle; - recvPartsInt.address = recvParts->address; - recvPartsInt.size = (int)recvParts->size; - ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, - bytesPerRank, windowOffset, windowBytes, - sendMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request) { - ncclNetSGE_v8_t sendPartsInt; - if (nSendParts > 1) return ncclInternalError; - if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - sendPartsInt.mhandle = 
sendParts->mhandle; - sendPartsInt.address = sendParts->address; - sendPartsInt.size = (int)sendParts->size; - ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, - recvData, bytesPerRank, windowOffset, windowBytes, - dataType, redOp, - recvMhandle, request); - return ans; -} - -// We use a wrapper around the v8 init to copy over the struct contents -// post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v8->init(logfn)); - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; - ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; - ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; - ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; - ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; - ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; - ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; - ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; - ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; - ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; - ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; - ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; - ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; - ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; - ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; - return ncclSuccess; -} - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); - return nameList; -} - -static void* openNetPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char netPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - - snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } else { - snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } - return nullptr; -} - -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int netPluginRefCount; -static void* netPluginLib; - -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; - -#define MAX_PLUGIN_LOAD 2 - -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; - } - - netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (netPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. 
Using internal network plugin.", couldNotFindNames); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); - } - goto fail; - } - - ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); - if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); - ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); - if (ncclNet_v8 == nullptr) { - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; - } else { - ncclNets[0] = &ncclNet_v5_as_v9; - ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v9.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v6_as_v9; - ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v9.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v7_as_v9; - ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v7_as_v9.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v8_as_v9; - ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v8_as_v9.name = ncclNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); - } - - // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); - if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); - ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); - if (ncclCollNet_v8 == nullptr) { - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); - } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v9; - ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v9; - ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v9; - ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v8_as_v9; - ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); - } - - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -fail: - if (netPluginLib) dlclose(netPluginLib); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - dlclose(netPluginLib); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; - } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -} - -ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { - ncclNetProperties_t props; - - NCCLCHECK(net->getProperties(dev, &props)); - ncclNetDeviceType type = props.netDeviceType; - if (type) switch (type) { - case NCCL_NET_DEVICE_UNPACK: - if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { - INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", - props.netDeviceVersion); - return ncclSuccess; - } else { - WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", - props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); - return ncclInternalError; - } - default: - WARN("Unknown device code index %d \n", type); - return ncclInternalError; - } - - return ncclSuccess; -} - -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; - } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -static ncclResult_t 
collNetGetState(int i, enum ncclNetState* state) {
-  pthread_mutex_lock(&netLock);
-  if (ncclCollNetStates[i] == ncclNetStateInit) {
-    int ndev;
-    if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
-    else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
-    else ncclCollNetStates[i] = ncclNetStateEnabled;
-  }
-  *state = ncclCollNetStates[i];
-  pthread_mutex_unlock(&netLock);
-  return ncclSuccess;
-}
-
-ncclResult_t ncclNetInit(struct ncclComm* comm) {
-  // Initialize main communication network
-  const char* netName;
-  bool ok = false;
-
-  netName = comm->config.netName;
-  for (int i=0; i<3; i++) {
-    if (ncclNets[i] == nullptr) continue;
-    enum ncclNetState state;
-    NCCLCHECK(netGetState(i, &state));
-    if (state != ncclNetStateEnabled) continue;
-    if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
-    if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
-      // Mismatched device plugin version
-      continue;
-    }
-
-    comm->ncclNet = ncclNets[i];
-    ok = true;
-
-    if (ncclCollNets[i]) {
-      NCCLCHECK(collNetGetState(i, &state));
-      if (state == ncclNetStateEnabled) {
-        comm->ncclCollNet = ncclCollNets[i];
-      }
-    }
-    break;
-  }
-
-  if (!ok) {
-    WARN("Error: network %s not found.", netName ? netName : "");
-    return ncclInvalidUsage;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
-  comm->ncclNet = nullptr;
-  comm->ncclCollNet = nullptr;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
-  constexpr int GPU_BUF_SIZE = 2*1024*1024;
-#if CUDART_VERSION >= 11030
-  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
-  int driverVersion;
-  CUDACHECK(cudaDriverGetVersion(&driverVersion));
-  if (driverVersion >= 11030) {
-    int cudaDev, attr = 0;
-    CUDACHECK(cudaGetDevice(&cudaDev));
-    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
-    *gdrSupport = attr;
-    return ncclSuccess;
-  }
-#endif
-  static int gdrSupportMatrix[32] = {
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
-  if (gdrSupportMatrix[comm->cudaDev] == -1) {
-    int netDevs;
-    NCCLCHECK(comm->ncclNet->devices(&netDevs));
-    gdrSupportMatrix[comm->cudaDev] = 0;
-    for (int dev=0; dev<netDevs; dev++) {
-      // Find a net device which is GDR-capable
-      ncclNetProperties_t props;
-      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
-      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
-
-      // Allocate memory on the GPU and try to register it on the NIC.
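-      // The probe builds a loopback connection on this NIC (listen, then
-      // connect/accept against our own handle) and tries to register a CUDA
-      // buffer on both comms; success marks GDR as supported for this GPU.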
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - char* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); - - bool connected; - connected = false; - while (!connected) { - - // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { - goto cleanup2; - } - - if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); - - if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); - - connected = (rComm != NULL) && (sComm != NULL); - } - - NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); - NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); - gdrSupportMatrix[comm->cudaDev] = 1; - } - ncclDebugNoWarn = 0; - NCCLCHECK(ncclCudaFree(gpuPtr)); -cleanup2: - if (rComm != NULL) - NCCLCHECK(comm->ncclNet->closeRecv(rComm)); - if (sComm != NULL) - NCCLCHECK(comm->ncclNet->closeSend(sComm)); - NCCLCHECK(comm->ncclNet->closeListen(lComm)); -cleanup1: - break; - } - } - *gdrSupport = gdrSupportMatrix[comm->cudaDev]; - return ncclSuccess; -} - -int ncclNetVersion(struct ncclComm* comm) { - return - (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : - (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 : - 9; -} diff --git a/src/plugin/net.cc b/src/plugin/net.cc new file mode 100644 index 0000000..9257d77 --- /dev/null +++ b/src/plugin/net.cc @@ -0,0 +1,319 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "net.h"
+#include "bootstrap.h"
+#include "checks.h"
+#include "plugin.h"
+
+#include <string.h>
+#include <errno.h>
+//#include <sys/types.h>
+//#include <sys/stat.h>
+//#include <unistd.h>
+
+extern ncclNet_t* getNcclNet_v6(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v7(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v8(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v9(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v10(void* netPluginLib);
+
+extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib);
+
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
+ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
+static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 };
+ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
+enum ncclNetState {
+  ncclNetStateInit = 0,
+  ncclNetStateEnabled = 1,
+  ncclNetStateDisabled = 2
+};
+enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+
+NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1);
+static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER;
+static void* netPluginLib;
+
+static int netPluginRefCount;
+static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();}
+
+enum {
+  netPluginLoadFailed  = -1,
+  netPluginLoadReady   = 0,
+  netPluginLoadSuccess = 1,
+};
+
+static int netPluginStatus = netPluginLoadReady;
+
+ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
+  static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT;
+  pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce);
+
+  pthread_mutex_lock(&netPluginLock);
+  if (netPluginLoadFailed == netPluginStatus) {
+    goto exit;
+  }
+  if (netPluginLoadSuccess == netPluginStatus) {
+    ++netPluginRefCount;
+    goto exit;
+  }
+
+  netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN"));
+  if (netPluginLib == nullptr) {
+    goto fail;
+  }
+
+  ncclNets[0] = getNcclNet_v10(netPluginLib);
+  if (ncclNets[0]) ncclNetsVer[0] = 10;
+  if (ncclNets[0] == nullptr) {
+    // Try v9 plugin
+    ncclNets[0] = getNcclNet_v9(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 9;
+  }
+  if (ncclNets[0] == nullptr) {
+    // Try v8 plugin
+    ncclNets[0] = getNcclNet_v8(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 8;
+  }
+  if (ncclNets[0] == nullptr) {
+    // Try v7 plugin
+    ncclNets[0] = getNcclNet_v7(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 7;
+  }
+  if (ncclNets[0] == nullptr) {
+    // Try v6 plugin
+    ncclNets[0] = getNcclNet_v6(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 6;
+  }
+  if (ncclNets[0] == nullptr) {
+    goto fail;
+  }
+
+  // Check for CollNet
+  ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v9(netPluginLib);
+  }
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v8(netPluginLib);
+  }
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v7(netPluginLib);
+  }
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = 
getNcclCollNet_v6(netPluginLib); + } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + } + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +} + +ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { + ncclNetProperties_t props; + + NCCLCHECK(net->getProperties(dev, &props)); + ncclNetDeviceType type = props.netDeviceType; + if (type) switch (type) { + case NCCL_NET_DEVICE_UNPACK: + if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { + INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; + } else { + WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", + props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); + return ncclInternalError; + } + default: + WARN("Unknown device code index %d \n", type); + return ncclInternalError; + } + + return ncclSuccess; +} + +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + // Initialize main communication network + const char* netName; + bool ok = false; + + netName = comm->config.netName; + for (int i=0; i<3; i++) { + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { + // Mismatched device plugin version + continue; + } + + comm->ncclNet = ncclNets[i]; + comm->ncclNetVer = 
ncclNetsVer[i];
+    ok = true;
+
+    if (ncclCollNets[i]) {
+      NCCLCHECK(collNetGetState(i, &state));
+      if (state == ncclNetStateEnabled) {
+        comm->ncclCollNet = ncclCollNets[i];
+      }
+    }
+    break;
+  }
+
+  if (!ok) {
+    WARN("Error: network %s not found.", netName ? netName : "");
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
+  comm->ncclNet = nullptr;
+  comm->ncclCollNet = nullptr;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
+  constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
+  int driverVersion;
+  CUDACHECK(cudaDriverGetVersion(&driverVersion));
+  if (driverVersion >= 11030) {
+    int cudaDev, attr = 0;
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+    *gdrSupport = attr;
+    return ncclSuccess;
+  }
+#endif
+  static int gdrSupportMatrix[32] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  if (gdrSupportMatrix[comm->cudaDev] == -1) {
+    int netDevs;
+    NCCLCHECK(comm->ncclNet->devices(&netDevs));
+    gdrSupportMatrix[comm->cudaDev] = 0;
+    for (int dev=0; dev<netDevs; dev++) {
+      // Find a net device which is GDR-capable
+      ncclNetProperties_t props;
+      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
+      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+      // Allocate memory on the GPU and try to register it on the NIC.
+      void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+      ncclNetHandle_t handle;
+      char* gpuPtr = NULL;
+      void* mHandle = NULL;
+      ncclResult_t ret;
+      ncclDebugNoWarn = NCCL_NET;
+      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
+
+      bool connected;
+      connected = false;
+      while (!connected) {
+
+        // If we're aborting now, skip to cleanup
+        if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) {
+          goto cleanup2;
+        }
+
+        if (sComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2);
+
+        if (rComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
+
+        connected = (rComm != NULL) && (sComm != NULL);
+      }
+
+      NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
+      if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+        NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
+        NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+        NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+        gdrSupportMatrix[comm->cudaDev] = 1;
+      }
+      ncclDebugNoWarn = 0;
+      NCCLCHECK(ncclCudaFree(gpuPtr));
+cleanup2:
+      if (rComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeRecv(rComm));
+      if (sComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeSend(sComm));
+      NCCLCHECK(comm->ncclNet->closeListen(lComm));
+cleanup1:
+      break;
+    }
+  }
+  *gdrSupport = gdrSupportMatrix[comm->cudaDev];
+  return ncclSuccess;
+}
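For reference, the `getNcclNet_vX` probes above succeed as soon as the library exports the matching symbol. A minimal sketch of a v10 export follows; the `example` name and functions are hypothetical, and a real plugin must fill in the complete `ncclNet_v10_t` function table rather than leave entries NULL:

```
/* Hypothetical plugin sketch, not part of this patch. */
#include "nccl_net.h"

static ncclResult_t exampleInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  return ncclSuccess;
}
static ncclResult_t exampleDevices(int* ndev) { *ndev = 1; return ncclSuccess; }

ncclNet_v10_t ncclNetPlugin_v10 = {
  .name = "example",
  .init = exampleInit,
  .devices = exampleDevices,
  /* remaining entries omitted in this sketch; a real plugin must set them */
};
```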
diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc
new file mode 100644
index 0000000..682f239
--- /dev/null
+++ b/src/plugin/net/net_v10.cc
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+
+static ncclNet_v10_t* ncclNet_v10;
+static ncclCollNet_v10_t* ncclCollNet_v10;
+
+ncclNet_t* getNcclNet_v10(void* lib) {
+  ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10");
+  if (ncclNet_v10) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name);
+    return ncclNet_v10;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol.");
+  return nullptr;
+}
+
+ncclCollNet_t* getNcclCollNet_v10(void* lib) {
+  ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10");
+  if (ncclCollNet_v10) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name);
+    return ncclCollNet_v10;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc
new file mode 100644
index 0000000..baff679
--- /dev/null
+++ b/src/plugin/net/net_v6.cc
@@ -0,0 +1,178 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v6_t* ncclNet_v6;
+static ncclCollNet_v6_t* ncclCollNet_v6;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+  return ncclNet_v6->connect(dev, handle, sendComm);
+}
+
+static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+  return ncclNet_v6->accept(listenComm, recvComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
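+// The isend/irecv wrappers accept the extra pHandle/pHandles arguments of the
+// current API but drop them when calling into the v6 plugin, which has no
+// profiler support (just as the proffn argument to ncclNet_init goes unused).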
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+      sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v6->init(logfn));
+  ncclNet.devices = ncclNet_v6->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v6->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_accept;
+  ncclNet.regMr = ncclNet_regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v6->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v6->iflush;
+  ncclNet.test = ncclNet_v6->test;
+  ncclNet.closeSend = ncclNet_v6->closeSend;
+  ncclNet.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet.closeListen = ncclNet_v6->closeListen;
+  ncclNet.getDeviceMr = NULL;
+  ncclNet.irecvConsumed = NULL;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v6(void* lib) {
+  ncclNet_v6 = (ncclNet_v6_t*)dlsym(lib, "ncclNetPlugin_v6");
+  if (ncclNet_v6) {
+    ncclNet.name = ncclNet_v6->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
+  return nullptr;
+}
+
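+// As with the net struct, the collnet function table is only filled in after
+// the v6 plugin's init has run, since its struct contents may not be valid
+// before then. iallgather/ireducescatter stay nullptr because they postdate v6.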
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v6->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v6->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v6->listen;
+  ncclCollNet.connect = ncclCollNet_v6->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet.test = ncclCollNet_v6->test;
+  ncclCollNet.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v6->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v6(void* lib) {
+  ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(lib, "ncclCollNetPlugin_v6");
+  if (ncclCollNet_v6) {
+    ncclCollNet.name = ncclCollNet_v6->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc
new file mode 100644
index 0000000..4bad5ec
--- /dev/null
+++ b/src/plugin/net/net_v7.cc
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v7_t* ncclNet_v7;
+static ncclCollNet_v7_t* ncclCollNet_v7;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v7_t p7;
+  ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
+  if (ans != ncclSuccess) return ans;
+  props->name = p7.name;
+  props->pciPath = p7.pciPath;
+  props->guid = p7.guid;
+  props->ptrSupport = p7.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p7.speed;
+  props->port = p7.port;
+  props->maxComms = p7.maxComms;
+  props->maxRecvs = p7.maxRecvs;
+  props->latency = p7.latency;
+  props->netDeviceType = p7.netDeviceType;
+  props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+ 
sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v7_t p7; + ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); + if (ans != ncclSuccess) return ans; + props->name = p7.name; + props->pciPath = p7.pciPath; + props->guid = p7.guid; + props->ptrSupport = p7.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p7.speed; + props->port = p7.port; + props->maxComms = p7.maxComms; + props->maxRecvs = p7.maxRecvs; + props->latency = p7.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v7->init(logfn)); + ncclNet.devices = ncclNet_v7->devices; + ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties; + ncclNet.listen = ncclNet_v7->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v7->accept; + ncclNet.regMr = ncclNet_regMr; + ncclNet.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v7->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v7->iflush; + ncclNet.test = ncclNet_v7->test; + ncclNet.closeSend = ncclNet_v7->closeSend; + ncclNet.closeRecv = ncclNet_v7->closeRecv; + ncclNet.closeListen = ncclNet_v7->closeListen; + ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v7(void* lib) { + ncclNet_v7 = (ncclNet_v7_t*)dlsym(lib, "ncclNetPlugin_v7"); + if (ncclNet_v7) { + ncclNet.name = ncclNet_v7->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v7->init(logfn)); + ncclCollNet.devices = ncclCollNet_v7->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v7->listen; + ncclCollNet.connect = ncclCollNet_v7->connect; + ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport; + ncclCollNet.regMr = ncclCollNet_regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v7->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + 
ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet.test = ncclCollNet_v7->test;
+  ncclCollNet.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v7(void* lib) {
+  ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(lib, "ncclCollNetPlugin_v7");
+  if (ncclCollNet_v7) {
+    ncclCollNet.name = ncclCollNet_v7->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc
new file mode 100644
index 0000000..b43bb89
--- /dev/null
+++ b/src/plugin/net/net_v8.cc
@@ -0,0 +1,196 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v8_t* ncclNet_v8;
+static ncclCollNet_v8_t* ncclCollNet_v8;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+ 
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+      sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    void* sendMhandle, void** request) {
+  ncclNetSGE_v8_t recvPartsInt;
+  if (nRecvParts > 1) return ncclInternalError;
+  if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError;
+  recvPartsInt.mhandle = recvParts->mhandle;
+  recvPartsInt.address = recvParts->address;
+  recvPartsInt.size = (int)recvParts->size;
+  ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt,
+      bytesPerRank, windowOffset, windowBytes,
+      sendMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    ncclDataType_t dataType, ncclRedOp_t redOp,
+    void* recvMhandle, void** request) {
+  ncclNetSGE_v8_t sendPartsInt;
+  if (nSendParts > 1) return ncclInternalError;
+  if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError;
+  sendPartsInt.mhandle = sendParts->mhandle;
+  sendPartsInt.address = sendParts->address;
+  sendPartsInt.size = (int)sendParts->size;
+  ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt,
+      recvData, bytesPerRank, windowOffset, windowBytes,
+      dataType, redOp,
+      recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v8->init(logfn));
+  ncclNet.devices = ncclNet_v8->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v8->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_v8->accept;
+  ncclNet.regMr = ncclNet_v8->regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v8->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v8->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v8->iflush;
+  ncclNet.test = ncclNet_v8->test;
+  ncclNet.closeSend = ncclNet_v8->closeSend;
+  ncclNet.closeRecv = ncclNet_v8->closeRecv;
+  ncclNet.closeListen = ncclNet_v8->closeListen;
+  ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr;
+  ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v8(void* lib) {
+  ncclNet_v8 = (ncclNet_v8_t*)dlsym(lib, "ncclNetPlugin_v8");
+  if (ncclNet_v8) {
+    ncclNet.name = ncclNet_v8->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v8->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v8->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v8->listen;
+  ncclCollNet.connect = ncclCollNet_v8->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_v8->regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v8->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = ncclCollNet_iallgather;
+  ncclCollNet.ireducescatter = ncclCollNet_ireducescatter;
+  ncclCollNet.iflush = ncclCollNet_v8->iflush;
+  ncclCollNet.test = ncclCollNet_v8->test;
+  ncclCollNet.closeColl = ncclCollNet_v8->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v8->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v8(void* lib) {
+  ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(lib, "ncclCollNetPlugin_v8");
+  if (ncclCollNet_v8) {
+    ncclCollNet.name = ncclCollNet_v8->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
+  return nullptr;
+}
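The v8 shims above must narrow NCCL's 64-bit sizes into the `int` fields of the v8 structures, which is why each call is guarded with `MAX_NET_SIZE`/`MAX_COLLNET_SIZE` before casting. The same guard can be expressed as a small reusable helper; this is an illustrative sketch only (the helper name is made up and does not exist in the NCCL sources):

```
#include <limits.h>
// Refuse to truncate: fail loudly instead of corrupting a transfer larger than INT_MAX.
static inline ncclResult_t narrowSizeToInt(size_t in, int* out) {
  if (in > (size_t)INT_MAX) return ncclInternalError;
  *out = (int)in;
  return ncclSuccess;
}
```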
diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc
new file mode 100644
index 0000000..34e0393
--- /dev/null
+++ b/src/plugin/net/net_v9.cc
@@ -0,0 +1,121 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v9_t* ncclNet_v9;
+static ncclCollNet_v9_t* ncclCollNet_v9;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request);
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request);
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) {
+  return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props);
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props);
+}
+
+static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    void* sendMhandle, void** request) {
+  return ncclCollNet_v9->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v9_t*)recvParts, bytesPerRank,
+      windowOffset, windowBytes, sendMhandle, request);
+}
+
+static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData,
+    size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+    ncclDataType_t dataType, ncclRedOp_t redOp,
+    void* recvMhandle, void** request) {
+  return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank,
+      windowOffset, windowBytes, dataType, redOp, recvMhandle, request);
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v9->init(logfn));
+  ncclNet.devices = ncclNet_v9->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v9->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_v9->accept;
+  ncclNet.regMr = ncclNet_v9->regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v9->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v9->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v9->iflush;
+  ncclNet.test = ncclNet_v9->test;
+  ncclNet.closeSend = ncclNet_v9->closeSend;
+  ncclNet.closeRecv = ncclNet_v9->closeRecv;
+  ncclNet.closeListen = ncclNet_v9->closeListen;
+  ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr;
+  ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed;
+  ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? ncclNet_makeVDevice : nullptr;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v9(void* lib) {
+  ncclNet_v9 = (ncclNet_v9_t*)dlsym(lib, "ncclNetPlugin_v9");
+  if (ncclNet_v9) {
+    ncclNet.name = ncclNet_v9->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v9->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v9->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v9->listen;
+  ncclCollNet.connect = ncclCollNet_v9->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_v9->regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v9->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v9->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_v9->iallreduce;
+  ncclCollNet.iallgather = ncclCollNet_iallgather;
+  ncclCollNet.ireducescatter = ncclCollNet_ireducescatter;
+  ncclCollNet.iflush = ncclCollNet_v9->iflush;
+  ncclCollNet.test = ncclCollNet_v9->test;
+  ncclCollNet.closeColl = ncclCollNet_v9->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v9->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v9(void* lib) {
+  ncclCollNet_v9 = (ncclCollNet_v9_t*)dlsym(lib, "ncclCollNetPlugin_v9");
+  if (ncclCollNet_v9) {
+    ncclCollNet.name = ncclCollNet_v9->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol.");
+  return nullptr;
+}
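NCCL probes for the newest entry point first and only falls back to older ones, so a plugin that exports several versioned symbols is always driven through the richest ABI it supports. A sketch of that probing order, using only the two getters defined in this patch (the real loader also tries newer revisions, v10 in this API, before these):

```
extern ncclNet_t* getNcclNet_v9(void* lib); // from net_v9.cc above
extern ncclNet_t* getNcclNet_v8(void* lib); // from net_v8.cc above

// Hypothetical glue, not part of the patch: newest version wins.
static ncclNet_t* pickNewestNet(void* lib) {
  ncclNet_t* net = getNcclNet_v9(lib);
  if (net == NULL) net = getNcclNet_v8(lib);
  return net;
}
```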
diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc
new file mode 100644
index 0000000..a43df28
--- /dev/null
+++ b/src/plugin/plugin_open.cc
@@ -0,0 +1,134 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+
+#include "debug.h"
+
+#define MAX_STR_LEN 255
+
+enum ncclPluginType {
+  ncclPluginTypeNet,
+  ncclPluginTypeTuner,
+  ncclPluginTypeProfiler,
+};
+
+#define NUM_LIBS 3
+static void *libHandles[NUM_LIBS];
+static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
+static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" };
+static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
+static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
+
+static void* tryOpenLib(char* name, int* err, char* errStr) {
+  *err = 0;
+  if (nullptr == name || strlen(name) == 0) {
+    return nullptr;
+  }
+
+  if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
+    name = nullptr;
+  }
+
+  void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
+  if (nullptr == handle) {
+    strncpy(errStr, dlerror(), MAX_STR_LEN);
+    errStr[MAX_STR_LEN] = '\0';
+    // "handle" and "name" won't be NULL at the same time.
+    // coverity[var_deref_model]
+    if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
+      *err = ENOENT;
+    }
+  }
+  return handle;
+}
+
+static void appendNameToList(char* nameList, int *nameListLen, char* name) {
+  snprintf(nameList, *nameListLen, " %s", name);
+  nameList += strlen(name) + 1;
+  *nameListLen -= strlen(name) + 1;
+}
+
+static void* openPluginLib(enum ncclPluginType type, const char* libName) {
+  int openErr, len = PATH_MAX;
+  char libName_[MAX_STR_LEN] = { 0 };
+  char openErrStr[MAX_STR_LEN + 1] = { 0 };
+  char eNoEntNameList[PATH_MAX] = { 0 };
+
+  if (libName && strlen(libName)) {
+    snprintf(libName_, MAX_STR_LEN, "%s", libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+
+    snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+  } else {
+    snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
+    } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+  }
+
+  if (strlen(eNoEntNameList)) {
+    INFO(subsys[type], "%s/Plugin: Could not find:%s. %s", pluginNames[type], eNoEntNameList, pluginFallback[type]);
+  } else if (strlen(pluginFallback[type])) {
+    INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]);
+  }
+  return nullptr;
+}
+
+void* ncclOpenNetPluginLib(const char* name) {
+  return openPluginLib(ncclPluginTypeNet, name);
+}
+
+void* ncclOpenTunerPluginLib(const char* name) {
+  return openPluginLib(ncclPluginTypeTuner, name);
+}
+
+void* ncclOpenProfilerPluginLib(const char* name) {
+  return openPluginLib(ncclPluginTypeProfiler, name);
+}
+
+void* ncclGetNetPluginLib(void) {
+  return libHandles[ncclPluginTypeNet];
+}
+
+ncclResult_t ncclClosePluginLib(void* handle) {
+  for (int l=0; l<NUM_LIBS; l++) {
+    if (libHandles[l] == handle) {
+      libHandles[l] = nullptr;
+    }
+  }
+  dlclose(handle);
+  return ncclSuccess;
+}
diff --git a/src/plugin/profiler.cc b/src/plugin/profiler.cc
--- a/src/plugin/profiler.cc
+++ b/src/plugin/profiler.cc
@@ -72,120 +24,6 @@
-static ncclResult_t ncclProfiler_v1_as_v2_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) {
-  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
-  eDescr_v1.type = eDescr->type;
-  eDescr_v1.parentObj = eDescr->parentObj;
-  eDescr_v1.rank = eDescr->rank;
-  switch(eDescr->type) {
-    case ncclProfileGroup: break;
-    case ncclProfileColl: {
-      eDescr_v1.coll.name = eDescr->coll.name;
-      eDescr_v1.coll.commHash = eDescr->coll.commHash;
-      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
-      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
-      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
-      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
-      eDescr_v1.coll.count = eDescr->coll.count;
-      eDescr_v1.coll.root = eDescr->coll.root;
-      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
-      eDescr_v1.coll.op = 0; // removed in v2
-      eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes;
-      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
-      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
-      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
-      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
-    } break;
-    case ncclProfileP2p: {
-      eDescr_v1.p2p.name = eDescr->p2p.name;
-      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
-      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
-      eDescr_v1.p2p.buff = eDescr->p2p.buff;
-      eDescr_v1.p2p.count = eDescr->p2p.count;
-      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype);
-      eDescr_v1.p2p.peer = eDescr->p2p.peer;
-    } break;
-    case ncclProfileProxyOp: {
-      eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid;
-      eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId;
-      eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer;
-      eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps;
-      eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
-      eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend;
-    } break;
-    case ncclProfileProxyStep: {
-      eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
-    } break;
-    case ncclProfileProxyCtrl: break;
-    default:;
-  }
-  return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
-}
-
-static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) {
-  ncclProfiler_v1->init(context, eActivationMask);
-  ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent;
-  ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent;
-  ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState;
-  ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize;
-  return ncclSuccess;
-}
 
 #define MAX_STR_LEN 256
 
-static void* tryOpenLib(char* name, int *err, char* errStr) {
-  if (nullptr == name || strlen(name) == 0) {
-    return nullptr;
-  }
-
-  if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
-    name = nullptr;
-  }
-
-  void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
-  if (nullptr == handle) {
-    strncpy(errStr, dlerror(), MAX_STR_LEN);
-    errStr[MAX_STR_LEN] = 0;
-    if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
-      *err = ENOENT;
-    }
-  }
-
-  return handle;
-}
-
-static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
-  if (openErr == ENOENT) {
-    snprintf(nameList, *nameListLen, " %s", name);
-    nameList += strlen(name) + 1;
-    *nameListLen -= strlen(name) + 1;
-    return nameList;
-  }
-  INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr);
-  return nameList;
-}
-
-static void* openProfilerPluginLib(char* couldNotFindNames, int len) {
-  int openErr;
-  void *pluginLib;
-  char profilerPluginLibName[PATH_MAX];
-  char openErrStr[MAX_STR_LEN + 1] = { 0 };
-
-  const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN");
-  if (envProfilerPluginName && strlen(envProfilerPluginName)) {
-    snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName);
-    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
-      return pluginLib;
-    }
-
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
-    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
-  } else {
-    snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so");
-    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
-  }
-
-  return nullptr;
-}
-
 enum {
   profilerPluginLoadFailed = -1,
   profilerPluginLoadReady = 0,
@@ -195,43 +33,31 @@ enum {
 static int profilerPluginStatus = profilerPluginLoadReady;
 static pid_t pid;
 
-#define MAX_PLUGIN_LOAD 2
-
 static ncclResult_t ncclProfilerPluginLoad(void) {
   if (profilerPluginLoadFailed == profilerPluginStatus) {
     return ncclSuccess;
   }
 
-  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
   pthread_mutex_lock(&profilerLock);
   if (profilerPluginLoadSuccess == profilerPluginStatus) {
     ++profilerPluginRefCount;
     goto exit;
   }
 
-  profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
+  profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN"));
   if (profilerPluginLib == nullptr) {
-    if (strlen(couldNotFindNames)) {
-      INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames);
-    }
    goto fail;
   }
 
-  ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2");
+  ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
   if (ncclProfiler == nullptr) {
-    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2.");
-    ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1");
-    if (ncclProfiler_v1 == nullptr) {
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1.");
-      goto fail;
-    } else {
-      ncclProfiler = &ncclProfiler_v1_as_v2;
-      ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name;
-      ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init;
-      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1.");
-    }
-  } else {
-    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2.");
+    ncclProfiler = getNcclProfiler_v2(profilerPluginLib);
+  }
+  if (ncclProfiler == NULL) {
+    ncclProfiler = getNcclProfiler_v1(profilerPluginLib);
+  }
+  if (ncclProfiler == NULL) {
+    goto fail;
   }
 
   ++profilerPluginRefCount;
@@ -247,7 +73,7 @@ exit:
   pthread_mutex_unlock(&profilerLock);
   return ncclSuccess;
 fail:
-  if (profilerPluginLib) dlclose(profilerPluginLib);
+  if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib));
   profilerPluginStatus = profilerPluginLoadFailed;
   goto exit;
 }
@@ -256,7 +82,7 @@ static ncclResult_t ncclProfilerPluginUnload(void) {
   pthread_mutex_lock(&profilerLock);
   if (0 == (--profilerPluginRefCount)) {
     INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name);
-    dlclose(profilerPluginLib);
+    NCCLCHECK(ncclClosePluginLib(profilerPluginLib));
     profilerPluginLib = nullptr;
     ncclProfiler = nullptr;
     profilerPluginStatus = profilerPluginLoadReady;
@@ -269,6 +95,11 @@ static ncclResult_t ncclProfilerPluginUnload(void) {
 #include "timer.h"
 
 #if ENABLE_TIMER
+// These counters are used to measure profiler overheads for different parts of the code.
+// These counters are only useful/meaningful in controlled test environments where there
+// is only one thread updating each set of counters, i.e., every communicator has its
+// own proxy thread and the network uses only one thread to make progress (this is true
+// for net_ib plugin but might not be true for net_socket plugin).
 static int64_t elapsedCount;
 static int64_t initCount, finalizeCount;
 static int64_t groupStartCount, groupStopCount;
@@ -324,15 +155,14 @@ static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2];
 #endif
 
-static int eActivationMask;      // Set by profiler
-static int eActivationMaskGroup; // Cached for current group
+int ncclProfilerEventMask;       // Set by profiler
 
 ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
   TIME_START_EVENT(elapsed);
   TIME_START_EVENT(init);
   ncclProfilerPluginLoad();
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask);
+    int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask);
     if (err) {
       WARN("Profiler init failed with error (%d). Continue without profiler.", err);
       ncclProfiler = NULL;
@@ -356,9 +186,29 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) {
 
 ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
   TIME_START_EVENT(groupStart);
-  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) {
+    // Check if any collective in the plan has a set event activation mask
+    struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+    struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+    int eActivationMask_ = 0;
+    while (ct) {
+      if (ct->eActivationMask) {
+        eActivationMask_ = ct->eActivationMask;
+        goto startGroup;
+      }
+      ct = ct->next;
+    }
+    // Check if any point-to-point task in the plan has a set event activation mask
+    while (pt) {
+      if (pt->eActivationMask) {
+        eActivationMask_ = pt->eActivationMask;
+        goto startGroup;
+      }
+      pt = pt->next;
+    }
+
+startGroup:
+    if (eActivationMask_ & (ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin)) {
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileGroup;
       ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr);
@@ -379,52 +229,63 @@ ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) {
 
 ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
   TIME_START_EVENT(taskStart);
-  if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);
-    if (plan->groupEventHandle && enable) {
-      struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
-      while (ct) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileColl;
-        eDescr.parentObj = plan->groupEventHandle;
-        eDescr.rank = plan->comm->rank;
-        eDescr.coll.name = plan->comm->commName;
-        eDescr.coll.commHash = plan->comm->commHash;
-        eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++;
-        eDescr.coll.func = ncclFuncToString(ct->func);
-        eDescr.coll.sendBuff = ct->sendbuff;
-        eDescr.coll.recvBuff = ct->recvbuff;
-        eDescr.coll.count = ct->count;
-        eDescr.coll.root = ct->root;
-        eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
-        eDescr.coll.trafficBytes = ct->trafficBytes;
-        eDescr.coll.nMaxChannels = ct->nMaxChannels;
-        eDescr.coll.nWarps = ct->nWarps;
-        eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
-        eDescr.coll.proto = ncclProtoToString(ct->protocol);
-        ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);
-
-        // update collective task with group event activation mask
-        ct->eActivationMask = eActivationMaskGroup;
-        ct = ct->next;
+  struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+  while (ct) {
+    if (__builtin_expect(ncclProfiler != NULL, 0)) {
+      if (plan->groupEventHandle) {
+        int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin);
+        if (enable) {
+          ncclProfilerEventDescr_t eDescr = { 0 };
+          eDescr.type = ncclProfileColl;
+          eDescr.parentObj = plan->groupEventHandle;
+          eDescr.rank = plan->comm->rank;
+          eDescr.coll.name = plan->comm->commName;
+          eDescr.coll.commHash = plan->comm->commHash;
+          eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func];
+          eDescr.coll.func = ncclFuncToString(ct->func);
+          eDescr.coll.sendBuff = ct->sendbuff;
+          eDescr.coll.recvBuff = ct->recvbuff;
+          eDescr.coll.count = ct->count;
+          eDescr.coll.root = ct->root;
+          eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
+          eDescr.coll.nMaxChannels = ct->nMaxChannels;
+          eDescr.coll.nWarps = ct->nWarps;
+          eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
+          eDescr.coll.proto = ncclProtoToString(ct->protocol);
+          ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);
+        }
       }
+    }
+    // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well.
+    // The test for "persistent" is a workaround for graph-captured collectives. In their case this function may not be
+    // consistently invoked on all the ranks, which would lead to mismatched counter values and thus false-positive
+    // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is
+    // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which
+    // gives the consistency.
+    if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle &&
+                              (ct->eActivationMask & ncclProfileKernelCh)))
+      plan->comm->seqNumber[ct->func]++;
+    ct = ct->next;
+  }
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (plan->groupEventHandle) {
       struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
       while (pt) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileP2p;
-        eDescr.parentObj = plan->groupEventHandle;
-        eDescr.rank = plan->comm->rank;
-        eDescr.p2p.name = plan->comm->commName;
-        eDescr.p2p.commHash = plan->comm->commHash;
-        eDescr.p2p.func = ncclFuncToString(pt->func);
-        eDescr.p2p.buff = pt->buff;
-        eDescr.p2p.count = pt->count;
-        eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
-        eDescr.p2p.peer = pt->root;
-        ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
-
-        // update collective task with group event activation mask
-        pt->eActivationMask = eActivationMaskGroup;
+        int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh);
+        if (enable) {
+          ncclProfilerEventDescr_t eDescr = { 0 };
+          eDescr.type = ncclProfileP2p;
+          eDescr.parentObj = plan->groupEventHandle;
+          eDescr.rank = plan->comm->rank;
+          eDescr.p2p.name = plan->comm->commName;
+          eDescr.p2p.commHash = plan->comm->commHash;
+          eDescr.p2p.func = ncclFuncToString(pt->func);
+          eDescr.p2p.buff = pt->buff;
+          eDescr.p2p.count = pt->count;
+          eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
+          eDescr.p2p.peer = pt->root;
+          ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
+        }
         pt = pt->next;
       }
     }
@@ -436,16 +297,15 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
   TIME_START_EVENT(taskStop);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);
-    if (plan->groupEventHandle && enable) {
+    if (plan->groupEventHandle) {
       struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
       while (ct) {
-        ncclProfiler->stopEvent(ct->eventHandle);
+        if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle);
         ct = ct->next;
       }
       struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
       while (pt) {
-        ncclProfiler->stopEvent(pt->eventHandle);
+        if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle);
         pt = pt->next;
       }
     }
@@ -463,7 +323,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
   TIME_START_EVENT(proxyOpStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) {
+    if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyOp;
       eDescr.parentObj = sub->taskEventHandle;
@@ -485,7 +345,7 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args
   TIME_START_EVENT(proxyOpStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) {
+    if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
      ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyOp;
       eDescr.parentObj = sub->taskEventHandle;
@@ -518,7 +378,7 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar
   TIME_START_EVENT(proxyStepStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
+    if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) {
       int step_ = DIVUP(stepId, args->sliceSteps);
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyStep;
@@ -536,7 +396,7 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar
   TIME_START_EVENT(proxyStepStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
+    if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) {
       int step_ = DIVUP(stepId, args->sliceSteps);
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyStep;
@@ -568,7 +428,7 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand
   TIME_START_EVENT(proxyCtrlStart);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
     // for proxy control events we allow profiling mode to change on a per event basis
-    int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
+    int eActivationMaskProxy = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
     if (eActivationMaskProxy & ncclProfileProxyCtrl) {
       ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileProxyCtrl;
@@ -591,6 +451,30 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    struct ncclProxySubArgs* sub = &args->subs[s];
+    if (sub->eActivationMask & ncclProfileKernelCh) {
+      ncclProfilerEventDescr_t eDescr = { };
+      eDescr.type = ncclProfileKernelCh;
+      eDescr.parentObj = sub->taskEventHandle;
+      eDescr.kernelCh.channelId = sub->channelId;
+      ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr);
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) {
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    struct ncclProxySubArgs* sub = &args->subs[s];
+    if (sub->kernelEventHandle) {
+      ncclProfiler->stopEvent(sub->kernelEventHandle);
+    }
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
   TIME_START_EVENT(proxyOpRecord);
   struct ncclProxySubArgs* sub = &args->subs[s];
@@ -619,7 +503,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs*
 
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
   TIME_START_EVENT(proxyCtrlRecord);
-  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
+  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
     ncclProfilerEventStateArgs_t args = { };
     args.proxyCtrl.appendedProxyOps = appended;
     ncclProfiler->recordEventState(eHandle, eState, &args);
@@ -632,3 +516,47 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) {
   op->pid = pid;
   return ncclSuccess;
 }
+
+static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER;
+
+static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) {
+  ncclResult_t ret = ncclSuccess;
+  pthread_mutex_lock(&proxyProfilerConnectLock);
+  if (comm->profiler.initialized) goto exit;
+  for (int c = 0; c < MAXCHANNELS; c++) {
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit);
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.sendProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit);
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.recvProxyConn[c]), ret, exit);
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.recvProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit);
+  }
+  comm->profiler.initialized = true;
+exit:
+  pthread_mutex_unlock(&proxyProfilerConnectLock);
+  return ret;
+}
+
+bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) {
+  bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh));
+  if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op);
+  return enabled;
+}
+
+ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) {
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle;
+    if (type == 0) { // start
+      if (sub->eActivationMask & ncclProfileNetPlugin) {
+        ncclProfilerEventDescr_t eDescr = { 0 };
+        eDescr.type = ncclProfileNetPlugin;
+        eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS];
+        eDescr.rank = sub->rank;
+        eDescr.netPlugin.id = pluginId;
+        eDescr.netPlugin.data = extData;
+        ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr);
+      }
+    } else { // stop
+      ncclProfiler->stopEvent(*eHandle);
+    }
+  }
+  return ncclSuccess;
+}
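The chain above (`getNcclProfiler_v3` → `getNcclProfiler_v2` → `getNcclProfiler_v1`) is the whole version-negotiation story: each getter either returns a ready-to-use `ncclProfiler_t` or `NULL`. A condensed sketch of the same fallback as a standalone helper (hypothetical, merely mirroring what `ncclProfilerPluginLoad` above does, and relying on the declarations this patch introduces):

```
#include <stdlib.h>
// Hypothetical driver: open the library named by NCCL_PROFILER_PLUGIN and
// take the newest profiler ABI the plugin exports.
static ncclProfiler_t* loadAnyProfiler(void) {
  void* lib = ncclOpenProfilerPluginLib(getenv("NCCL_PROFILER_PLUGIN"));
  if (lib == NULL) return NULL;
  ncclProfiler_t* p = getNcclProfiler_v3(lib);
  if (p == NULL) p = getNcclProfiler_v2(lib);
  if (p == NULL) p = getNcclProfiler_v1(lib);
  return p;
}
```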
diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc
new file mode 100644
index 0000000..1397429
--- /dev/null
+++ b/src/plugin/profiler/profiler_v1.cc
@@ -0,0 +1,133 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+#include "checks.h"
+
+static ncclProfiler_t ncclProfiler;
+static ncclProfiler_v1_t* ncclProfiler_v1;
+
+static uint8_t ncclStringToFunc(const char* func) {
+  if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather;
+  if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce;
+  if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast;
+  if (0 == strcmp(func, "Recv")) return ncclFuncRecv;
+  if (0 == strcmp(func, "Reduce")) return ncclFuncReduce;
+  if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter;
+  if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv;
+  return ncclFuncSend;
+}
+
+static uint8_t ncclStringToAlgo(const char* algo) {
+  if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE;
+  if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING;
+  if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT;
+  if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN;
+  if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS;
+  if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE;
+  return NCCL_ALGO_PAT;
+}
+
+static uint8_t ncclStringToProto(const char* proto) {
+  if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL;
+  if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128;
+  return NCCL_PROTO_SIMPLE;
+}
+
+static uint8_t ncclStringToDatatype(const char* dt) {
+  if (0 == strcmp(dt, "ncclInt8")) return ncclInt8;
+  if (0 == strcmp(dt, "ncclInt32")) return ncclInt32;
+  if (0 == strcmp(dt, "ncclUint32")) return ncclUint32;
+  if (0 == strcmp(dt, "ncclInt64")) return ncclInt64;
+  if (0 == strcmp(dt, "ncclUint64")) return ncclUint64;
+  if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16;
+  if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16;
+#endif
+  return ncclFloat64;
+}
+
+static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
+  eDescr_v1.type = eDescr->type;
+  eDescr_v1.parentObj = eDescr->parentObj;
+  eDescr_v1.rank = eDescr->rank;
+  switch(eDescr->type) {
+    case ncclProfileGroup: break;
+    case ncclProfileColl: {
+      eDescr_v1.coll.name = eDescr->coll.name;
+      eDescr_v1.coll.commHash = eDescr->coll.commHash;
+      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
+      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
+      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
+      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
+      eDescr_v1.coll.count = eDescr->coll.count;
+      eDescr_v1.coll.root = eDescr->coll.root;
+      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
+      eDescr_v1.coll.op = 0; // removed in v2
+      eDescr_v1.coll.trafficBytes = 0; // removed in v3
+      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
+      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
+      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
+      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
+    } break;
+    case ncclProfileP2p: {
+      eDescr_v1.p2p.name = eDescr->p2p.name;
+      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
+      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
+      eDescr_v1.p2p.buff = eDescr->p2p.buff;
+      eDescr_v1.p2p.count = eDescr->p2p.count;
+      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype);
+      eDescr_v1.p2p.peer = eDescr->p2p.peer;
+    } break;
+    case ncclProfileProxyOp: {
+      eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid;
+      eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId;
+      eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer;
+      eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps;
+      eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
+      eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend;
+    } break;
+    case ncclProfileProxyStep: {
+      eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
+    } break;
+    case ncclProfileProxyCtrl: break;
+    case ncclProfileKernelCh:
+    case ncclProfileNetPlugin: {
+      *eHandle = NULL;
+      return ncclSuccess;
+    }
+    default:;
+  }
+  return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
+}
+
+static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
+  return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs);
+}
+
+static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
+  NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask));
+  ncclProfiler.startEvent = ncclProfiler_startEvent;
+  ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent;
+  ncclProfiler.recordEventState = ncclProfiler_recordEventState;
+  ncclProfiler.finalize = ncclProfiler_v1->finalize;
+  return ncclSuccess;
+}
+
+ncclProfiler_t* getNcclProfiler_v1(void* lib) {
+  ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(lib, "ncclProfiler_v1");
+  if (ncclProfiler_v1) {
+    ncclProfiler.name = ncclProfiler_v1->name;
+    ncclProfiler.init = ncclProfiler_init;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name);
+    return &ncclProfiler;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1.");
+  return NULL;
+}
diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc
new file mode 100644
index 0000000..3d00008
--- /dev/null
+++ b/src/plugin/profiler/profiler_v2.cc
@@ -0,0 +1,45 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+#include "checks.h"
+
+static ncclProfiler_t ncclProfiler;
+static ncclProfiler_v2_t* ncclProfiler_v2;
+
+static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) {
+    *eHandle = NULL;
+    return ncclSuccess;
+  }
+  return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr);
+}
+
+static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
+  return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs);
+}
+
+static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
+  NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask));
+  ncclProfiler.startEvent = ncclProfiler_startEvent;
+  ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent;
+  ncclProfiler.recordEventState = ncclProfiler_recordEventState;
+  ncclProfiler.finalize = ncclProfiler_v2->finalize;
+  return ncclSuccess;
+}
+
+ncclProfiler_t* getNcclProfiler_v2(void* lib) {
+  ncclProfiler_v2 = (ncclProfiler_v2_t*)dlsym(lib, "ncclProfiler_v2");
+  if (ncclProfiler_v2) {
+    ncclProfiler.name = ncclProfiler_v2->name;
+    ncclProfiler.init = ncclProfiler_init;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name);
+    return &ncclProfiler;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2");
+  return NULL;
+}
diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc
new file mode 100644
index 0000000..322bea5
--- /dev/null
+++ b/src/plugin/profiler/profiler_v3.cc
@@ -0,0 +1,20 @@
+/*************************************************************************
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "nccl_profiler.h"
+
+static ncclProfiler_v3_t* ncclProfiler_v3;
+
+ncclProfiler_t* getNcclProfiler_v3(void* lib) {
+  ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3");
+  if (ncclProfiler_v3) {
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name);
+    return ncclProfiler_v3;
+  }
+  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3");
+  return NULL;
+}
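Because `ncclTunerPluginLoad` in the file below falls back to the handle returned by `ncclGetNetPluginLib()`, a single shared library can provide both the net and tuner plugins just by exporting both versioned symbols. A bare-bones linking sketch (zero-initialized structs only to keep it short; a real plugin must populate every field):

```
#include "nccl_net.h"
#include "nccl_tuner.h"
// Sketch of a combined net+tuner plugin: both loaders will find their
// symbol in the same .so. Not a working plugin as-is.
extern "C" {
  ncclNet_v9_t ncclNetPlugin_v9 = {};
  ncclTuner_v4_t ncclTunerPlugin_v4 = {};
}
```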
diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc
new file mode 100644
index 0000000..443bf78
--- /dev/null
+++ b/src/plugin/tuner.cc
@@ -0,0 +1,99 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <pthread.h>
+
+#include "checks.h"
+#include "debug.h"
+#include "tuner.h"
+#include "plugin.h"
+
+extern ncclTuner_t* getNcclTuner_v2(void* lib);
+extern ncclTuner_t* getNcclTuner_v3(void* lib);
+extern ncclTuner_t* getNcclTuner_v4(void* lib);
+
+pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
+static int tunerPluginRefCount;
+static void* tunerPluginLib = nullptr;
+static ncclTuner_t* tunerSymbol = nullptr;
+
+enum {
+  tunerPluginLoadFailed = -1,
+  tunerPluginLoadReady = 0,
+  tunerPluginLoadSuccess = 1,
+};
+
+#define MAX_PLUGIN_LOAD 4
+
+static int status = tunerPluginLoadReady;
+
+ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
+  // Initialize to nullptr by default if plugin tuner cannot be loaded.
+  comm->tuner = nullptr;
+  if (tunerPluginLoadFailed == status) {
+    return ncclSuccess;
+  }
+
+  pthread_mutex_lock(&tunerPluginLock);
+  if (tunerPluginLoadFailed == status) {
+    goto exit;
+  }
+
+  if (tunerPluginLoadSuccess == status) {
+    comm->tuner = tunerSymbol;
+    ++tunerPluginRefCount;
+    goto exit;
+  }
+
+  tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN"));
+  if (nullptr == tunerPluginLib) {
+    tunerPluginLib = ncclGetNetPluginLib();
+    if (nullptr == tunerPluginLib) {
+      goto fail;
+    }
+  }
+
+  tunerSymbol = getNcclTuner_v4(tunerPluginLib);
+  if (tunerSymbol == NULL) {
+    tunerSymbol = getNcclTuner_v3(tunerPluginLib);
+  }
+  if (tunerSymbol == NULL) {
+    tunerSymbol = getNcclTuner_v2(tunerPluginLib);
+  }
+  if (tunerSymbol == NULL) {
+    goto fail;
+  }
+
+  comm->tuner = tunerSymbol;
+  ++tunerPluginRefCount;
+  status = tunerPluginLoadSuccess;
+  comm->tunerPluginLoaded = 1;
+
+exit:
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+fail:
+  tunerPluginLib = nullptr;
+  status = tunerPluginLoadFailed;
+  goto exit;
+}
+
+ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) {
+  pthread_mutex_lock(&tunerPluginLock);
+  if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) {
+    INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
+    NCCLCHECK(ncclClosePluginLib(tunerPluginLib));
+    tunerPluginLib = nullptr;
+    tunerSymbol = nullptr;
+    comm->tuner = nullptr;
+    status = tunerPluginLoadReady;
+    comm->tunerPluginLoaded = 0;
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+}
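The v2/v3 tuner shims in the files below adapt older plugins to the cost-table interface: `collCostTable` is a `[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]` matrix of predicted times, with `NCCL_ALGO_PROTO_IGNORE` marking unsupported combinations, and a tuner marks its pick by zeroing an entry. A minimal sketch of that convention (hypothetical helper, same "set time to 0" trick the v2 shim uses):

```
// A zero predicted cost makes [algo][proto] win the later selection,
// unless the entry is marked unsupported with NCCL_ALGO_PROTO_IGNORE.
static void forceAlgoProto(float** collCostTable, int algo, int proto) {
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  if (table[algo][proto] != NCCL_ALGO_PROTO_IGNORE) table[algo][proto] = 0.0f;
}
```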
diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc
new file mode 100644
index 0000000..005638f
--- /dev/null
+++ b/src/plugin/tuner/tuner_v2.cc
@@ -0,0 +1,66 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+
+#include "debug.h"
+#include "checks.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v2_t* ncclTuner_v2;
+static ncclTuner_t ncclTuner;
+
+static int hasNvlsSupport(float** collCostTable) {
+  // Requirements for support of different algorithms:
+  //
+  // - NVLS intra-node: nvlsSupport
+  // - NVLS intra+inter-node: collNetSupport
+  // - NVLSTree intra-node: always disabled
+  // - NVLSTree inter-node: nvlsSupport
+  // - Collnet* inter-node: collNetSupport
+  //
+  // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0;
+}
+
+static int hasCollNetSupport(float** collCostTable) {
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
+}
+
+static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
+  int algorithm = NCCL_ALGO_UNDEF;
+  int protocol = NCCL_PROTO_UNDEF;
+  int nvlsSupport = hasNvlsSupport(collCostTable);
+  int collNetSupport = hasCollNetSupport(collCostTable);
+  NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels));
+  // set time to 0 below to make sure this algorithm/protocol is selected later on
+  if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) {
+    float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+    if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
+  NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context));
+  ncclTuner.getCollInfo = ncclTuner_getCollInfo;
+  ncclTuner.destroy = ncclTuner_v2->destroy;
+  return ncclSuccess;
+}
+
+ncclTuner_t* getNcclTuner_v2(void* lib) {
+  ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2");
+  if (ncclTuner_v2) {
+    ncclTuner.name = ncclTuner_v2->name;
+    ncclTuner.init = ncclTuner_init;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name);
+    return &ncclTuner;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
+  return NULL;
+}
diff --git a/src/plugin/tuner/tuner_v3.cc b/src/plugin/tuner/tuner_v3.cc
new file mode 100644
index 0000000..3898243
--- /dev/null
+++ b/src/plugin/tuner/tuner_v3.cc
@@ -0,0 +1,38 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+
+#include "debug.h"
+#include "checks.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v3_t* ncclTuner_v3;
+static ncclTuner_t ncclTuner;
+
+static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) {
+  NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels));
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) {
+  NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context));
+  ncclTuner.getCollInfo = ncclTuner_getCollInfo;
+  ncclTuner.destroy = ncclTuner_v3->destroy;
+  return ncclSuccess;
+}
+
+ncclTuner_t* getNcclTuner_v3(void* lib) {
+  ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3");
+  if (ncclTuner_v3) {
+    ncclTuner.name = ncclTuner_v3->name;
+    ncclTuner.init = ncclTuner_init;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name);
+    return &ncclTuner;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
+  return NULL;
+}
diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc
new file mode 100644
index 0000000..4bfd116
--- /dev/null
+++ b/src/plugin/tuner/tuner_v4.cc
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+
+#include "debug.h"
+#include "nccl_tuner.h"
+
+static ncclTuner_v4_t* ncclTuner_v4;
+
+ncclTuner_t* getNcclTuner_v4(void* lib) {
+  ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4");
+  if (ncclTuner_v4) {
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name);
+    return ncclTuner_v4;
+  }
+  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
+  return NULL;
+}
diff --git a/src/proxy.cc b/src/proxy.cc
index 5a83ef3..7e8021e 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -383,6 +383,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
   sub->pid = op->pid;
   sub->profilerContext = op->profilerContext;
   sub->ringAlgo = op->ringAlgo;
+  sub->workCounter = op->workCounter;
   args->nsubs = subIndex+1;
   if (subIndex) {
     if ((args->sliceSteps != op->sliceSteps) ||
@@ -532,6 +533,19 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
   return ncclSuccess;
 }
 
+static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) {
+  struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ? &comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId];
+  if (justInquire) *justInquire = true;
+  else {
+    op->sendbuff = (uint8_t *)comm->profiler.workStarted;
+    op->recvbuff = (uint8_t *)comm->profiler.workCompleted;
+    NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op));
+    // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter
+    op->workCounter += comm->profiler.workCounter[op->channelId];
+  }
+  return ncclSuccess;
+}
+
 static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
   if (peer < 0) return ncclSuccess;
 
@@ -612,20 +626,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
       // Run full algorithm to count the number of steps for each peer.
       ncclResult_t result = ncclSuccess;
      const ssize_t size = op->nbytes/comm->nRanks;
-      int last = 0;
-      int *nstepsSend = NULL, *nstepsRecv = NULL;
       const int rank = comm->rank, nranks = comm->nRanks;
-      PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks);
       NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up);
       NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up);
-      while (last == 0) {
-        int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
-        size_t inpIx, outIx;
-        algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
-        if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
-        if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
-      }
+      struct ncclPatStep ps;
+      do {
+        algo.getNextOp(&ps);
+        if (ps.flags & PatSkipped) continue;
+        if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++;
+        if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++;
+      } while (ps.last != 2);
       for (int i=0; i<log2Up(nranks); i++) {
@@ -652,20 +665,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
       // Run full algorithm to count the number of steps for each peer.
       ncclResult_t result = ncclSuccess;
       const ssize_t size = op->nbytes/comm->nRanks;
-      int last = 0;
-      int *nstepsSend = NULL, *nstepsRecv = NULL;
       const int rank = comm->rank, nranks = comm->nRanks;
-      PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks);
       NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down);
       NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down);
-      while (last == 0) {
-        int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
-        size_t inpIx, outIx;
-        algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
-        if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
-        if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
-      }
+      struct ncclPatStep ps;
+      do {
+        algo.getNextOp(&ps);
+        if (ps.flags & PatSkipped) continue;
+        if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++;
+        if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++;
+      } while (ps.last != 2);
       for (int i=0; i<log2Up(nranks); i++) {
@@ -684,6 +696,11 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
     case ncclPatternSend:
     case ncclPatternRecv: {
       if (op->root == comm->rank) return ncclSuccess;
       NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, 1, justInquire));
     } break;
+    case ncclPatternProfiler: {
+      if (ncclProfilerNeedsProxy(comm, op)) {
+        NCCLCHECK(SaveProxyProfiler(comm, op, justInquire));
+      }
+    } break;
   }
   return ncclSuccess;
 }
@@ -725,10 +742,10 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
   while (op) {
     if (op->state == ncclProxyOpNone) return ncclInternalError;
     TIME_START(0); TIME_START(1);
-    NCCLCHECK(op->progress(proxyState, op));
+    ncclResult_t ret = op->progress(proxyState, op);
     if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
     *idle &= op->idle;
-    if (op->state == ncclProxyOpNone) {
+    if (op->state == ncclProxyOpNone || ret != ncclSuccess) {
       TIME_START(2);
       NCCLCHECK(removeOp(state, &op, &prevOp));
       TIME_STOP(2);
@@ -910,7 +927,7 @@ void* ncclProxyProgress(void *proxyState_) {
       if (ret != ncclSuccess) {
         __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
         INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
-        continue;
+        break;
       }
       void* eHandle;
       ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
@@ -932,7 +949,7 @@ void* ncclProxyProgress(void *proxyState_) {
       }
     }
     lastIdle = idle;
-  } while (state->stop == 0 || (state->stop == 1 && state->active));
+  } while ((state->stop == 0 || (state->stop == 1 && state->active)) && __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0);
   return NULL;
 }
 
@@ -1140,6 +1157,7 @@ ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyCon
   }
 
   ncclIpcHdr hdr;
+  memset(&hdr, '\0', sizeof(hdr));
   hdr.type = type;
   hdr.rank = rank;
   hdr.reqSize = reqSize;
@@ -1323,9 +1341,12 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) {
     pthread_mutexattr_init(&mutexAttr);
     pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
     pthread_mutex_init(&pool->mutex, &mutexAttr);
+    pthread_mutexattr_destroy(&mutexAttr);
     pthread_condattr_t condAttr;
+    pthread_condattr_init(&condAttr);
     pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
     pthread_cond_init(&pool->cond, &condAttr);
+    pthread_condattr_destroy(&condAttr);
     state->opsPool = pool;
 
     memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1);
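The RAS changes below replace the periodically reallocated `rasClients` array with an intrusive doubly-linked list (`rasClientsHead`/`rasClientsTail`); the diff shows the insertion path in `getNewClientEntry`. The matching unlink that a terminate path would perform looks roughly like this (a sketch, assuming the same `next`/`prev` fields and head/tail globals; the actual terminate code is not shown in these hunks):

```
#include <stdlib.h>
// Unlink a client from the doubly-linked list and release it.
static void rasClientUnlink(struct rasClient* client) {
  if (client->prev) client->prev->next = client->next;
  else rasClientsHead = client->next;
  if (client->next) client->next->prev = client->prev;
  else rasClientsTail = client->prev;
  free(client);
}
```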
-// Used in rasAuxComm below. The values are bitmasks so that they can be combined.
+// Communicator status, used in rasAuxComm below. The values are bitmasks so that they can be combined.
 typedef enum {
-  RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator.
+  RAS_ACS_NOCOMM = 1, // Set if the peer claims not to be a member of a given communicator.
   RAS_ACS_INIT = 2,
   RAS_ACS_RUNNING = 4,
   RAS_ACS_FINALIZE = 8,
   RAS_ACS_ABORT = 16
 } rasACStatus;
 
-// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK).
+// Communicator errors, used in rasAuxComm below. The values are bitmasks so that they can be combined (with the
+// exception of RAS_ACE_OK).
 typedef enum {
   RAS_ACE_OK = 0,
   RAS_ACE_MISMATCH = 1,
@@ -53,22 +51,45 @@ typedef enum {
   RAS_ACE_INCOMPLETE = 4
 } rasACError;
 
-// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics
-// on the number of peers and nodes for a communicator.
+// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query. For each communicator, caches
+// statistics extracted from the results, such as the number of peers and nodes or the communicator status. Includes
+// a pointer to the communicator data in the results, making it easy to sort the communicators by a different key
+// without altering the results buffer, or just to iterate over the communicators, given that the communicator data
+// in the results is of variable length.
 struct rasAuxComm {
-  struct rasCollComms::comm* comm;
+  struct rasCollComms::comm* comm; // Points to the results buffer.
   int nPeers;
   int nNodes;
   int ranksPerNodeMin;
   int ranksPerNodeMax;
   unsigned int status; // Bitmask of rasACStatus values.
   unsigned int errors; // Bitmask of rasACError values.
-  uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against.
+  uint64_t firstCollOpCounts[NCCL_NUM_FUNCTIONS]; // collOpCounts of the first rank, to compare against.
+  int nIncompleteRanks; // Number of ranks that we didn't get any response from.
 };
 
+// Auxiliary structure used when processing the rasPeerInfo data stored in the global rasPeers array. Makes it possible
+// to extract a subset of peers (e.g., the dead ones), to sort by a different key without altering the original array,
+// and also has room for extracted temporary data such as the number of peers per node or the number of GPUs per peer.
+struct rasAuxPeerInfo {
+  struct rasPeerInfo* peer; // Points to an element in rasPeers.
+  int value;
+};
+
+// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query, specifically when iterating over
+// each communicator's ranks. Makes it possible to sort by a different key without altering the original array, and
+// also has room for extracted temporary data such as the rank's status or a count of collective operations.
+struct rasAuxCommRank {
+  struct rasCollComms::comm::rank* rank; // Points to the results buffer.
+  uint64_t value;
+};
+
+// The RAS client listening socket of this RAS thread (normally port 28028).
+int rasClientListeningSocket = -1;
+
 // Connected RAS clients.
-struct rasClient* rasClients;
-int nRasClients;
+struct rasClient* rasClientsHead;
+struct rasClient* rasClientsTail;
 
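
The `rasACStatus`/`rasACError` values above are bitmasks precisely so that the states reported by individual ranks can be OR-ed together and summarized afterwards. A self-contained sketch of that pattern, including the `__builtin_clz` highest-bit-to-index conversion used later in this file (the enum values mirror the diff; the driver is illustrative and relies on GCC/Clang builtins, as the file itself does):

```
#include <cstdio>

enum Status { NOCOMM = 1, INIT = 2, RUNNING = 4, FINALIZE = 8, ABORT = 16 };
static const char* const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" };

int main() {
  unsigned int status = 0;
  status |= RUNNING;    // One rank reports RUNNING...
  status |= FINALIZE;   // ...another reports FINALIZE.
  // More than one bit set means the ranks disagree -- a MISMATCH in RAS terms.
  printf("mismatch: %s\n", __builtin_popcount(status) > 1 ? "yes" : "no");
  // Highest set bit -> array index: FINALIZE is 8, i.e. bit 3, so statusStr[3].
  int idx = (int)(sizeof(unsigned int)*8 - 1) - __builtin_clz(status);
  printf("most advanced state: %s\n", statusStr[idx]);
  return 0;
}
```
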
 // Minimum byte count to increment the output buffer size by if it's too small.
 #define RAS_OUT_INCREMENT 4096
@@ -85,6 +106,7 @@ static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS
 // Still, 1024 should normally be plenty (verbose output may make things more difficult,
 // but we do check for overflows, so it will just be trimmed).
+
 static ncclResult_t getNewClientEntry(struct rasClient** pClient);
 static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen);
 static void rasClientTerminate(struct rasClient* client);
@@ -101,15 +123,13 @@ static void rasOutExtract(char* buffer);
 static int rasOutLength();
 static void rasOutReset();
-static int rasPeersNGpuCompare(const void* e1, const void* e2);
-static int rasPeersNProcsCompare(const void* e1, const void* e2);
-static int rasPeersHostPidCompare(const void* e1, const void* e2);
+static int rasAuxPeersValueCompare(const void* e1, const void* e2);
 static int ncclSocketsHostCompare(const void* p1, const void* p2);
 static int rasValCountsCompareRev(const void* p1, const void* p2);
 static int rasAuxCommsCompareRev(const void* p1, const void* p2);
-static int rasCommRanksPeerCompare(const void* p1, const void* p2);
-static int rasCommRanksCollOpCompare(const void* p1, const void* p2);
+static int rasAuxCommRanksValueCompare(const void* p1, const void* p2);
+
+static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size);
 static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size);
 static const char* ncclErrorToString(ncclResult_t err);
 static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size);
@@ -181,21 +201,20 @@ fail:
-// Returns the index of the first available entry in the rasClients array, enlarging the array if necessary.
+// Allocates a new rasClient structure, appends it to the list of connected clients, and returns it via pClient.
 static ncclResult_t getNewClientEntry(struct rasClient** pClient) {
   struct rasClient* client;
-  int i;
-  for (i = 0; i < nRasClients; i++)
-    if (rasClients[i].status == RAS_CLIENT_CLOSED)
-      break;
-  if (i == nRasClients) {
-    NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT));
-    nRasClients += RAS_INCREMENT;
-  }
-  client = rasClients+i;
-  memset(client, '\0', sizeof(*client));
+  NCCLCHECK(ncclCalloc(&client, 1));
+  client->sock = client->pfd = -1;
   ncclIntruQueueConstruct(&client->sendQ);
   client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT;
-  client->collIdx = -1;
+
+  if (rasClientsHead) {
+    rasClientsTail->next = client;
+    client->prev = rasClientsTail;
+    rasClientsTail = client;
+  } else {
+    rasClientsHead = rasClientsTail = client;
+  }
   *pClient = client;
   return ncclSuccess;
@@ -219,22 +238,32 @@ static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgL
   struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg));
   meta->offset = 0;
   meta->length = msgLen;
-  ncclIntruQueueEnqueue(&client->sendQ, meta);
-  assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED);
-  rasPfds[client->pfd].events |= POLLOUT;
+  if (client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED) {
+    ncclIntruQueueEnqueue(&client->sendQ, meta);
+    rasPfds[client->pfd].events |= POLLOUT;
+  } else {
+    INFO(NCCL_RAS, "RAS invalid client status %d -- internal error?", client->status);
+  }
 }
 
 // Terminates a connection with a RAS client.
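
The hunks above and below replace the index-based `rasClients` array with a doubly-linked list anchored at `rasClientsHead`/`rasClientsTail`: `getNewClientEntry` appends at the tail, and `rasClientTerminate` unlinks and frees. A stripped-down sketch of the same append/unlink discipline (the `Client` type and helper names are hypothetical):

```
#include <cstdio>

struct Client {
  Client* prev = nullptr;
  Client* next = nullptr;
  int id = 0;
};

static Client* head = nullptr;
static Client* tail = nullptr;

// Append at the tail, as getNewClientEntry does.
static Client* appendClient(int id) {
  Client* c = new Client;
  c->id = id;
  if (head) {
    tail->next = c;
    c->prev = tail;
    tail = c;
  } else {
    head = tail = c;
  }
  return c;
}

// Unlink and free, as rasClientTerminate does: fix up head/tail first,
// then the neighbors' pointers; the order of the four checks is safe for
// head, tail, middle, and sole-element cases alike.
static void removeClient(Client* c) {
  if (c == head) head = head->next;
  if (c == tail) tail = tail->prev;
  if (c->prev) c->prev->next = c->next;
  if (c->next) c->next->prev = c->prev;
  delete c;
}

int main() {
  Client* a = appendClient(1);
  Client* b = appendClient(2);
  appendClient(3);
  removeClient(b);  // remove from the middle
  removeClient(a);  // remove the head
  for (Client* c = head; c; c = c->next) printf("client %d\n", c->id);  // prints: client 3
  while (head) removeClient(head);
  return 0;
}
```
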
static void rasClientTerminate(struct rasClient* client) { (void)close(client->sock); - client->sock = -1; - client->status = RAS_CLIENT_CLOSED; rasPfds[client->pfd].fd = -1; rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; - client->pfd = -1; while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { free(meta); } + + if (client == rasClientsHead) + rasClientsHead = rasClientsHead->next; + if (client == rasClientsTail) + rasClientsTail = rasClientsTail->prev; + if (client->prev) + client->prev->next = client->next; + if (client->next) + client->next->prev = client->prev; + free(client); } @@ -245,16 +274,12 @@ static void rasClientTerminate(struct rasClient* client) { // Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and // reinvokes rasClientRun. ncclResult_t rasClientResume(struct rasCollective* coll) { - int collIdx = coll-rasCollectives; - int i; - struct rasClient* client = nullptr; - for (i = 0; i < nRasClients; i++) { - client = rasClients+i; - if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + struct rasClient* client; + + for (client = rasClientsHead; client; client = client->next) + if (client->coll == coll) break; - } - } - if (i == nRasClients) { + if (client == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching client!"); rasCollFree(coll); goto exit; @@ -266,8 +291,7 @@ exit: } // Handles a ready client FD from the main event loop. -void rasClientEventLoop(int clientIdx, int pollIdx) { - struct rasClient* client = rasClients+clientIdx; +void rasClientEventLoop(struct rasClient* client, int pollIdx) { bool closed = false; if (client->status == RAS_CLIENT_CONNECTED) { @@ -431,7 +455,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_CONNS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); #endif client->status = RAS_CLIENT_COMMS; @@ -440,7 +463,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_COMMS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); client->status = RAS_CLIENT_FINISHED; break; @@ -459,7 +481,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasPeerInfo* peersReSorted = nullptr; + struct rasAuxPeerInfo* auxRasPeers = nullptr; int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; int firstIdx, nPeers; @@ -467,6 +489,8 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { int nValCounts; static int cudaDriver = -1, cudaRuntime = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting"); + rasOutReset(); rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); @@ -481,7 +505,6 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; - rasOutReset(); totalGpus = totalNodes = 0; firstNGpusNode = 0; // #GPUs on the first peer of a node. firstNGpusGlobal = 0; // #GPUs on peerIdx 0. @@ -489,7 +512,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. 
consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. nPeers = 0; // #peers on a node. - firstNPeersGlobal = 0; + firstNPeersGlobal = 0; // #peers on the first node. for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); totalGpus += nGpus; @@ -522,6 +545,11 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } } // for (peerIdx) + TRACE(NCCL_RAS, "RAS: totalNodes %d, nRasPeers %d, totalGpus %d", totalNodes, nRasPeers, totalGpus); + TRACE(NCCL_RAS, "RAS: consistentNPeersGlobal %d, consistentNGpusGlobal %d, consistentNGpusNode %d", + consistentNPeersGlobal, consistentNGpusGlobal, consistentNGpusNode); + TRACE(NCCL_RAS, "RAS: firstNPeersGlobal %d, firstNGpusGlobal %d", firstNPeersGlobal, firstNGpusGlobal); + rasOutAppend("Job summary\n" "===========\n\n"); @@ -532,22 +560,24 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); } else { // Gather the stats on the number of processes per node. However, that number is not a property of a peer, - // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively - // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // but of a group of peers, so calculating it is more involved. We store the value in a temporary auxRasPeers + // array. + NCCLCHECKGOTO(ncclCalloc(&auxRasPeers, nRasPeers), ret, fail); firstIdx = 0; nPeers = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].peer = rasPeers+peerIdx; if (peerIdx == 0) { nPeers = 1; firstIdx = 0; } else { // peerIdx > 0 - if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + if (!ncclSocketsSameNode(&auxRasPeers[peerIdx].peer->addr, &auxRasPeers[peerIdx-1].peer->addr)) { + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < peerIdx; i++) { // Go back and update the number of processes of all the elements of that node. - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } nPeers = 1; firstIdx = peerIdx; @@ -557,21 +587,23 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } // peerIdx > 0 if (peerIdx == nRasPeers-1) { // Last iteration of the loop. + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < nRasPeers; i++) { - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } } } // for (peerIdx) - // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // Re-sort it now using the number of processes on the node (value) as the primary key, host IP as the // secondary, and process id as the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of peers per node. 
nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers;) { - if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { - valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; @@ -579,14 +611,15 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { valCounts[nValCounts-1].count++; } // Advance peerIdx to the next node. - peerIdx += peersReSorted[peerIdx].cudaDevs; - } + peerIdx += auxRasPeers[peerIdx].value; + } // for (peerIdx) // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent // number of peers first). qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); // Print it out, the most frequent peer counts first. if (consistentNGpusNode && consistentNGpusGlobal) { + // consistentNPeersGlobal must be false rasOutAppend(" Nodes Processes GPUs\n" " per node per process\n"); for (int i = 0; i < nValCounts; i++) { @@ -594,7 +627,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend("%7d %9ld %11d\n", vc->count, vc->value, firstNGpusGlobal); } - } else { + } else { // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend(" Nodes Processes\n" " per node\n"); for (int i = 0; i < nValCounts; i++) { @@ -606,24 +639,29 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). - // Sort peers by the GPU count, to simplify data extraction. - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // Sort peers by the GPU count, to simplify data extraction. Not sure how fast __builtin_popcountll is so we + // may just as well cache it... + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].value = __builtin_popcountll(auxRasPeers[peerIdx].peer->cudaDevs); + TRACE(NCCL_RAS, "RAS: node %s pid %d: nGpus %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), + auxRasPeers[peerIdx].peer->pid, auxRasPeers[peerIdx].value); + } // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of GPUs per peer. nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { - if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != - __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { - valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; } else { valCounts[nValCounts-1].count++; } - } + } // for (peerIdx) // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent // GPU counts first). 
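
Both distributions above follow the same recipe: sort the observations, run-length count adjacent equal values into `rasValCount`-style records, then re-sort those records by frequency, most frequent first. A sketch of the recipe on plain integers (all names and data are illustrative):

```
#include <algorithm>
#include <cstdio>
#include <vector>

struct ValCount { long value; int count; int firstIdx; };  // mirrors rasValCount

int main() {
  std::vector<long> vals = {8, 8, 4, 8, 4, 8};  // e.g., GPUs per process
  std::sort(vals.begin(), vals.end());
  // Run-length count adjacent equal values.
  std::vector<ValCount> vc;
  for (int i = 0; i < (int)vals.size(); i++) {
    if (i == 0 || vals[i] != vals[i-1]) vc.push_back({vals[i], 1, i});
    else vc.back().count++;
  }
  // Most frequent value first, as rasValCountsCompareRev arranges it.
  std::sort(vc.begin(), vc.end(),
            [](const ValCount& a, const ValCount& b) { return a.count > b.count; });
  for (const ValCount& v : vc)
    printf("%d processes have %ld GPUs\n", v.count, v.value);
  return 0;
}
```
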
qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); @@ -637,7 +675,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend(" %9d %11ld\n", vc->count, vc->value); } - } + } // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend("\n" " Nodes Processes GPUs\n" "(total) (total) (total)\n" @@ -652,16 +690,16 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // provided that they meet our definition of an outlier. if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); - // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // auxRasPeers is sorted by the node IP address (not port!) as the secondary key and the pid as // the tertiary, which comes in handy when printing... for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { lineBuf[0] = '\0'; for (int j = 0; j < vc->value; j++) { snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (j > 0 ? "," : ""), peersReSorted[j].pid); + (j > 0 ? "," : ""), auxRasPeers[j].peer->pid); } rasOutAppend(" Node %s running process%s %s\n", - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), (vc->value > 1 ? "es" : ""), lineBuf); } // for (peerIdx) } // if (rasCountIsOutlier(vc->count)) @@ -678,13 +716,12 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_CONNS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; // We need to wait for async. responses. 
   }
@@ -696,18 +733,18 @@
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
   {
-    struct rasCollRequest collReq;
+    struct rasCollRequest collReq = {};
     bool allDone = false;
     rasCollReqInit(&collReq);
     collReq.timeout = client->timeout;
     collReq.type = RAS_COLL_COMMS;
-    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx),
-                  ret, fail);
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail);
     if (!allDone) ret = ncclInProgress;
   }
+  TRACE(NCCL_RAS, "RAS: rasClientRunInit: scheduling RAS_COLL_COMMS and finishing");
 exit:
-  free(peersReSorted);
+  free(auxRasPeers);
   return ret;
 fail:
   goto exit;
 }
@@ -721,13 +758,16 @@
   ncclResult_t ret = ncclSuccess;
   char* msg = nullptr;
   int msgLen;
-  struct rasCollective* coll = rasCollectives+client->collIdx;
-  struct rasCollConns* connsData = (struct rasCollConns*)coll->data;
+  struct rasCollective* coll = client->coll;
   int expected;
   struct rasPeerInfo* peersBuf = nullptr;
 
-  assert(coll->nFwdSent == coll->nFwdRecv);
-  client->collIdx = -1;
+  if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) {
+    INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status);
+    return ncclInternalError;
+  }
+  // Dereference coll->data only after the validity check above.
+  struct rasCollConns* connsData = (struct rasCollConns*)coll->data;
+  client->coll = nullptr;
 
   rasOutReset();
   rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9);
@@ -822,13 +862,12 @@
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
   {
-    struct rasCollRequest collReq;
+    struct rasCollRequest collReq = {};
     bool allDone = false;
     rasCollReqInit(&collReq);
     collReq.timeout = client->timeout;
     collReq.type = RAS_COLL_COMMS;
-    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx),
-                  ret, fail);
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail);
     if (!allDone) ret = ncclInProgress;
   }
@@ -847,10 +886,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) {
   ncclResult_t ret = ncclSuccess;
   char* msg = nullptr;
   int msgLen;
-  struct rasCollective* coll = rasCollectives+client->collIdx;
-  struct rasCollComms* commsData = (struct rasCollComms*)coll->data;
+  struct rasCollective* coll = client->coll;
   struct rasCollComms::comm* comm;
-  struct rasCollComms::comm::rank* ranksReSorted = nullptr;
+  struct rasAuxCommRank* auxCommRanks = nullptr;
   struct rasValCount* valCounts = nullptr;
   int nValCounts;
   struct rasValCount* collOpCounts = nullptr;
@@ -860,7 +899,7 @@
   int vcIdx;
   int nPeersMissing;
   uint64_t* peerNvmlDevs = nullptr;
-  const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" };
+  const char*const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" };
   const char*const errorStr[] = {
     // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer.
"OK", @@ -873,14 +912,22 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { "INCOMPLETE,ERROR,MISMATCH" }; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunComms: starting"); + TRACE(NCCL_RAS, "RAS: coll nLegTimeouts %d, nPeers %d, nData %d; commsData nComms %d", + coll->nLegTimeouts, coll->nPeers, coll->nData, commsData->nComms); + + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); // Calculate the number of missing peers early as we rely on it for other things. nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + TRACE(NCCL_RAS, "RAS: nRasPeers %d, nRasDeadPeers %d, nPeersMissing %d", nRasPeers, nRasDeadPeers, nPeersMissing); // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort @@ -896,12 +943,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComms[commIdx].comm = comm; comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); } - NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxCommRanks, maxCommSize), ret, fail); + TRACE(NCCL_RAS, "RAS: maxCommSize %d", maxCommSize); // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); - for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) { peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + TRACE(NCCL_RAS, "RAS: coll peers[%d] -> rasPeers[%d]", peerIdx, peerIdxConv[peerIdx]); + } // Sort coll->peers to match the ordering of rasPeers -- we may need it later... qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); @@ -910,42 +960,75 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { struct rasAuxComm* auxComm = auxComms+commIdx; int nRanks = 0; comm = auxComm->comm; + TRACE(NCCL_RAS, "RAS: coll comms[%d]: commId (0x%lx, 0x%lx, 0x%lx), commNRanks %d, nRanks %d, nMissingRanks %d", + commIdx, comm->commId.commHash, comm->commId.hostHash, comm->commId.pidHash, + comm->commNRanks, comm->nRanks, comm->nMissingRanks); - if (comm->commNRanks > comm->nRanks) { + if (comm->nMissingRanks > 0) { // There are two possibilities here. Either we are missing the data on some ranks because the processes are // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which - // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we - // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. - // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this - // as an INCOMPLETE error; otherwise as a MISMATCH warning. - if (nPeersMissing > 0 || nRasDeadPeers > 0) - auxComm->errors |= RAS_ACE_INCOMPLETE; - else { + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). 
+      if (nPeersMissing == 0 && nRasDeadPeers == 0) {
+        // We received data from _all_ processes. That's an easy case.
         auxComm->errors |= RAS_ACE_MISMATCH;
-        auxComm->status |= RAS_ACS_UNKNOWN;
-      }
-    }
+        auxComm->status |= RAS_ACS_NOCOMM;
+      } else {
+        // We failed to receive data from some processes but we don't know if that's why we don't have the info about
+        // some ranks of this communicator. We need to check all the missing ranks one-by-one as different ranks may
+        // have different reasons.
+        struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks);
 
-    memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted));
-    // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted
-    // by process _and_ node, which makes counting easy.
-    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++)
-      ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx];
-    qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare);
+        for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) {
+          struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx;
+          void* found;
+          if ((found = bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers),
+                               ncclSocketsCompare)) != nullptr) {
+            // We did receive the data from that process, but not about this communicator.
+            auxComm->errors |= RAS_ACE_MISMATCH;
+            auxComm->status |= RAS_ACS_NOCOMM;
+          } else {
+            // We failed to receive data from that process.
+            auxComm->errors |= RAS_ACE_INCOMPLETE;
+            auxComm->nIncompleteRanks++;
+          }
+          TRACE(NCCL_RAS, "RAS: comm missingRank[%d] commRank %d, addr %td (-> %d), cudaDev %d, nvmlDev %d",
+                rankIdx, missingRank->commRank, (found ? ((union ncclSocketAddress*)found) - coll->peers : -1),
+                rasPeerFind(&missingRank->addr), missingRank->cudaDev, missingRank->nvmlDev);
+        } // for (rankIdx)
+      } // nPeersMissing > 0 || nRasDeadPeers > 0
+    } // if (comm->nMissingRanks > 0)
+
+    // Initialize auxCommRanks from comm->ranks, converting peerIdx to rasPeers, then sort by it -- that way we will
+    // have the ranks sorted by node and process, which makes counting easy.
+    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+      struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx;
+      auxCommRanks[rankIdx].rank = rank;
+      auxCommRanks[rankIdx].value = peerIdxConv[rank->peerIdx];
+      TRACE(NCCL_RAS, "RAS: comm rank[%d] commRank %d, peerIdx %d (-> %d), cudaDev %d, nvmlDev %d",
+            rankIdx, rank->commRank, rank->peerIdx, peerIdxConv[rank->peerIdx], rank->cudaDev, rank->nvmlDev);
+      TRACE(NCCL_RAS, "RAS: comm rank[%d] collOpCounts (%ld, %ld, %ld, %ld, %ld)",
+            rankIdx, rank->collOpCounts[0], rank->collOpCounts[1], rank->collOpCounts[2], rank->collOpCounts[3],
+            rank->collOpCounts[4]);
+      TRACE(NCCL_RAS, "RAS: comm rank[%d] status initState %d, asyncError %d, finalizeCalled %d, destroyFlag %d, "
+            "abortFlag %d", rankIdx, rank->status.initState, rank->status.asyncError, rank->status.finalizeCalled,
+            rank->status.destroyFlag, rank->status.abortFlag);
+    }
+    // This also sorts by the commRank, which we don't care about here, but it won't hurt.
+    qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare);
 
     // Count the peers and nodes, get the status/error indicators.
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; if (rankIdx == 0) { auxComm->nPeers = auxComm->nNodes = 1; auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; auxComm->ranksPerNodeMax = 0; - auxComm->firstCollOpCount = rank->collOpCount; + memcpy(auxComm->firstCollOpCounts, auxRank->rank->collOpCounts, sizeof(auxComm->firstCollOpCounts)); nRanks = 1; } else { // rankIdx > 0 - if (rank->peerIdx != rank[-1].peerIdx) { + if (auxRank->value != auxRank[-1].value) { auxComm->nPeers++; - if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + if (!ncclSocketsSameNode(&rasPeers[auxRank->value].addr, &rasPeers[auxRank[-1].value].addr)) { auxComm->nNodes++; if (auxComm->ranksPerNodeMin > nRanks) auxComm->ranksPerNodeMin = nRanks; @@ -953,7 +1036,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; nRanks = 0; } - } // if (rank->peerIdx != rank[-1].peerIdx) + } // if (auxRank->value != auxRank[-1].value) nRanks++; } // rankIdx > 0 if (rankIdx == comm->nRanks-1) { @@ -964,25 +1047,27 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; } - if (rank->status.abortFlag) + if (auxRank->rank->status.abortFlag) auxComm->status |= RAS_ACS_ABORT; - else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + else if (auxRank->rank->status.finalizeCalled || auxRank->rank->status.destroyFlag) { // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly // as a finalize state indicator (and ignore it in case of ncclCommAbort). auxComm->status |= RAS_ACS_FINALIZE; } - else if (rank->status.initState == ncclSuccess) + else if (auxRank->rank->status.initState == ncclSuccess) auxComm->status |= RAS_ACS_RUNNING; - else // rank->initState != ncclSuccess + else // auxRank->rank->initState != ncclSuccess auxComm->status |= RAS_ACS_INIT; - if (rank->collOpCount != auxComm->firstCollOpCount) - auxComm->errors |= RAS_ACE_MISMATCH; - if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS && !(auxComm->errors & RAS_ACE_MISMATCH); collIdx++) { + if (auxRank->rank->collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) + auxComm->errors |= RAS_ACE_MISMATCH; + } + if (auxRank->rank->status.initState != ncclSuccess && auxRank->rank->status.initState != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; - if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + if (auxRank->rank->status.asyncError != ncclSuccess && auxRank->rank->status.asyncError != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; } // for (rankIdx) @@ -990,9 +1075,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // We've got a status mismatch between ranks. 
auxComm->errors |= RAS_ACE_MISMATCH; } + TRACE(NCCL_RAS, "RAS: auxComm nPeers %d, nNodes %d, nIncompleteRanks %d", + auxComm->nPeers, auxComm->nNodes, auxComm->nIncompleteRanks); + TRACE(NCCL_RAS, "RAS: auxComm ranksPerNodeMin %d, ranksPerNodeMax %d, status 0x%x, errors 0x%x", + auxComm->ranksPerNodeMin, auxComm->ranksPerNodeMax, auxComm->status, auxComm->errors); } // for (commIdx) // Sort it by size/nNodes/status/errors/missing ranks. - qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + if (auxComms) + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); // Calculate the distribution of different communicator sizes. NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); @@ -1014,10 +1104,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } } - rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" - " # in group per comm per node per comm in group\n"); - if (commsData->nComms == 0) + TRACE(NCCL_RAS, "RAS: rasClientRunComms: done with initial data processing"); + + if (commsData->nComms > 0) { + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + } else { rasOutAppend("No communicator data collected!\n"); + } // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); @@ -1058,6 +1152,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // status (which is a bitmask) into an array index. statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); } + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("\nErrors\n" "======\n\n"); @@ -1068,12 +1167,12 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (rasCountIsOutlier(nPeersMissing, client->verbose)) { // Extract a list of missing peers. We don't want to print it right away because it would be sorted // by address (including port, which isn't meaningful to end users). - struct rasPeerInfo* peersBuf = nullptr; + struct rasAuxPeerInfo* auxPeersBuf = nullptr; int nPeersBuf; // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing // them much easier. - NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nPeersMissing), ret, fail); nPeersBuf = 0; for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { int cmp; @@ -1088,30 +1187,42 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } else if (cmp < 0) { // Process missing from coll->peers. Don't report dead ones though, as they are not included // in nPeersMissing and are reported separately below. 
- if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { - assert(nPeersBuf < nPeersMissing); - memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + bool dead; + if (!(dead = rasPeerIsDead(&rasPeers[rasPeerIdx].addr))) { + if (nPeersBuf < nPeersMissing) { + auxPeersBuf[nPeersBuf++].peer = rasPeers+rasPeerIdx; + } else { + INFO(NCCL_RAS, "RAS overflow of auxPeersBuf: nPeersBuf %d, rasPeerIdx %d (%s), collPeerIdx %d -- " + "internal error?", + nPeersBuf, rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), collPeerIdx); + } } + TRACE(NCCL_RAS, "RAS rasPeerIdx %d (%s) is missing from coll->peers; dead %d", + rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), dead); rasPeerIdx++; } else { // cmp > 0 // Process not found in rasPeers -- shouldn't happen, unless during a race? + INFO(NCCL_RAS, "RAS failed to find coll->peer[%d] (%s) in rasPeers -- internal error?", + collPeerIdx, ncclSocketToString(coll->peers+collPeerIdx, rasLine)); collPeerIdx++; } // cmp > 0 } // for (rasPeerIdx, collPeerIdx) - // Sort the output by host and pid. - qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + // Sort the output by host and pid. rasAuxPeersValueCompare uses value as the primary key, which is 0 for + // all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, - ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } if (nPeersBuf != nPeersMissing) rasOutAppend(" [could not find information on %d process%s]\n", nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? "es" : "")); - free(peersBuf); + free(auxPeersBuf); } // if (rasCountIsOutlier(nPeersMissing)) rasOutAppend("\n"); } @@ -1121,31 +1232,35 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, (nRasDeadPeers > 1 ? "es are" : " is")); if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { - struct rasPeerInfo* peersReSorted = nullptr; - int nPeersReSorted = 0; - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + // rasDeadPeers contains only addresses, whereas we want a complete rasPeerInfo, and sorted differently. + struct rasAuxPeerInfo* auxPeersBuf = nullptr; + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nRasDeadPeers), ret, fail); for (int i = 0; i < nRasDeadPeers; i++) { int peerIdx = rasPeerFind(rasDeadPeers+i); if (peerIdx != -1) - memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + auxPeersBuf[nPeersBuf++].peer = rasPeers+peerIdx; } - // Sort the output by host and pid, not host and port. 
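
The missing-peer detection above is a linear merge over two arrays sorted by the same key, `rasPeers` and `coll->peers`. Reduced to two sorted integer vectors, the control flow looks roughly like this (a sketch, not NCCL code):

```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> all = {1, 2, 3, 5, 8};  // stands in for rasPeers (sorted)
  std::vector<int> got = {1, 3, 8};        // stands in for coll->peers (sorted)
  size_t i = 0, j = 0;
  while (i < all.size() || j < got.size()) {
    int cmp;
    if (i < all.size() && j < got.size())
      cmp = all[i] < got[j] ? -1 : (all[i] > got[j] ? 1 : 0);
    else
      cmp = i < all.size() ? -1 : 1;       // one side is exhausted
    if (cmp == 0) { i++; j++; }            // peer responded
    else if (cmp < 0) { printf("missing: %d\n", all[i]); i++; }
    else { j++; }                          // response from an unknown peer; shouldn't happen
  }
  return 0;
}
```
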
- qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); - for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + // Sort the output by host and pid, not host and port. rasAuxPeersValueCompare uses value as the primary key, + // which is 0 for all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } - if (nPeersReSorted != nRasDeadPeers) + if (nPeersBuf != nRasDeadPeers) rasOutAppend(" [could not find information on %d process%s]\n", - nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); - free(peersReSorted); + nRasDeadPeers-nPeersBuf, (nRasDeadPeers-nPeersBuf > 1 ? "es" : "")); + free(auxPeersBuf); } // if (rasCountIsOutlier(nRasDeadPeers) rasOutAppend("\n"); } + // Continue printing the largest communicators first, as in the summary table. for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc; vc = valCounts+vcIdx; @@ -1154,23 +1269,28 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { comm = auxComm->comm; if (auxComm->errors & RAS_ACE_INCOMPLETE) { - int nRanksMissing = comm->commNRanks - comm->nRanks; rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, - comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); - if (rasCountIsOutlier(nRanksMissing, client->verbose)) { - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + comm->commId.commHash, auxComm->nIncompleteRanks, (auxComm->nIncompleteRanks > 1 ? "s" : "")); + if (rasCountIsOutlier(auxComm->nIncompleteRanks, client->verbose)) { + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that provided a response but not for this communicator. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), ncclSocketsCompare) == + nullptr) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } + } // if rank did not respond + } // for (rankIdx) } // if (rasCountIsOutlier(nRanksMissing)) rasOutAppend("\n"); } // if (auxComm->errors & RAS_ACE_INCOMPLETE) @@ -1178,7 +1298,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (auxComm->errors & RAS_ACE_ERROR) { int ncclErrors[ncclNumResults]; int nErrors; - rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); memset(ncclErrors, '\0', sizeof(ncclErrors)); for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) @@ -1203,6 +1323,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } // if (auxComm->errors & RAS_ACE_ERROR) } // for (commIdx) } // for (vcIdx) + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("Warnings\n" "========\n\n"); @@ -1213,15 +1338,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); } + // Continue printing the largest communicators first, as in the summary table. for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc = valCounts+vcIdx; for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { - bool inconsistent; struct rasAuxComm* auxComm = auxComms+commIdx; comm = auxComm->comm; if (auxComm->errors & RAS_ACE_MISMATCH) { - rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); if (collOpCounts == nullptr) { // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts @@ -1234,28 +1359,31 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" Communicator ranks have different status\n"); // We need to sort the ranks by status. However, status is normally calculated from other fields. - // We will copy the ranks and reuse collOpCount to store it. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // We will store it in the auxCommRanks' value. 
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; if (rank->status.abortFlag) - rank->collOpCount = RAS_ACS_ABORT; + auxRank->value = RAS_ACS_ABORT; else if (rank->status.finalizeCalled || rank->status.destroyFlag) - rank->collOpCount = RAS_ACS_FINALIZE; + auxRank->value = RAS_ACS_FINALIZE; else if (rank->status.initState == ncclSuccess) - rank->collOpCount = RAS_ACS_RUNNING; + auxRank->value = RAS_ACS_RUNNING; else - rank->collOpCount = RAS_ACS_INIT; + auxRank->value = RAS_ACS_INIT; } - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Calculate the frequency of different status values. int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the - // status (which is a bitmask) into an array index. - collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + // status (which is a bitmask) into an array index. The argument is an unsigned int (there is no + // 64-bit version seemingly, but we don't actually need one here). + collOpCounts[nCollOpCounts].value = + (sizeof(unsigned int)*8-1) - __builtin_clz((unsigned int)auxCommRanks[rankIdx].value); collOpCounts[nCollOpCounts].count = 1; collOpCounts[nCollOpCounts].firstIdx = rankIdx; nCollOpCounts++; @@ -1263,11 +1391,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { collOpCounts[nCollOpCounts-1].count++; } } - if (comm->nRanks < comm->commNRanks) { - // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" - // string at index 0. - collOpCounts[nCollOpCounts].value = 0; - collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + if (comm->nMissingRanks - auxComm->nIncompleteRanks > 0) { + // Add a "fake" element corresponding to the NOCOMM entries, since they are not in the ranks array. + collOpCounts[nCollOpCounts].value = 0; // The index of "NOCOMM" in statusStr. + collOpCounts[nCollOpCounts].count = comm->nMissingRanks - auxComm->nIncompleteRanks; collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. nCollOpCounts++; } @@ -1280,114 +1407,159 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { if (vcc->firstIdx != -1) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
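
The `rasAuxCommRank` records used above let the code sort cheap `{pointer, key}` pairs instead of shuffling the variable-length results buffer itself. A compact sketch of the idiom with a hypothetical `Rank` type, using `commRank` as the tie-breaking secondary key just as `rasAuxCommRanksValueCompare` does:

```
#include <cstdio>
#include <cstdlib>

struct Rank { int commRank; unsigned long collOpCount; };
struct AuxRank { const Rank* rank; unsigned long value; };

static int auxCompare(const void* p1, const void* p2) {
  const AuxRank* a = (const AuxRank*)p1;
  const AuxRank* b = (const AuxRank*)p2;
  if (a->value != b->value) return a->value < b->value ? -1 : 1;
  // commRank as the secondary key keeps the output ordered for printing.
  return a->rank->commRank < b->rank->commRank ? -1 :
         (a->rank->commRank > b->rank->commRank ? 1 : 0);
}

int main() {
  Rank ranks[] = {{0, 42}, {1, 41}, {2, 42}, {3, 40}};
  AuxRank aux[4];
  // The underlying ranks array is never moved; only the aux records are sorted.
  for (int i = 0; i < 4; i++) aux[i] = {&ranks[i], ranks[i].collOpCount};
  qsort(aux, 4, sizeof(aux[0]), auxCompare);
  for (int i = 0; i < 4; i++)
    printf("rank %d: %lu ops\n", aux[i].rank->commRank, aux[i].value);
  return 0;
}
```
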
for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; if (peerIdx != -1) { if (vcc->count > 1) rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); else rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value], - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value], + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); } else { // peerIdx == -1 if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); else rasOutAppend(" Rank %d has status %s -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value]); } // peerIdx == -1 } // for (rankIdx) } else { - // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - if (vcc->count > 1) { - rasOutAppend(" The unknown ranks: %s\n", lineBuf); - } else { - rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); - } - } + // NOCOMM ranks are in a different array. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks + + comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that did not respond at all. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, statusStr[vcc->value], + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } else { + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + missingRank->commRank, statusStr[vcc->value]); + } + } // peerIdx == -1 + } // if rank responded + } // for (rankIdx) + } // vcc->firstIdx == -1 } // if (rasCountIsOutlier(vcc->count)) } // for (coc) } // if (__builtin_popcount(auxComm->status) > 1) - inconsistent = false; - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { - inconsistent = true; - break; - } - } - if (inconsistent) { - rasOutAppend(" Communicator ranks have different collective operation counts\n"); + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS; collIdx++) { + bool inconsistent = false; - // Sort the ranks by collOpCount and rank for easy counting. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); - // Calculate the frequency of different collOpCount values. - int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { - collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; - collOpCounts[nCollOpCounts].count = 1; - collOpCounts[nCollOpCounts].firstIdx = rankIdx; - nCollOpCounts++; - } else { - collOpCounts[nCollOpCounts-1].count++; + if (comm->ranks[rankIdx].collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) { + inconsistent = true; + break; } } - // Sort by that frequency (most frequent first). - qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); - for (int coc = 0; coc < nCollOpCounts; coc++) { - struct rasValCount* vcc = collOpCounts+coc; - if (vcc->count > 1) - rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); - if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... 
- for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - if (peerIdx != -1) { - if (vcc->count > 1) - rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, vcc->value, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - } else { // peerIdx == -1 - if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, vcc->value); - } // peerIdx == -1 - } // for (rankIdx) - } // if (rasCountIsOutlier(vcc->count)) - } // for (coc) - } // if (inconsistent) - rasOutAppend("\n"); + if (inconsistent) { + rasOutAppend(" Communicator ranks have different %s operation counts\n", ncclFuncStr[collIdx]); + + // Sort the ranks by collOpCounts[collIdx] and commRank for easy counting. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; + auxRank->value = rank->collOpCounts[collIdx]; + } + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); + // Calculate the frequency of different collOpCounts[collIdx] values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { + collOpCounts[nCollOpCounts].value = auxCommRanks[rankIdx].value; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) { + if (vcc->value > 0) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + else + rasOutAppend(" %d ranks have not launched any operations\n", vcc->count); + } + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
+          for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) {
+            int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx];
+            if (peerIdx != -1) {
+              if (vcc->count > 1) {
+                rasOutAppend("  Rank %d -- GPU %s managed by process %d on node %s\n",
+                             auxCommRanks[rankIdx].rank->commRank,
+                             rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                             rasPeers[peerIdx].pid,
+                             ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+              } else {
+                if (vcc->value > 0) {
+                  rasOutAppend("  Rank %d has launched up to operation %ld -- GPU %s managed by process %d "
+                               "on node %s\n", auxCommRanks[rankIdx].rank->commRank, vcc->value,
+                               rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                               rasPeers[peerIdx].pid,
+                               ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                } else {
+                  rasOutAppend("  Rank %d has not launched any operations -- GPU %s managed by process %d "
+                               "on node %s\n", auxCommRanks[rankIdx].rank->commRank,
+                               rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)),
+                               rasPeers[peerIdx].pid,
+                               ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                }
+              }
+            } else { // peerIdx == -1
+              if (vcc->count > 1) {
+                rasOutAppend("  Rank %d -- [process information not found]\n",
+                             auxCommRanks[rankIdx].rank->commRank);
+              } else {
+                if (vcc->value > 0)
+                  rasOutAppend("  Rank %d has launched up to operation %ld -- [process information not found]\n",
+                               auxCommRanks[rankIdx].rank->commRank, vcc->value);
+                else
+                  rasOutAppend("  Rank %d has not launched any operations -- [process information not found]\n",
+                               auxCommRanks[rankIdx].rank->commRank);
+              }
+            } // peerIdx == -1
+          } // for (rankIdx)
+        } // if (rasCountIsOutlier(vcc->count))
+      } // for (coc)
+      rasOutAppend("\n");
+    } // if (inconsistent)
+  } // for (collIdx)
     } // if (auxComm->errors & RAS_ACE_MISMATCH)
   } // for (commIdx)
 } // for (vcIdx)
@@ -1398,20 +1570,26 @@
   rasOutExtract(msg);
   rasClientEnqueueMsg(client, msg, msgLen);
   msg = nullptr;
+
+  TRACE(NCCL_RAS, "RAS: rasClientRunComms: finishing");
 exit:
   free(peerNvmlDevs);
   free(collOpCounts);
   free(valCounts);
   free(peerIdxConv);
-  free(ranksReSorted);
+  free(auxCommRanks);
   free(auxComms);
   return ret;
 fail:
   goto exit;
 }
 
+// Generates detailed info about the errors encountered, be they initialization ones or asynchronous ones.
 static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm,
                                      const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) {
+  // Because the number of possible error kinds is finite and small, we don't bother in this case with allocating
+  // temporary data structures, counting the errors, sorting arrays, etc. Instead, in each iteration we pick the most
+  // numerous error kind, we iterate through the ranks in search of this error, and immediately add it to the output.
   for (;;) {
     int maxCount = 0;
     ncclResult_t maxCountIdx = ncclSuccess;
@@ -1489,17 +1667,20 @@ static void rasOutAppend(const char* format, ...) {
   }
   nRasOutBuffer += needed;
-  assert(nRasOutBuffer <= rasOutBufferSize);
+  if (nRasOutBuffer >= rasOutBufferSize)
+    nRasOutBuffer = rasOutBufferSize - 1; // Should never happen, but just to be extra sure...
 exit:
   ;
 }
 
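
`rasOutAppend` above relies on the usual grow-and-retry `vsnprintf` idiom; the new clamp in the hunk is only a last-resort guard after a successful append. A self-contained sketch under that assumption (the buffer globals and `OUT_INCREMENT` are illustrative stand-ins for the file's `rasOutBuffer` machinery and `RAS_OUT_INCREMENT`):

```
#include <cstdarg>
#include <cstdio>
#include <cstdlib>

static char* outBuf = nullptr;
static int outLen = 0, outCap = 0;
enum { OUT_INCREMENT = 4096 };  // Mirrors RAS_OUT_INCREMENT.

static void outAppend(const char* fmt, ...) {
  for (int attempt = 0; attempt < 2; attempt++) {
    va_list args;
    va_start(args, fmt);
    // With a zero-sized (or null) buffer, vsnprintf just reports the needed length.
    int needed = vsnprintf(outBuf ? outBuf + outLen : nullptr, outCap - outLen, fmt, args);
    va_end(args);
    if (needed < 0) return;          // Encoding error; nothing we can do.
    if (needed < outCap - outLen) {  // It fit, terminating '\0' included.
      outLen += needed;              // The clamp added in the hunk above would go here.
      return;
    }
    // Too small: grow by a multiple of OUT_INCREMENT and retry once.
    int shortfall = needed - (outCap - outLen);
    int newCap = outCap + ((shortfall + OUT_INCREMENT) / OUT_INCREMENT) * OUT_INCREMENT;
    char* p = (char*)realloc(outBuf, newCap);
    if (p == nullptr) return;        // Out of memory; drop the append.
    outBuf = p;
    outCap = newCap;
  }
}

int main() {
  outAppend("NCCL version %d.%d\n", 2, 24);
  outAppend("clients: %s\n", "none");
  fputs(outBuf, stdout);
  free(outBuf);
  return 0;
}
```
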
// Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'. // The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes -// the terminating '\0'). +// the terminating '\0'). Resets the output buffer when done. static void rasOutExtract(char* buffer) { - if (rasOutBuffer) + if (rasOutBuffer) { memcpy(buffer, rasOutBuffer, rasOutLength()); + rasOutReset(); + } } // Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'. @@ -1524,60 +1705,25 @@ exit: // Various sorting callbacks used when grouping/formatting data. // /////////////////////////////////////////////////////////////////// -// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the -// secondary key and the process id as the tertiary key. -static int rasPeersNGpuCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - int c1 = __builtin_popcountll(p1->cudaDevs); - int c2 = __builtin_popcountll(p2->cudaDevs); +// Sorting callback for rasAuxPeerInfo elements. Sorts by value, with the peer's host IP as the secondary key and +// the process id as the tertiary key. +static int rasAuxPeersValueCompare(const void* e1, const void* e2) { + const struct rasAuxPeerInfo* p1 = (const struct rasAuxPeerInfo*)e1; + const struct rasAuxPeerInfo* p2 = (const struct rasAuxPeerInfo*)e2; - if (c1 == c2) { + if (p1->value == p2->value) { // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + int cmp = ncclSocketsHostCompare(&p1->peer->addr, &p2->peer->addr); if (cmp == 0) { // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + cmp = (p1->peer->pid < p2->peer->pid ? -1 : (p1->peer->pid > p2->peer->pid ? 1 : 0)); } return cmp; } else { - return (c1 < c2 ? -1 : 1); + return (p1->value < p2->value ? -1 : 1); } } -// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs. -// Uses the host IP as the secondary key and the process id as the tertiary key. -static int rasPeersNProcsCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - - if (p1->cudaDevs == p2->cudaDevs) { - // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; - } else { - return (p1->cudaDevs < p2->cudaDevs ? -1 : 1); - } -} - -// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather -// than the port). -static int rasPeersHostPidCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the secondary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; -} -
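Two comparator flavors appear in this patch: callbacks over aux structs that embed a peer pointer (rasAuxPeersValueCompare above) and callbacks over plain arrays of pointers (peersHashesCompare later in collectives.cc). The pointer-array flavor needs a double indirection that is easy to get wrong; here is a self-contained sketch with an illustrative Peer type standing in for the RAS structures:

```
// Sketch of a multi-key qsort comparator over an array of *pointers*:
// qsort hands the comparator pointers to the elements, which are themselves
// pointers, hence the Peer** cast. Names and fields are illustrative.
#include <cstdio>
#include <cstdlib>

struct Peer { unsigned long hostHash; long pid; };

static int peerPtrCompare(const void* e1, const void* e2) {
  const Peer* p1 = *(const Peer* const*)e1;
  const Peer* p2 = *(const Peer* const*)e2;
  if (p1->hostHash != p2->hostHash)
    return (p1->hostHash < p2->hostHash ? -1 : 1);
  // Process id is the secondary key.
  return (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0));
}

int main() {
  Peer storage[] = {{2, 10}, {1, 30}, {2, 5}};
  Peer* lookup[3];
  for (int i = 0; i < 3; i++) lookup[i] = storage + i;
  // Sorting the lookup table leaves the backing storage untouched.
  qsort(lookup, 3, sizeof(*lookup), peerPtrCompare);
  for (int i = 0; i < 3; i++)
    printf("hostHash %lu pid %ld\n", lookup[i]->hostHash, lookup[i]->pid);
  return 0;
}
```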
// Sorting callback for ncclSocketAddress. Unlike ncclSocketsCompare, it ignores the port. static int ncclSocketsHostCompare(const void* p1, const void* p2) { const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; @@ -1599,7 +1745,8 @@ static int ncclSocketsHostCompare(const void* p1, const void* p2) { cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); } else { // The only remaining valid cases are empty addresses. - assert(family == 0); + if (family != 0) + INFO(NCCL_RAS, "RAS invalid address family %d -- internal error?", family); cmp = 0; // Two empty addresses are equal... } @@ -1657,24 +1804,16 @@ static int rasAuxCommsCompareRev(const void* p1, const void* p2) { } } -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx. -static int rasCommRanksPeerCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; +// Sorting callback for rasAuxCommRank elements. Sorts by value, with the rank's commRank as the secondary key. +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2) { + const struct rasAuxCommRank* r1 = (const struct rasAuxCommRank*)p1; + const struct rasAuxCommRank* r2 = (const struct rasAuxCommRank*)p2; - return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0)); -} - -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key. -static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - if (r1->collOpCount == r2->collOpCount) { - // Use the rank as the secondary key. - return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0)); + if (r1->value == r2->value) { + // Use the commRank as the secondary key. + return (r1->rank->commRank < r2->rank->commRank ? -1 : (r1->rank->commRank > r2->rank->commRank ? 1 : 0)); } else { - return (r1->collOpCount < r2->collOpCount ? -1 : 1); + return (r1->value < r2->value ? -1 : 1); } } @@ -1705,14 +1844,20 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, return buf; } +// Formats a GPU string based on the CUDA/NVML ids provided. If the CUDA id is different from the NVML id, both are +// printed. +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) { + snprintf(buf, size, "%d", cudaDev); + if (cudaDev != nvmlDev) { + snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev); + } + return buf; +} + // Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are // printed. static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { - snprintf(buf, size, "%d", rank->cudaDev); - if (rank->cudaDev != rank->nvmlDev) { - snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev); - } - return buf; + return rasGpuToString(rank->cudaDev, rank->nvmlDev, buf, size); } // Converts an NCCL error result to a string. @@ -1753,3 +1898,21 @@ static bool rasCountIsOutlier(int count, bool verbose, int totalCount) { (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION); } }
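rasGpuToString above grows its output by re-running snprintf at the current end of the buffer. The append idiom on its own, as a runnable sketch (the device ids are arbitrary examples):

```
// Minimal sketch of the snprintf append idiom used by rasGpuToString.
#include <cstdio>
#include <cstring>

static const char* gpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) {
  snprintf(buf, size, "%d", cudaDev);
  if (cudaDev != nvmlDev) {
    // Append at the current end of the string; the space left shrinks by
    // however much has already been written.
    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev);
  }
  return buf;
}

int main() {
  char buf[32];
  printf("%s\n", gpuToString(2, 5, buf, sizeof(buf))); // prints "2 (NVML 5)"
  return 0;
}
```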
+ +// Invoked during RAS termination to release all the allocated resources. +void rasClientSupportTerminate() { + (void)close(rasClientListeningSocket); + rasClientListeningSocket = -1; + + free(rasOutBuffer); + rasOutBuffer = nullptr; + nRasOutBuffer = rasOutBufferSize = 0; + + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + rasClientTerminate(client); + client = clientNext; + } + + // rasClientsHead and rasClientsTail are taken care of by rasClientTerminate(). +} diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc index 201144f..7283360 100644 --- a/src/ras/collectives.cc +++ b/src/ras/collectives.cc @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! +#define NDEBUG // Comment out during development only! #include #include @@ -12,6 +12,7 @@ #include "checks.h" #include "comm.h" #include "nccl.h" +#include "transport.h" #include "utils.h" #include "ras_internal.h" @@ -32,14 +33,14 @@ static int nRasCollHistory, rasCollHistNextIdx; // Monotonically increased to ensure that each collective originating locally has a unique Id. static uint64_t rasCollLastId; -// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require +// Keeping track of ongoing collective operations (apart from broadcasts, which have no response so require // no such tracking). -struct rasCollective* rasCollectives; -static int nRasCollectives; +struct rasCollective* rasCollectivesHead; +struct rasCollective* rasCollectivesTail; static ncclResult_t getNewCollEntry(struct rasCollective** pColl); static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, - const struct rasCollRequest* req, size_t reqLen, int fromConnIdx); + const struct rasCollRequest* req, size_t reqLen, struct rasConnection* fromConn); static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen); static ncclResult_t rasCollReadyResp(struct rasCollective* coll); static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, @@ -47,12 +48,17 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, const union ncclSocketAddress* peers, int nPeers, const char* data, int nData, int nLegTimeouts); -static ncclResult_t rasCollConnsInit(char** pData, int* pNData); +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg); -static ncclResult_t rasCollCommsInit(char** pData, int* pNData); +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg); +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm); static int ncclCommsCompare(const void* p1, const void* p2); +static int peersHashesCompare(const void* p1, const void* p2); +static int peersHashesSearch(const void* k, const void* e); +static int rasCommIdCompare(const void* p1, const void* p2); +static int rasCollCommsMissingRankSearch(const void* k, const void* e); ///////////////////////////////////////////////////////////////////////////////////////
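Several of the callbacks declared above (peersHashesSearch, rasCollCommsMissingRankSearch) are bsearch comparators in which the key and the array element have different types. A minimal sketch of that asymmetric key-vs-element convention, with illustrative types:

```
// Sketch of a bsearch callback whose key is a bare two-element hash array
// rather than a full element. PeerInfo and the hashes are illustrative.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct PeerInfo { uint64_t hostHash, pidHash; };

// bsearch always passes (key, element) in this order.
static int hashesSearch(const void* k, const void* e) {
  const uint64_t* key = (const uint64_t*)k;
  const PeerInfo* elem = (const PeerInfo*)e;
  if (key[0] != elem->hostHash) return (key[0] < elem->hostHash ? -1 : 1);
  return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0));
}

int main() {
  PeerInfo peers[] = {{1, 10}, {1, 20}, {2, 5}}; // sorted by (hostHash, pidHash)
  uint64_t key[2] = {1, 20};
  PeerInfo* hit = (PeerInfo*)bsearch(key, peers, 3, sizeof(*peers), hashesSearch);
  printf("found: %s\n", hit ? "yes" : "no");
  return 0;
}
```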
@@ -62,22 +68,26 @@ static int ncclCommsCompare(const void* p1, const void* p2); -// Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary. +// Allocates a new rasCollective entry, appends it to the list of ongoing collectives, and returns it. static ncclResult_t getNewCollEntry(struct rasCollective** pColl) { struct rasCollective* coll; - int i; - for (i = 0; i < nRasCollectives; i++) - if (rasCollectives[i].type == RAS_MSG_NONE) - break; - if (i == nRasCollectives) { - NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT)); - nRasCollectives += RAS_INCREMENT; - } + int nRasConns; + + NCCLCHECK(ncclCalloc(&coll, 1)); - coll = rasCollectives+i; - memset(coll, '\0', sizeof(*coll)); coll->startTime = clockNano(); - coll->fromConnIdx = -1; + coll->fromConn = nullptr; // We are unlikely to use the whole array, but at least we won't need to realloc. + nRasConns = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) + nRasConns++; NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns)); + if (rasCollectivesHead) { + rasCollectivesTail->next = coll; + coll->prev = rasCollectivesTail; + rasCollectivesTail = coll; + } else { + rasCollectivesHead = rasCollectivesTail = coll; + } + *pColl = coll; return ncclSuccess; } @@ -95,21 +105,23 @@ void rasCollReqInit(struct rasCollRequest* req) { // in preparation for collective response messages. // pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible // in scenarios such as a total of two peers. -// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless +// pColl provides on return a pointer to the allocated rasCollective structure to track this collective (unless // it's a broadcast, which requires no such tracking). -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx, - int fromConnIdx) { +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone, + struct rasCollective** pColl, struct rasConnection* fromConn) { struct rasCollective* coll = nullptr; + struct rasCollRequest* reqMod = (struct rasCollRequest*)req; + size_t reqLen = 0; if (req->type >= RAS_COLL_CONNS) { // Keep track of this collective operation so that we can handle the responses appropriately. NCCLCHECK(getNewCollEntry(&coll)); - if (pCollIdx) - *pCollIdx = coll-rasCollectives; + if (pColl) + *pColl = coll; memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr)); coll->rootId = req->rootId; coll->type = req->type; coll->timeout = req->timeout; - coll->fromConnIdx = fromConnIdx; + coll->fromConn = fromConn; if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers)); coll->nPeers = 1; @@ -117,9 +129,9 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, // Collective-specific initialization of accumulated data (using local data for now). if (req->type == RAS_COLL_CONNS) - (void)rasCollConnsInit(&coll->data, &coll->nData); + (void)rasCollConnsInit(&reqMod, &reqLen, &coll->data, &coll->nData); else if (req->type == RAS_COLL_COMMS) - (void)rasCollCommsInit(&coll->data, &coll->nData); + (void)rasCollCommsInit(&reqMod, &reqLen, &coll->data, &coll->nData); } else { // req->type < RAS_COLL_CONNS // Add the info to the collective message history. nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
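getNewCollEntry above replaces the old reallocated array with a doubly-linked list anchored by head/tail pointers; rasCollFree later unlinks entries, and the purge/terminate loops save the successor before freeing. The whole lifecycle in a standalone sketch (Node and the ids are hypothetical):

```
// Standalone sketch of the head/tail doubly-linked list pattern this patch
// adopts: append at the tail, unlink anywhere, iterate safely while freeing.
#include <cstdio>
#include <cstdlib>

struct Node { int id; Node* prev; Node* next; };
static Node* head = nullptr;
static Node* tail = nullptr;

static Node* appendNode(int id) {
  Node* n = (Node*)calloc(1, sizeof(Node)); // calloc zeroes prev/next
  n->id = id;
  if (head) {
    tail->next = n;
    n->prev = tail;
    tail = n;
  } else {
    head = tail = n;
  }
  return n;
}

static void freeNode(Node* n) {
  if (n == nullptr) return;
  // Fix up head/tail first, then splice the node out and free it.
  if (n == head) head = head->next;
  if (n == tail) tail = tail->prev;
  if (n->prev) n->prev->next = n->next;
  if (n->next) n->next->prev = n->prev;
  free(n);
}

int main() {
  for (int i = 0; i < 4; i++) appendNode(i);
  // Iterate-and-free: grab the successor before the node is released.
  for (Node* n = head; n;) {
    Node* next = n->next;
    if (n->id % 2 == 0) freeNode(n);
    n = next;
  }
  for (Node* n = head; n; n = n->next) printf("node %d\n", n->id); // 1, 3
  return 0;
}
```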
@@ -131,42 +143,42 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, // Collective-specific message handling. if (req->type == RAS_BC_DEADPEER) { bool done = false; - rasMsgHandleBCDeadPeer(req, &done); + rasMsgHandleBCDeadPeer(&reqMod, &reqLen, &done); if (done) goto exit; } } // req->type < RAS_COLL_CONNS - for (int connIdx = 0; connIdx < nRasConns; connIdx++) - rasConns[connIdx].linkFlag = false; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) + conn->linkFlag = false; - (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx); - (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx); + (void)rasLinkSendCollReq(&rasNextLink, coll, reqMod, reqLen, fromConn); + (void)rasLinkSendCollReq(&rasPrevLink, coll, reqMod, reqLen, fromConn); if (coll && pAllDone) *pAllDone = (coll->nFwdSent == coll->nFwdRecv); exit: + if (reqMod != req) + free(reqMod); return ncclSuccess; } // Sends the collective message through all connections associated with this link (with the exception of the one // the message came from, if any). static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, - const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; - if (!conn->linkFlag) { - // We send collective messages through fully established and operational connections only. - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { - if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr) - coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx; - } // if (conn->sockIdx != -1 && RAS_SOCK_READY) - conn->linkFlag = true; - } // if (!conn->linkFlag) - } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) - } // for (i) + const struct rasCollRequest* req, size_t reqLen, + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { + if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) { + // We send collective messages through fully established and operational connections only. + if (linkConn->conn->sock && linkConn->conn->sock->status == RAS_SOCK_READY && + !linkConn->conn->experiencingDelays) { + if (rasConnSendCollReq(linkConn->conn, req, reqLen) == ncclSuccess && coll != nullptr) + coll->fwdConns[coll->nFwdSent++] = linkConn->conn; + } // linkConn->conn is fully established and operational. + linkConn->conn->linkFlag = true; + } // if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) + } // for (linkConn) return ncclSuccess; }
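rasLinkSendCollReq relies on the per-connection linkFlag, cleared for all connections beforehand, so that a connection reachable through both the next and the prev link still receives the request only once. A distilled sketch (Conn and the link arrays are illustrative):

```
// Sketch of the linkFlag de-duplication: the same connection may appear on
// more than one link, and the origin connection must also be skipped.
#include <cstdio>

struct Conn { int id; bool linkFlag; };

static void sendViaLink(Conn** conns, int n, Conn* fromConn) {
  for (int i = 0; i < n; i++) {
    Conn* c = conns[i];
    if (c && c != fromConn && !c->linkFlag) {
      printf("sending via conn %d\n", c->id);
      c->linkFlag = true; // visited: later links skip this connection
    }
  }
}

int main() {
  Conn a{0, false}, b{1, false}, c{2, false};
  Conn* nextLink[] = {&a, &b};
  Conn* prevLink[] = {&b, &c}; // b appears on both links
  sendViaLink(nextLink, 2, /*fromConn=*/&c);
  sendViaLink(prevLink, 2, /*fromConn=*/&c); // b already flagged, c is origin
  return 0; // only conns 0 and 1 were sent to
}
```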
@@ -190,8 +202,8 @@ static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct // in which case it can immediately send the response. ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { bool allDone = false; - int collIdx = -1; - assert(sock->connIdx != -1); + struct rasCollective* coll = nullptr; + assert(sock->conn); // First check if we've already handled this request (through another connection). for (int i = 0; i < nRasCollHistory; i++) { @@ -202,7 +214,7 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Send an empty response so that the sender can account for it. The non-empty response has already been // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); } goto exit; @@ -211,31 +223,29 @@ if (msg->collReq.type >= RAS_COLL_CONNS) { // Check if we're currently handling this collective request. - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && msg->collReq.rootId == coll->rootId) { assert(msg->collReq.type == coll->type); // Send an empty response so that the sender can account for it. The non-empty response will be // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); goto exit; } // if match - } // for (i) + } // for (coll) } // if (msg->collReq.type >= RAS_COLL_CONNS) // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. - NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + NCCLCHECK(rasNetSendCollReq(&msg->collReq, &allDone, &coll, sock->conn)); if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { - assert(collIdx != -1); + assert(coll); // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer // has more than one connection so there should always be _some_ other peer to forward the request to. - NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + NCCLCHECK(rasCollReadyResp(coll)); } exit: return ncclSuccess; @@ -245,9 +255,9 @@ exit: // Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't // any peers (unlikely), the peers sent their responses (likely), or we timed out). static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { - if (coll->fromConnIdx != -1) { + if (coll->fromConn) { // For remotely-initiated collectives, send the response back. - NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + NCCLCHECK(rasConnSendCollResp(coll->fromConn, &coll->rootAddr, coll->rootId, coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); // Add the identifying info to the collective message history. @@ -302,18 +312,15 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, // the data from the response into the accumulated data. If all the responses have been accounted for, sends the // accumulated response back.
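Before the response handler below, the request/response accounting in a nutshell: each connection a request was forwarded to occupies a slot in fwdConns; a response (or a declared timeout) clears its slot and bumps nFwdRecv, and the aggregated response fires once the counters meet. A hypothetical sketch of that accounting:

```
// Hedged sketch of the fwdConns bookkeeping used by the collective tracking;
// Conn and Coll are illustrative stand-ins for the RAS structures.
#include <cstdio>

struct Conn { int id; };
struct Coll {
  Conn* fwdConns[8]; // connections we forwarded the request to
  int nFwdSent, nFwdRecv;
};

static void accountResponse(Coll* coll, Conn* from) {
  for (int i = 0; i < coll->nFwdSent; i++) {
    if (coll->fwdConns[i] == from) {
      coll->fwdConns[i] = nullptr; // this connection is accounted for
      coll->nFwdRecv++;
      break;
    }
  }
  if (coll->nFwdSent == coll->nFwdRecv)
    printf("all responses in -- sending the aggregated response\n");
}

int main() {
  Conn a{0}, b{1};
  Coll coll{{&a, &b}, 2, 0};
  accountResponse(&coll, &a);
  accountResponse(&coll, &b); // triggers the aggregated response
  return 0;
}
```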
ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { - int collIdx; - struct rasCollective* coll = nullptr; + struct rasCollective* coll; char line[SOCKET_NAME_MAXLEN+1]; - for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { - coll = rasCollectives+collIdx; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && msg->collResp.rootId == coll->rootId) break; } - if (collIdx == nRasCollectives) { + if (coll == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, ncclSocketToString(&sock->sock.addr, rasLine)); @@ -321,11 +328,11 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { } coll->nLegTimeouts += msg->collResp.nLegTimeouts; - assert(sock->connIdx != -1); - // Account for the received response in our collective operation tracking. + assert(sock->conn); + // Account for the received response in our collective operations tracking. for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] == sock->connIdx) { - coll->fwdConns[i] = -1; + if (coll->fwdConns[i] == sock->conn) { + coll->fwdConns[i] = nullptr; break; } } @@ -353,46 +360,53 @@ exit: // Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being // terminated. -void rasCollsPurgeConn(int connIdx) { - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE) { - char line[SOCKET_NAME_MAXLEN+1]; - if (coll->fromConnIdx == connIdx) { - INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", - ncclSocketToString(&coll->rootAddr, line), coll->rootId, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - rasCollFree(coll); - } else { - for (int j = 0; j < coll->nFwdSent; j++) { - if (coll->fwdConns[j] == connIdx) { - coll->fwdConns[j] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - if (coll->nFwdSent == coll->nFwdRecv) - (void)rasCollReadyResp(coll); - break; - } - } // for (j) - } // coll->fromConnIdx != connIdx - } // !RAS_MSG_NONE - } // for (i) +void rasCollsPurgeConn(struct rasConnection* conn) { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConn == conn) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&conn->addr, rasLine)); + rasCollFree(coll); + } else { + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == conn) { + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), coll->rootId, + coll->nFwdSent, 
coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (i) + } // coll->fromConn != conn + coll = collNext; + } // for (coll) } // Frees a rasCollective entry and any memory associated with it. void rasCollFree(struct rasCollective* coll) { + if (coll == nullptr) + return; + free(coll->fwdConns); - coll->fwdConns = nullptr; free(coll->peers); - coll->peers = nullptr; free(coll->data); - coll->data = nullptr; - coll->fromConnIdx = -1; - coll->type = RAS_MSG_NONE; + + if (coll == rasCollectivesHead) + rasCollectivesHead = rasCollectivesHead->next; + if (coll == rasCollectivesTail) + rasCollectivesTail = rasCollectivesTail->prev; + if (coll->prev) + coll->prev->next = coll->next; + if (coll->next) + coll->next->prev = coll->prev; + free(coll); } // Invoked from the main RAS thread loop to handle timeouts of the collectives. @@ -407,64 +421,64 @@ void rasCollFree(struct rasCollective* coll) { // and send back whatever we have. Unfortunately, the peer that the RAS client is connected to will in all likelihood // time out first, so at that point any delayed responses that eventually arrive are likely to be too late... void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { - struct rasCollective* coll = rasCollectives+collIdx; - if (coll->type == RAS_MSG_NONE || coll->timeout == 0) - continue; - - if (now - coll->startTime > coll->timeout) { - // We've exceeded the leg timeout. For all outstanding responses, check their connections. - if (!coll->timeoutWarned) { - INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", - ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, - (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->timeoutWarned = true; - } - for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] != -1) { - struct rasConnection* conn = rasConns+coll->fwdConns[i]; - char line[SOCKET_NAME_MAXLEN+1]; - if (!conn->experiencingDelays && conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - // Ensure that the connection is fully established and operational, and that the socket hasn't been - // re-created during the handling of the collective (which would suggest that the request may have been - // lost). - if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) - continue; - } - // In all other cases we declare a timeout so that we can (hopefully) recover. - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - coll->fwdConns[i] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - } // if (coll->fwdConns[i] != -1) - } // for (i) - if (coll->nFwdSent == coll->nFwdRecv) { - (void)rasCollReadyResp(coll); - } else { - // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they - // must be due to delays at other processes. Presumably those processes will give up waiting soon and the - // (incomplete) responses will arrive shortly, so we should wait a little longer. - if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { - // We've exceeded even the longer timeout, which is unexpected. 
Try to return whatever we have (though - // the originator of the collective, if it's not us, may have timed out already anyway). - INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + if (coll->timeout > 0) { + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; - coll->nFwdRecv = coll->nFwdSent; + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i]) { + struct rasConnection* conn = coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sock) { + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (conn->sock->status == RAS_SOCK_READY && conn->sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i]) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { (void)rasCollReadyResp(coll); } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); - } - } // conn->nFwdRecv < conn->nFwdSent - } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); - } - } // for (collIdx) + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). 
+ INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; + coll->nFwdRecv = coll->nFwdSent; + (void)rasCollReadyResp(coll); + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); + } + } // coll->nFwdRecv < coll->nFwdSent + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); + } + } // if (coll->timeout > 0) + + coll = collNext; + } // for (coll) } @@ -476,15 +490,16 @@ void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well // as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen, // but the system clocks may not be perfectly in sync). -static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN}; struct rasCollConns* pConnsData; + *pReqLen = rasCollDataLength(RAS_COLL_CONNS); + // Update the statistical data first and in the process also calculate how much connection-specific space we // will need. - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeCount > 0) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeCount > 0) { if (connsData.travelTimeMin > conn->travelTimeMin) connsData.travelTimeMin = conn->travelTimeMin; if (connsData.travelTimeMax < conn->travelTimeMax) @@ -502,9 +517,9 @@ static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { pConnsData = (struct rasCollConns*)*pData; memcpy(pConnsData, &connsData, sizeof(*pConnsData)); if (connsData.nNegativeMins > 0) { - for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeMin < 0) { + int negMinsIdx = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeMin < 0) { struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx; memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source)); memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest)); @@ -560,10 +575,26 @@ static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* // Initializes the accumulated data with just the local data for now. // For this particular collective, we keep for every communicator information about every rank, to help identify // the missing ones and the discrepancies between the ones that did respond. -static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { +// For any new (previously unseen) communicator we also save the basic identification data about every rank that is +// "missing" (i.e., not part of this process). During merging, this should be replaced by the actual data from +// those ranks, if they are responsive. We want to provide this information to the user (so that we can say more +// than "rank xyz missing").
+// Every "new" communicator is also recorded in the (updated) request, so that when that request is forwarded to our +// peers, those peers don't needlessly send us the same data. +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { + ncclResult_t ret = ncclSuccess; struct rasCollComms* commsData; - int nComms = 0, nRanks = 0; + int nComms = 0, nRanks = 0, nMissingRanks = 0; + bool skipMissing = false; std::lock_guard lock(ncclCommsMutex); + struct rasCollComms::comm* comm; + struct rasCollRequest* req = nullptr; + struct rasPeerInfo** peersReSorted = nullptr; + int firstNewSkipMissingIdx = -1; + + *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + (*pReq)->comms.nSkipMissingRanksComms * sizeof(*(*pReq)->comms.skipMissingRanksComms); + *pData = nullptr; // Start by counting the communicators so that we know how much space to allocate. // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case @@ -572,77 +603,152 @@ static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare); ncclCommsSorted = true; } - for (int i = 0; i < nNcclComms; i++) { - if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting. + for (int commIdx = 0; commIdx < nNcclComms; commIdx++) { + if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting. break; - if (i == 0) { - nComms = 1; - } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - nComms++; - } - nRanks++; - } + // A process may manage multiple GPUs and thus have multiple communicators with the same commHash. + // Comparing just the commHash is OK though within communicators that are part of the same process. + if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) { + skipMissing = rasCollCommsSkipMissing(*pReq, ncclComms[commIdx]); + if (!skipMissing) { + // Add this communicator to the request so that the processes we forward the request to know not to fill in + // the missing rank info. + struct rasCommId* skipComm; + if (req == nullptr) { + // We pessimistically allocate space for all the remaining communicators so that we don't need to reallocate. + int newSize = *pReqLen + (nNcclComms-commIdx) * sizeof(*req->comms.skipMissingRanksComms); + NCCLCHECKGOTO(ncclCalloc((char**)&req, newSize), ret, fail); + memcpy(req, *pReq, *pReqLen); + *pReq = req; + firstNewSkipMissingIdx = req->comms.nSkipMissingRanksComms; + } + skipComm = req->comms.skipMissingRanksComms + req->comms.nSkipMissingRanksComms++; + skipComm->commHash = ncclComms[commIdx]->commHash; + skipComm->hostHash = ncclComms[commIdx]->peerInfo->hostHash; + skipComm->pidHash = ncclComms[commIdx]->peerInfo->pidHash; - // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent + nMissingRanks += ncclComms[commIdx]->nRanks; + } // if (!skipMissing) + nComms++; + } // if encountered a new communicator + nRanks++; + if (!skipMissing) + nMissingRanks--; + } // for (commIdx) + + // rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent // pointer manipulations somewhat unwieldy... 
- *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); - NCCLCHECK(ncclCalloc(pData, *pNData)); + // This is extra complicated because of the "hidden" array of struct rasCollCommsMissingRank following the + // ranks array for each communicator. + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks) + + nMissingRanks * sizeof(struct rasCollCommsMissingRank); + NCCLCHECKGOTO(ncclCalloc(pData, *pNData), ret, fail); commsData = (struct rasCollComms*)*pData; commsData->nComms = nComms; // comm points at the space in the accumulated data where the info about the current communicator is to be stored. - struct rasCollComms::comm* comm = commsData->comms; - for (int i = 0; i < nNcclComms; i++) { - struct rasCollComms::comm::rank* rank; - ncclResult_t asyncError; - if (ncclComms[i] == nullptr) - break; - if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - if (i > 0) - comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); - comm->commHash = ncclComms[i]->commHash; - comm->commNRanks = ncclComms[i]->nRanks; - comm->nRanks = 0; - } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { - INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - ncclComms[i]->rank, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - if (comm->nRanks == comm->commNRanks) { - INFO(NCCL_RAS, - "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", - comm->commNRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - rank = comm->ranks+comm->nRanks; - rank->commRank = ncclComms[i]->rank; - // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially - // always 0. It will increase after we send this response back to the peer we got the request from. - rank->peerIdx = 0; - rank->collOpCount = ncclComms[i]->collOpCount; - rank->status.initState = ncclComms[i]->initState; - if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; - rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); - rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); - rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); - rank->cudaDev = ncclComms[i]->cudaDev; - rank->nvmlDev = ncclComms[i]->nvmlDev; - comm->nRanks++; - } - assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + comm = commsData->comms; + // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. + for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { + struct ncclComm* ncclComm = ncclComms[commIdx]; - return ncclSuccess; + comm->commId.commHash = ncclComm->commHash; + comm->commId.hostHash = ncclComm->peerInfo->hostHash; + comm->commId.pidHash = ncclComm->peerInfo->pidHash; + comm->commNRanks = ncclComm->nRanks; + comm->nRanks = comm->nMissingRanks = 0; + + // Fill in the comm->ranks array. 
+ for (; commIdx < nNcclComms && ncclComms[commIdx] && ncclComms[commIdx]->commHash == comm->commId.commHash; + commIdx++) { + ncclComm = ncclComms[commIdx]; + struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; + ncclResult_t asyncError; + rank->commRank = ncclComm->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); + rank->status.initState = ncclComm->initState; + if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComm->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComm->cudaDev; + rank->nvmlDev = ncclComm->nvmlDev; + comm->nRanks++; + } // for (commIdx) + + if (firstNewSkipMissingIdx != -1 && + memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { + // Fill in the missingRanks array that follows the comm->ranks. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + if (peersReSorted == nullptr) { + // Create a lookup table to rasPeers that is sorted by hostHash and pidHash, to reduce the complexity of the + // lookups in the missingRankIdx loop below. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) + peersReSorted[peerIdx] = rasPeers+peerIdx; + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesCompare); + } + + comm->nMissingRanks = comm->commNRanks - comm->nRanks; + for (int missingRankIdx = 0, rankIdx = 0; missingRankIdx < comm->nMissingRanks; missingRankIdx++) { + struct rasCollCommsMissingRank* missingRank; + struct ncclPeerInfo* info; + struct rasPeerInfo** peer; + uint64_t key[2]; + // Look for the next "hole" in the ranks array. + while (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == rankIdx+missingRankIdx) + rankIdx++; + + missingRank = missingRanks + missingRankIdx; + missingRank->commRank = rankIdx + missingRankIdx; + info = ncclComm->peerInfo + missingRank->commRank; + key[0] = info->hostHash - ncclComm->commHash; + key[1] = info->pidHash - ncclComm->commHash; + peer = (struct rasPeerInfo**)bsearch(key, peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesSearch); + if (peer) + memcpy(&missingRank->addr, &(*peer)->addr, sizeof(missingRank->addr)); + missingRank->cudaDev = info->cudaDev; + missingRank->nvmlDev = info->nvmlDev; + } // for (missingRankIdx) + + if (++firstNewSkipMissingIdx == req->comms.nSkipMissingRanksComms) + firstNewSkipMissingIdx = -1; + } // if need to fill in the missingRanks + + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks) + + comm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + } // for (collCommIdx) + assert(((char*)comm) - (char*)commsData <= *pNData); + + if (req) { + // Finish updating the request. 
+ *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + req->comms.nSkipMissingRanksComms * sizeof(*req->comms.skipMissingRanksComms); + qsort(req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare); + } +ret: + free(peersReSorted); + return ret; +fail: + if (req) { + free(req); + *pReq = nullptr; + } + free(*pData); + *pData = nullptr; + goto ret; } // Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { - struct rasCollComms* collData; - struct rasCollComms* msgData; + struct rasCollComms* collData; // Data previously stored (locally) by our process. + struct rasCollComms* msgData; // Data just received from another process. int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); ALIGN_SIZE(dataOffset, alignof(int64_t)); @@ -650,7 +756,7 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* collData = (struct rasCollComms*)coll->data; if (msgData->nComms > 0) { - struct rasCollComms* newData = nullptr; + struct rasCollComms* newData = nullptr; // Destination buffer for the merged data. // Allocate the new buffer pessimistically (sized as the sum of the two old ones). NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); @@ -661,25 +767,28 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { int cmp; if (collIdx < collData->nComms && msgIdx < msgData->nComms) - cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + cmp = rasCommIdCompare(&collComm->commId, &msgComm->commId); else cmp = (collIdx < collData->nComms ? -1 : 1); if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + "possible hash collision (0x%lx, 0x%lx, 0x%lx)", collComm->commNRanks, msgComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); - // We try to preserve both separately, although the input data might already be messed up anyway... + // We try to preserve them both separately... } if (cmp == 0) { // Merge the comms. - newComm->commHash = collComm->commHash; + memcpy(&newComm->commId, &collComm->commId, sizeof(newComm->commId)); newComm->commNRanks = collComm->commNRanks; if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { INFO(NCCL_RAS, - "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", - collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->nRanks + msgComm->nRanks, newComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); + newComm->nRanks = newComm->commNRanks; // We'll skip the extras in the loop below. 
} else { newComm->nRanks = collComm->nRanks + msgComm->nRanks; @@ -691,16 +800,18 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* int cmpRank; if (newRankIdx == newComm->commNRanks) break; // Short of failing, the best we can do is skip... - if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) { cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); - else + } else { cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + } // There shouldn't be any overlaps in ranks between different sources. if (cmpRank == 0) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - collComm->ranks[collRankIdx].commRank, newComm->commHash); + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->ranks[collRankIdx].commRank, + newComm->commId.commHash, newComm->commId.hostHash, newComm->commId.pidHash); msgRankIdx++; // Short of failing, the best we can do is skip... } memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : @@ -708,23 +819,63 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* if (cmpRank > 0) { // peerIdx values from msgComm need to shift after merge. newComm->ranks[newRankIdx].peerIdx += coll->nPeers; - } + + if (collComm->nMissingRanks > 0) { + // Remove the corresponding entry from missingRanks. + struct rasCollCommsMissingRank* missingRank; + missingRank = (struct rasCollCommsMissingRank*)bsearch(&newComm->ranks[newRankIdx].commRank, + collComm->ranks+collComm->nRanks, + collComm->nMissingRanks, + sizeof(struct rasCollCommsMissingRank), + rasCollCommsMissingRankSearch); + if (missingRank) { + // Mark the entry as no longer needed. + memset(&missingRank->addr, '\0', sizeof(missingRank->addr)); + } else { + INFO(NCCL_RAS, "RAS failed to find missingRank data -- internal error?"); + } + } // if (collComm->nMissingRanks > 0) + } // if (cmpRank > 0) } // for (newRankIdx) - newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); - collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + if (collComm->nMissingRanks > 0) { + // Copy the missingRanks to newComm, skipping over any no longer needed entries. 
+ union ncclSocketAddress emptyAddr; + struct rasCollCommsMissingRank* collMissingRanks; + struct rasCollCommsMissingRank* newMissingRanks; + int newRankIdx; + + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + collMissingRanks = (struct rasCollCommsMissingRank*)(collComm->ranks+collComm->nRanks); + newMissingRanks = (struct rasCollCommsMissingRank*)(newComm->ranks+newComm->nRanks); + newRankIdx = 0; + for (int collRankIdx = 0; collRankIdx < collComm->nMissingRanks; collRankIdx++) { + if (memcmp(&collMissingRanks[collRankIdx].addr, &emptyAddr, sizeof(emptyAddr))) { + memcpy(newMissingRanks + newRankIdx++, collMissingRanks + collRankIdx, sizeof(*newMissingRanks)); + } + } + newComm->nMissingRanks = newRankIdx; + assert(newComm->nRanks + newComm->nMissingRanks == newComm->commNRanks); + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks) + + newComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); collIdx++; - msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); msgIdx++; } else if (cmp < 0) { // Copy from collComm. - int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, collComm, commSize); newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); collIdx++; } else { // cmp > 0 // Copy from msgComm. - int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, msgComm, commSize); for (int i = 0; i < newComm->nRanks; i++) { // peerIdx values from msgComm need to shift after merge. @@ -745,18 +896,87 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* return ncclSuccess; } +// Checks if a given communicator is in the skipMissingRanksComms array of the request. +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm) { + struct rasCommId id; + id.commHash = comm->commHash; + id.hostHash = comm->peerInfo->hostHash; + id.pidHash = comm->peerInfo->pidHash; + return (bsearch(&id, req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare) != nullptr); +} + // Sorting callback for the ncclComms array. static int ncclCommsCompare(const void* p1, const void* p2) { - const ncclComm** pc1 = (const ncclComm**)p1; - const ncclComm** pc2 = (const ncclComm**)p2; + const ncclComm* comm1 = *(const ncclComm**)p1; + const ncclComm* comm2 = *(const ncclComm**)p2; // Put nullptr's at the end. - if (*pc1 == nullptr || *pc2 == nullptr) - return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + if (comm1 == nullptr || comm2 == nullptr) + return (comm1 != nullptr ? -1 : (comm2 != nullptr ? 
1 : 0)); - if ((*pc1)->commHash == (*pc2)->commHash) { - return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0)); + if (comm1->commHash == comm2->commHash) { + return (comm1->rank < comm2->rank ? -1 : (comm1->rank > comm2->rank ? 1 : 0)); } else { - return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1); + return (comm1->commHash < comm2->commHash ? -1 : 1); } } + +// Sorting callback for a lookup table to rasPeers. Sorts by the hostHash (primary) and pidHash (secondary). +static int peersHashesCompare(const void* p1, const void* p2) { + const struct rasPeerInfo* pi1 = *(const struct rasPeerInfo**)p1; + const struct rasPeerInfo* pi2 = *(const struct rasPeerInfo**)p2; + + if (pi1->hostHash == pi2->hostHash) { + return (pi1->pidHash < pi2->pidHash ? -1 : (pi1->pidHash > pi2->pidHash ? 1 : 0)); + } else { + return (pi1->hostHash < pi2->hostHash ? -1 : 1); + } +} + +// Search callback for a lookup table to rasPeers. Searches by the hostHash and pidHash. The key is an array +// containing the hostHash at index 0 and the pidHash at index 1. +static int peersHashesSearch(const void* k, const void* e) { + const uint64_t* key = (const uint64_t*)k; + const struct rasPeerInfo* elem = *(const struct rasPeerInfo**)e; + + if (key[0] == elem->hostHash) { + return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0)); + } else { + return (key[0] < elem->hostHash ? -1 : 1); + } +} + +// Sorting/searching callback for struct rasCommId. Sorts by commHash, then hostHash, then pidHash. +static int rasCommIdCompare(const void* p1, const void* p2) { + const struct rasCommId* i1 = (const struct rasCommId*)p1; + const struct rasCommId* i2 = (const struct rasCommId*)p2; + if (i1->commHash == i2->commHash) { + if (i1->hostHash == i2->hostHash) { + return (i1->pidHash < i2->pidHash ? -1 : (i1->pidHash > i2->pidHash ? 1 : 0)); + } else { + return (i1->hostHash < i2->hostHash ? -1 : 1); + } + } else { + return (i1->commHash < i2->commHash ? -1 : 1); + } +} + +// Search callback for rasCollComms::comm rasCollCommsMissingRank array. The key is the commRank. +static int rasCollCommsMissingRankSearch(const void* k, const void* e) { + int key = *(const int*)k; + const struct rasCollCommsMissingRank* elem = (const struct rasCollCommsMissingRank*)e; + + return (key < elem->commRank ? -1 : (key > elem->commRank ? 1 : 0)); +} + +// Invoked during RAS termination to release all the allocated resources. +void rasCollectivesTerminate() { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + rasCollFree(coll); + coll = collNext; + } + + // rasCollectivesHead and rasCollectivesTail are taken care of by rasCollFree(). 
+} diff --git a/src/ras/peers.cc b/src/ras/peers.cc index f2692d3..8573209 100644 --- a/src/ras/peers.cc +++ b/src/ras/peers.cc @@ -40,10 +40,11 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1); static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1); + struct rasRankInit* ranks = nullptr, int nranks = 0, + struct rasConnection* fromConn = nullptr); static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx); + struct rasConnection* fromConn); static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks); ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock); @@ -146,6 +147,8 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks rankPeer->pid = rank->pid; rankPeer->cudaDevs = (1UL << rank->cudaDev); rankPeer->nvmlDevs = (1UL << rank->nvmlDev); + rankPeer->hostHash = rank->hostHash; + rankPeer->pidHash = rank->pidHash; rankPeerIdx++; // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how @@ -357,12 +360,12 @@ int rasPeerFind(const union ncclSocketAddress* addr) { // ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members // of the new communicator being established), and who thus don't need to be notified. updateDeadPeers can // be used, however, to request at least the propagation of rasDeadPeers to such peers. -// fromConnIdx -- if provided -- identified the connection used to receive this update; there's no need to +// fromConn -- if provided -- identifies the connection used to receive this update; there's no need to // propagate the update back through it. // Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new // connections as needed. static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks, int nranks, int fromConnIdx) { + struct rasRankInit* ranks, int nranks, struct rasConnection* fromConn) { ncclResult_t ret = ncclSuccess; // Do we actually have anything to do? @@ -371,8 +374,8 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN // Start by propagating the update through the RAS network links. We consider any errors during this process // to be non-fatal (we can re-sync later around a keep-alive exchange). - (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); - (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); + (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); // Calculate new link peers and open new connections if needed.
NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); @@ -388,15 +391,13 @@ fail: // for the explanation of the function arguments. static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { // Note that we don't send the update via the connection that we received this notification from in the first // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; + if (linkConn->conn && linkConn->conn != fromConn) { // Failed propagations are not considered fatal (we will retry after a keep-alive). - (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + (void)rasConnPropagateUpdate(linkConn->conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); } } @@ -407,7 +408,7 @@ static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct ra // arguments. static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { // If we have the rank info, check if the peer on the other side of this connection has participated in the new // communicator. int connRank = -1; @@ -462,7 +463,8 @@ ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct ras msg->peersUpdate.deadPeersHash = rasDeadPeersHash; msg->peersUpdate.nDeadPeers = nDeadPeers; memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); - memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + if (nDeadPeers > 0) + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); if (nPeers > 0) conn->lastSentPeersHash = rasPeersHash; @@ -485,8 +487,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) ncclResult_t ret = ncclSuccess; struct rasMsg* newMsg = nullptr; int newMsgLen = 0; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; + assert(sock->conn); int nPeers, nDeadPeers; int deadPeersOffset = 0; bool updatePeers, updateDeadPeers; @@ -496,8 +497,8 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); - conn->lastRecvPeersHash = msg->peersUpdate.peersHash; - conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + sock->conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + sock->conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need // to send it. We'll find out by comparing the hash values after the merge. 
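In `rasConnSendPeersUpdate` above, a single allocation carries two variable-length arrays: the `peers` array (a trailing zero-length member) and the `deadPeers` array placed at an aligned offset behind it, which is why the `memcpy` is now guarded by `nDeadPeers > 0`. The sizing arithmetic can be sketched as follows (stand-in types; this `ALIGN_SIZE` is reproduced only for illustration):

```
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Illustrative stand-ins, not the real NCCL types.
struct peerInfo { char name[13]; };     // Odd size, so alignment matters.
struct deadPeer { uint64_t addr[4]; };  // Requires 8-byte alignment.

// Round size up to a multiple of align (mirrors NCCL's ALIGN_SIZE idea).
#define ALIGN_SIZE(size, align) ((size) = (((size) + (align) - 1) / (align)) * (align))

struct peersUpdateMsg {
  int nPeers, nDeadPeers;
  struct peerInfo peers[0];             // Variable length; deadPeers follows.
};

int main(void) {
  int nPeers = 3, nDeadPeers = 2;

  // Header plus the trailing peers array...
  size_t msgLen = sizeof(struct peersUpdateMsg) + nPeers * sizeof(struct peerInfo);
  // ...then round up so deadPeers starts at a properly aligned offset.
  ALIGN_SIZE(msgLen, _Alignof(struct deadPeer));
  size_t deadPeersOffset = msgLen;
  msgLen += nDeadPeers * sizeof(struct deadPeer);

  struct peersUpdateMsg* msg = calloc(1, msgLen);
  msg->nPeers = nPeers;
  msg->nDeadPeers = nDeadPeers;
  if (nDeadPeers > 0)                   // Same guard as in the patch.
    memset((char*)msg + deadPeersOffset, 0xff, nDeadPeers * sizeof(struct deadPeer));

  printf("header %zu, deadPeersOffset %zu, total %zu\n",
         sizeof(struct peersUpdateMsg), deadPeersOffset, msgLen);
  free(msg);
  return 0;
}
```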
@@ -545,15 +546,15 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) rasDeadPeersDump(); // If post-merge the hashes are still different, send our (dead) peers back. - updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); - updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && - conn->lastRecvDeadPeersHash != rasDeadPeersHash); + updatePeers = (sock->conn->lastSentPeersHash != rasPeersHash && sock->conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (sock->conn->lastSentDeadPeersHash != rasDeadPeersHash && + sock->conn->lastRecvDeadPeersHash != rasDeadPeersHash); if (updatePeers || updateDeadPeers) { newMsg->peersUpdate.peersHash = rasPeersHash; newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; if (updatePeers) { assert(nPeers > 0); - conn->lastSentPeersHash = rasPeersHash; + sock->conn->lastSentPeersHash = rasPeersHash; } else { // If hashes match, make sure that we don't send the rasPeers back. newMsg->peersUpdate.nPeers = 0; @@ -564,14 +565,14 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) if (updateDeadPeers) { assert(nRasDeadPeers > 0); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); deadPeersOffset = newMsgLen; newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; } else { newMsg->peersUpdate.nDeadPeers = 0; @@ -580,13 +581,13 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); - rasConnEnqueueMsg(conn, newMsg, newMsgLen); + rasConnEnqueueMsg(sock->conn, newMsg, newMsgLen); newMsg = nullptr; } // if (updatePeers || updateDeadPeers) // Propagate the changes through our RAS network links. NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, - sock->connIdx), ret, fail); + sock->conn), ret, fail); } exit: @@ -603,7 +604,7 @@ fail: // Reinitializes the connection(s) of a particular link, following a peers update. // Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. -// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in rasLinkConn // structures, so it's better to drop it all and recalculate from scratch. // We recalculate the primary peer; if an active connection to it already exists, then we're done. If there // is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and @@ -611,77 +612,51 @@ fail: // External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). 
static ncclResult_t rasLinkReinitConns(struct rasLink* link) { struct rasLinkConn* linkConn; - struct rasConnection* conn = nullptr; int newPeerIdx = myPeerIdx; - if (link->connsSize == 0) { - link->connsSize = RAS_INCREMENT; - NCCLCHECK(ncclCalloc(&link->conns, link->connsSize)); + if (link->conns) { + // Free the old contents but keep the first entry for convenience (though wipe it). + for (struct rasLinkConn* linkConn = link->conns->next; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + memset(link->conns, '\0', sizeof(*link->conns)); + link->lastUpdatePeersTime = 0; + } else { // link->conns == nullptr + NCCLCHECK(ncclCalloc(&link->conns, 1)); } - link->nConns = 0; - // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays. - while (newPeerIdx != -1) { - if (link->nConns == link->connsSize) { - NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); - link->connsSize += RAS_INCREMENT; - } + // Fill in the entry for the primary connection. + linkConn = link->conns; + linkConn->peerIdx = newPeerIdx = rasLinkCalculatePeer(link, myPeerIdx, /*isFallback*/false); + linkConn->conn = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : nullptr); + linkConn->external = false; - newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1); - if (newPeerIdx == -1) { - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); - if (link->nConns > 0) - break; - } - linkConn = link->conns+link->nConns; - linkConn->peerIdx = newPeerIdx; - linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1); - linkConn->external = false; - - // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration. - // Depending on the circumstances, we may first need to create that connection. - if (linkConn->connIdx == - 1) { - if (link->nConns == 0) { - if (linkConn->peerIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", - link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), - ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) - // to avoid races and the creation of duplicate connections. - if (myPeerIdx < linkConn->peerIdx) { - NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); - } - else { // If we didn't initiate the connection, start the timeout. 
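The rule in the branch just above deserves a note: both endpoints compute the same link topology, so letting only the side with the lower peer index dial (while the other side merely arms a timeout) avoids the race where both peers connect simultaneously and end up with duplicate connections. Reduced to its essence (hypothetical helper name):

```
#include <stdbool.h>
#include <stdio.h>

// Both sides evaluate the same predicate, so exactly one of them dials
// and the other waits, with a timeout as a safety net.
static bool shouldInitiate(int myPeerIdx, int remotePeerIdx) {
  return myPeerIdx < remotePeerIdx;
}

int main(void) {
  int a = 3, b = 7;
  printf("peer %d dials: %d\n", a, shouldInitiate(a, b));  // 1: lower index dials.
  printf("peer %d dials: %d\n", b, shouldInitiate(b, a));  // 0: higher index waits.
  return 0;
}
```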
- link->lastUpdatePeersTime = clockNano(); - } - } // if (linkConn->peerIdx != -1) - } else { // link->nConns > 0 - INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx)); - } // link->nConns > 0 - } else { // linkConn->connIdx != -1 - if (link->nConns == 0) { - INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", - link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + if (linkConn->conn == nullptr) { + if (linkConn->peerIdx != -1) { + // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) + // to avoid races and the creation of duplicate connections. + INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", + link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + if (myPeerIdx < linkConn->peerIdx) { + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn)); } - } - link->nConns++; - if (linkConn->connIdx == -1) - break; - conn = rasConns+linkConn->connIdx; - - // We check if the connection already went through the fallback calculation; if so, we'll need to create a new - // fallback in the next iteration, to ensure that RAS will keep retrying. - if (!conn->experiencingDelays) - break; + else { // If we didn't initiate the connection, start the timeout. + link->lastUpdatePeersTime = clockNano(); + } + } // if (linkConn->peerIdx != -1) + } else { // linkConn->conn + INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", + link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } // linkConn->conn + if (linkConn->conn && linkConn->conn->experiencingDelays) { INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9, - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + linkConn->conn->experiencingDelays, (clockNano()-linkConn->conn->startRetryTime)/1e9, + (linkConn->conn->sock ? linkConn->conn->sock->status : - 1)); + NCCLCHECK(rasLinkAddFallback(link, linkConn->conn)); } return ncclSuccess; @@ -701,39 +676,37 @@ int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallbac if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) { // peerIdx is a fallback and it is not running on the same node as us. int tryPeerIdx = newPeerIdx; - int tryConnIdx = -1; + struct rasConnection* tryConn = nullptr; // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a // little suboptimal one. 
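The loop below steps around the peer ring with modulo arithmetic, and both the old and the new stepping expressions add `nRasPeers` before applying `%`. That is deliberate: C's division truncates toward zero, so a negative left operand (possible when `direction` is -1) would yield a negative remainder. A quick illustration:

```
#include <stdio.h>

// Step around a ring of n peers in the given direction (+1 or -1).
// Adding n first keeps the left operand non-negative, since C's %
// returns a negative remainder for a negative left operand.
static int ringStep(int idx, int direction, int n) {
  return (idx + direction + n) % n;
}

int main(void) {
  int n = 5;
  printf("%d\n", ringStep(0, -1, n));  // 4, not -1.
  printf("%d\n", ringStep(4, +1, n));  // 0: wraps forward.
  printf("%d\n", (0 - 1) % n);         // -1 under C99's truncating division.
  return 0;
}
```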
while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) { if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) { - tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr); - if (tryConnIdx != -1) { - struct rasConnection* tryConn = rasConns+tryConnIdx; + tryConn = rasConnFind(&rasPeers[tryPeerIdx].addr); + if (tryConn) { // Check if the connection is fully established and operational, i.e., if the underlying socket // is ready and there's been recent communication on it. - if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY && - !tryConn->experiencingDelays) { + if (tryConn->sock && tryConn->sock->status == RAS_SOCK_READY && !tryConn->experiencingDelays) { // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in // this case. This is the only case when tryConnIdx != -1 after this loop. break; } - } // if (tryConnIdx != -1) + } // if (tryConn) } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) - tryConnIdx = -1; - tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers; + tryConn = nullptr; + tryPeerIdx = (tryPeerIdx + link->direction + nRasPeers) % nRasPeers; if (tryPeerIdx == myPeerIdx) break; } - if (tryConnIdx == -1) + if (tryConn == nullptr) newPeerIdx = tryPeerIdx; if (tryPeerIdx == myPeerIdx) break; } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) - + if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) { newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers; } @@ -932,7 +905,8 @@ bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSock static void rasPeersDump() { for (int p = 0; p < nRasPeers; p++) { const struct rasPeerInfo* peer = rasPeers+p; - INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : "")); + INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), + (p == myPeerIdx ? " [this process]" : "")); } if (nRasPeers > 0) INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash); @@ -958,3 +932,17 @@ static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nr rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2))); return result; } + +// Invoked during RAS termination to release all the allocated resources. +void rasPeersTerminate() { + free(rasPeers); + rasPeers = nullptr; + nRasPeers = 0; + rasPeersHash = 0; + myPeerIdx = -1; + + free(rasDeadPeers); + rasDeadPeers = nullptr; + nRasDeadPeers = rasDeadPeersSize = 0; + rasDeadPeersHash = 0; +} diff --git a/src/ras/ras.cc b/src/ras/ras.cc index 4905d7a..8ef551c 100644 --- a/src/ras/ras.cc +++ b/src/ras/ras.cc @@ -4,8 +4,10 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out during development only! -#include <cassert> +// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. +#include <bits/c++config.h> +#undef _GLIBCXX_VISIBILITY +#define _GLIBCXX_VISIBILITY(V) #include #include #include @@ -65,8 +67,8 @@ int nNcclComms = 0; bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank.
static ncclResult_t rasLocalNotify(const struct rasNotification* msg); -static ncclResult_t rasLocalHandle(); -static void rasLocalHandleTerminate(); +static ncclResult_t rasLocalHandle(bool* terminate); +static void rasThreadCleanup(); static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); @@ -74,6 +76,8 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); +static void rasTerminate() __attribute__((destructor)); + NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); ////////////////////////////////////////////////// @@ -105,7 +109,6 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); ncclSetThreadName(rasThread, "NCCL RAS"); - (void)pthread_detach(rasThread); rasInitialized = true; } @@ -157,18 +160,27 @@ ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { } } } - if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { - struct rasNotification msg; - msg.type = RAS_TERMINATE; - NCCLCHECK(rasLocalNotify(&msg)); - } + ncclAtomicRefCountDecrement(&rasInitRefCount); return ncclSuccess; } +// Global destructor. Notifies the RAS thread to release all the resources +// and terminate. Waits for the thread to terminate. +static void rasTerminate() { + struct rasNotification msg; + if (!rasInitialized) + return; + memset(&msg, '\0', sizeof(msg)); + msg.type = RAS_TERMINATE; + if (rasLocalNotify(&msg) == ncclSuccess) + (void)pthread_join(rasThread, nullptr); +} + // Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within // the communicator. ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { struct rasNotification msg; + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_ADD_RANKS; msg.addRanks.ranks = ranks; msg.addRanks.nranks = nranks; @@ -199,7 +211,7 @@ static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { ///////////////////////////////////////////////////////////////////////////////// // Handles asynchronous local notifications arriving from regular NCCL threads. -static ncclResult_t rasLocalHandle() { +static ncclResult_t rasLocalHandle(bool* terminate) { struct rasNotification msg; size_t done = 0; @@ -212,9 +224,11 @@ static ncclResult_t rasLocalHandle() { } if (msg.type == RAS_ADD_RANKS) { - NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + (void)rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks); + // Not great if the above fails, but it shouldn't be critical; better to keep going. } else if (msg.type == RAS_TERMINATE) { - rasLocalHandleTerminate(); + INFO(NCCL_RAS, "RAS handling local termination request"); + *terminate = true; } else { WARN("RAS received unknown notification type %d", msg.type); return ncclInternalError; @@ -223,10 +237,35 @@ static ncclResult_t rasLocalHandle() { return ncclSuccess; } -// Handles local RAS_TERMINATE notification. -static void rasLocalHandleTerminate() { - INFO(NCCL_RAS, "RAS handling local termination request"); - // For now we don't do anything. +// Cleans up local RAS state, normally in response to a RAS_TERMINATE notification. 
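The termination path above relies on a GCC/Clang destructor: `rasTerminate` runs at process exit or library unload, pokes the RAS thread through the notification pipe, and then joins it, hence the removed `pthread_detach`. The pattern in miniature (hypothetical names; build with `-pthread`):

```
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int notifyPipe[2];
static pthread_t worker;
static int initialized = 0;

static void* workerMain(void* arg) {
  char cmd;
  // Block until the destructor sends the terminate command.
  while (read(notifyPipe[0], &cmd, 1) == 1) {
    if (cmd == 'T') break;  // Equivalent of RAS_TERMINATE.
  }
  printf("worker: cleaning up and exiting\n");
  return NULL;
}

// Runs automatically at process exit / library unload.
__attribute__((destructor)) static void terminate(void) {
  if (!initialized) return;
  char cmd = 'T';
  if (write(notifyPipe[1], &cmd, 1) == 1)
    (void)pthread_join(worker, NULL);  // Wait for the worker's cleanup.
}

int main(void) {
  if (pipe(notifyPipe) != 0) return 1;
  if (pthread_create(&worker, NULL, workerMain, NULL) != 0) return 1;
  initialized = 1;
  printf("main: returning; the destructor stops the worker\n");
  return 0;
}
```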
+static void rasThreadCleanup() { + rasClientSupportTerminate(); + rasNetTerminate(); + rasCollectivesTerminate(); + rasPeersTerminate(); + + { + std::lock_guard<std::mutex> lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + // rasClientListeningSocket is taken care of by rasClientSupportTerminate(). + rasNotificationPipe[0] = rasNotificationPipe[1] = -1; + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitRefCount = 0; + rasInitialized = false; + } + + { + std::lock_guard<std::mutex> lock(ncclCommsMutex); + free(ncclComms); + ncclComms = nullptr; + nNcclComms = 0; + ncclCommsSorted = false; + } + + free(rasPfds); + rasPfds = nullptr; + nRasPfds = 0; } @@ -270,10 +309,10 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms else ncclIntruQueueEnqueue(&conn->sendQ, meta); - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { - rasPfds[sock->pfd].events |= POLLOUT; + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY || + (conn->sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[conn->sock->pfd].events |= POLLOUT; ready = true; } } @@ -283,31 +322,31 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", msg->type, ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + (conn->sock ? conn->sock->status : -1)); } } // Attempts to send the queued RAS messages to another RAS thread. ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { - struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; struct rasMsgMeta* meta; *closed = 0; while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { - if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + if (conn->sock->status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { // We don't send anything beyond the handshake at this point. meta = nullptr; break; } if (meta->offset < sizeof(meta->length)) { // Send the length of the message. - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, &meta->length, sizeof(meta->length), + &meta->offset, closed)); if (*closed) return ncclSuccess; if (meta->offset < sizeof(meta->length)) break; } // Send the body of the message.
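Each queued message travels as a 32-bit length followed by the body, and `meta->offset` counts bytes across prefix and body together, so the body send below resumes exactly where the prefix send stopped. The bookkeeping in isolation (`progressSend` is a hypothetical stand-in for `ncclSocketProgress` that just consumes up to `cap` bytes):

```
#include <stdio.h>
#include <string.h>

// Stand-in for a non-blocking send: consumes up to cap bytes, advances *offset.
static void progressSend(const char* buf, int size, int* offset, int cap) {
  int n = size - *offset;
  if (n > cap) n = cap;
  // A real implementation would write() buf+*offset here; we only track progress.
  *offset += n;
}

int main(void) {
  int length = 10;                      // Body length, sent first.
  char frame[4 + 10];                   // Length prefix followed by the body.
  memcpy(frame, &length, sizeof(length));
  memset(frame + 4, 'x', length);

  int offset = 0;                       // Spans prefix and body together.
  while (offset < (int)sizeof(frame)) {
    if (offset < (int)sizeof(length))
      progressSend(frame, sizeof(length), &offset, 3);  // Still in the prefix.
    else
      progressSend(frame, sizeof(frame), &offset, 6);   // Into the body.
    printf("offset now %d\n", offset);  // 3, 4, 10, 14.
  }
  return 0;
}
```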
- NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, ((char*)&meta->msg)-sizeof(meta->length), meta->length+sizeof(meta->length), &meta->offset, closed)); if (*closed) return ncclSuccess; @@ -377,7 +416,7 @@ ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { ncclResult_t ret = ncclSuccess; struct rasConnection* conn = nullptr; - int connIdx, peerIdx; + int peerIdx; struct rasMsg* newMsg = nullptr; int newMsgLen; char line[SOCKET_NAME_MAXLEN+1]; @@ -406,19 +445,16 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc } // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). - connIdx = rasConnFind(&msg->connInit.listeningAddr); - if (connIdx != -1) { - conn = rasConns+connIdx; - + conn = rasConnFind(&msg->connInit.listeningAddr); + if (conn) { INFO(NCCL_RAS, "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); - if (conn->sockIdx != -1) { - struct rasSocket* connSock = rasSockets+conn->sockIdx; + if (conn->sock) { INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", - connSock->status, (clockNano()-connSock->createTime)/1e9); + conn->sock->status, (clockNano()-conn->sock->createTime)/1e9); // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have // a race where both sides attempt to establish a connection at roughly the same time, so the other side's // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. @@ -433,21 +469,19 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc goto exit; } else { INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); - rasSocketTerminate(connSock); + rasSocketTerminate(conn->sock); } } - } - if (!conn) { + } else { // conn == nullptr NCCLCHECK(getNewConnEntry(&conn)); memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); - connIdx = conn - rasConns; } sock->status = RAS_SOCK_READY; // rasConnResume will reset any experiencingDelays, startRetryTime, etc. - conn->sockIdx = sock-rasSockets; - sock->connIdx = connIdx; + conn->sock = sock; + sock->conn = conn; memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); // Make sure that the connection is part of the right links forming the RAS network. At this point we only @@ -456,8 +490,8 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before // the peers update. if (peerIdx != -1) { - (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); - (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + (void)rasLinkConnUpdate(&rasNextLink, conn, peerIdx); + (void)rasLinkConnUpdate(&rasPrevLink, conn, peerIdx); } // Send a confirmation to the server that requested the connection (so that the resilience code can mark @@ -504,12 +538,13 @@ static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct ras } // Handles the deadPeer broadcast. 
-void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { - INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&(*pReq)->deadPeer.addr, rasLine)); - if (!rasPeerIsDead(&req->deadPeer.addr)) { - rasConnDisconnect(&req->deadPeer.addr); - (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pReqLen = rasCollDataLength(RAS_BC_DEADPEER); + if (!rasPeerIsDead(&(*pReq)->deadPeer.addr)) { + rasConnDisconnect(&(*pReq)->deadPeer.addr); + (void)rasPeerDeclareDead(&(*pReq)->deadPeer.addr); *pDone = false; } else { INFO(NCCL_RAS, "RAS already knew it was dead"); @@ -530,6 +565,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock) { INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_MSG_CONNINITACK; msg.connInitAck.nack = 1; offset = 0; @@ -557,16 +593,16 @@ static void* rasThreadMain(void*) { INFO(NCCL_RAS, "RAS thread started"); // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasNotificationPipe[0]; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); - NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, exit); rasPfds[pfd].fd = rasNetListeningSocketFd; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasClientListeningSocket; rasPfds[pfd].events = POLLIN; @@ -595,32 +631,37 @@ static void* rasThreadMain(void*) { if (rasPfds[pollIdx].revents) { nEvents--; if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { - (void)rasLocalHandle(); + bool terminate = false; + NCCLCHECKGOTO(rasLocalHandle(&terminate), ret, exit); + if (terminate) + goto exit; } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { (void)rasNetAcceptNewSocket(); } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { (void)rasClientAcceptNewSocket(); } else { // Check if it's one of the RAS sockets. - int sockIdx; - for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; - if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { - rasSockEventLoop(sockIdx, pollIdx); + struct rasSocket* sock; + for (sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sock, pollIdx); break; } - } // for (sockIdx) + sock = sockNext; + } // for (sock) - if (sockIdx == nRasSockets) { + if (sock == nullptr) { // Try a client socket instead. 
- for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { - struct rasClient* client = rasClients+clientIdx; - if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { - rasClientEventLoop(clientIdx, pollIdx); + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + if (rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(client, pollIdx); break; } - } // for (clientIdx) - } // if (sockIdx == nRasSockets) + client = clientNext; + } // for (client) + } // if (sock == nullptr) } // dynamic fds } // if (revents) } // for (pollIdx) @@ -636,14 +677,9 @@ static void* rasThreadMain(void*) { rasCollsHandleTimeouts(now, &nextWakeup); } // for (;;) -fail: - WARN("fatal error - RAS thread terminating"); - std::lock_guard lock(rasInitMutex); - (void)close(rasNotificationPipe[1]); - (void)close(rasNotificationPipe[0]); - (void)close(rasClientListeningSocket); - (void)ncclSocketClose(&rasNetListeningSocket); - rasInitialized = false; +exit: + rasThreadCleanup(); + INFO(NCCL_RAS, "RAS thread terminating"); return nullptr; } diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 715fff4..17326c3 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -42,6 +42,14 @@ typedef enum { RAS_COLL_COMMS = 1002, // Collect data about all communicators. } rasCollectiveType; +// Unique communicator identifier. commHash by itself is definitely not guaranteed to be unique. +// Combined with the two other hashes, the chance is much better... +// All three fields are used for sorting. +struct rasCommId { + uint64_t commHash; + uint64_t hostHash, pidHash; // These are the hashes of the *first* rank (comm->peerInfo[0]). +}; + // Payload of a collective request message (RAS_MSG_COLLREQ). struct rasCollRequest { union ncclSocketAddress rootAddr; @@ -56,6 +64,10 @@ struct rasCollRequest { struct { } conns; struct { + int nSkipMissingRanksComms; // Number of elements in the array below. + // Communicators for which we do *not* need the missingRanks data in the responses + // (see struct rasCollCommsMissingRank later). + struct rasCommId skipMissingRanksComms[0]; // Variable length, sorted. } comms; }; }; @@ -69,8 +81,8 @@ struct rasCollResponse { int nPeers; int nData; // Size of data in bytes. union ncclSocketAddress peers[0]; // Variable length. - // The peersAddrs array is followed by: - //alignas(int64_t) char data[0]; // Variable length, collective-dependent. + // The peers array is followed by: + // alignas(int64_t) char data[0]; // Variable length, collective-dependent. }; // Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each @@ -80,6 +92,8 @@ struct rasPeerInfo { pid_t pid; uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. + uint64_t hostHash, pidHash; // Taken from ncclComm, but with the commHash subtracted to make it + // communicator-independent. }; // Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host @@ -112,7 +126,7 @@ struct rasMsg { int nPeers; int nDeadPeers; struct rasPeerInfo peers[0]; // Variable length. - // The peers array is followed by the following: + // The peers array is followed by: //union ncclSocketAddress deadPeers[0]; // Variable length. 
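A note on the new `hostHash`/`pidHash` fields in `rasPeerInfo` above: the per-rank hashes stored in `ncclComm` have the communicator's `commHash` mixed in, so RAS stores them with the `commHash` subtracted to get communicator-independent values that can be matched across communicators. Assuming additive mixing, as the comment implies, the idea reduces to:

```
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t hostHashBase = 0xabcdef12345678ULL;  // Hypothetical per-host hash.
  uint64_t commHashA = 1111, commHashB = 2222;  // Two communicators.

  // As seen inside each ncclComm: the stored hashes differ per communicator.
  uint64_t hostHashInA = hostHashBase + commHashA;
  uint64_t hostHashInB = hostHashBase + commHashB;

  // Subtracting the commHash recovers a communicator-independent value,
  // so entries originating from different comms compare equal.
  printf("%d\n", (hostHashInA - commHashA) == (hostHashInB - commHashB));  // 1
  return 0;
}
```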
} peersUpdate; struct { @@ -218,6 +232,9 @@ struct rasMsgMeta { // Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response). // For every collective operation, each participating RAS thread will create its own. struct rasCollective { + struct rasCollective* next; + struct rasCollective* prev; + union ncclSocketAddress rootAddr; uint64_t rootId; @@ -227,15 +244,16 @@ struct rasCollective { bool timeoutWarned; int64_t startTime; // For timeout calculations. - int fromConnIdx; // The connection we received the request from. + struct rasConnection* fromConn; // The connection we received the request from. - int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive. + struct rasConnection** fwdConns; // Connections we forwarded the request to; replaced by nullptr's as the + // responses arrive. int nFwdSent; // Count of the above (local process only). int nFwdRecv; // Count of the responses received or timeouts (local process only). int nLegTimeouts; // Collective (from this process and the responses we received). - union ncclSocketAddress* peers; // Collective (from this process and the responses we received). + union ncclSocketAddress* peers; // Collective (from this process and the responses we received). Unsorted. int nPeers; char* data; // Collective (from this process and the responses we received). @@ -261,13 +279,14 @@ struct rasCollConns { struct rasCollComms { int nComms; struct comm { - uint64_t commHash; - int commNRanks; - int nRanks; // number of elements in the array below, *not* in the communicator. + struct rasCommId commId; + int commNRanks; // >= nRanks + nMissingRanks + int nRanks; // Number of elements in the ranks array below, *not* in the communicator. + int nMissingRanks; // Number of elements in the missingRanks array below. struct rank { int commRank; int peerIdx; // Index within rasCollective->peers, *not* rasPeers. - uint64_t collOpCount; + uint64_t collOpCounts[NCCL_NUM_FUNCTIONS]; struct { ncclResult_t initState:4; ncclResult_t asyncError:4; @@ -278,34 +297,47 @@ struct rasCollComms { char cudaDev; char nvmlDev; } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process. - } comms[0]; // Variable length. Sorted by commHash. + // The ranks array is followed by: + // struct rasCollCommsMissingRank missingRanks[0]; // Variable length. Sorted by commRank. + } comms[0]; // Variable length. Sorted by commId. +}; + +// Provides info about missing ranks. An array of these structures can be part of struct rasCollComms above. +// Because the arrays are of variable length, we can't describe them in C. To ensure that adding +// rasCollCommsMissingRank structures doesn't mess up the alignment, we explicitly request one. +struct alignas(struct rasCollComms) rasCollCommsMissingRank { + int commRank; + union ncclSocketAddress addr; + // We don't need pid here as we can look it up in rasPeers via addr. + char cudaDev; + char nvmlDev; }; // Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one // or one of the fallbacks). struct rasLinkConn { + struct rasLinkConn* next; int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates). - int connIdx; // Index in the rasConns array of the connection to the above peer. 
Could be -1 (a placeholder - // for a connection to be started by the remote peer). + struct rasConnection* conn; // The connection to the above peer. Could be nullptr (a placeholder for a connection + // to be started by the remote peer). bool external; // true if the entry exists only due to an external request (requested by a remote peer, most // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a // valid primary connection, in order to ensure that keep-alive messages are sent. }; // Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in -// case of 1-D topology) rather than a particular destination. The are implemented using rasConnections, but +// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but // they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS // network is reconfigured or a peer dies. struct rasLink { int direction; // 1 for nextLink, -1 for prevLink. - // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having - // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have - // the lowest indices). + // First element is the primary connection; any additional ones are fallbacks (that get created if we are having + // problems with the primary connection). The highest-preference elements come first; the list is de-facto sorted + // by peerIdx, though peerIdx values can wrap around (given the ring/torus topology) and they can also be -1 + // (the latter are stored at the end). struct rasLinkConn* conns; - int nConns; - int connsSize; // Array size; could be larger than nConns. // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect // the peer on the other side to do so) but that peer failed to initiate. @@ -315,15 +347,15 @@ struct rasLink { // Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile // socket (described by the rasSocket structure), which can be affected by transient network issues. struct rasConnection { - bool inUse; + struct rasConnection* next; + struct rasConnection* prev; union ncclSocketAddress addr; - // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // Pointer to the current rasSocket. Note that multiple rasSocket entries may point back // to a single entry here, for sockets that are in the process of being terminated and re-established. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. - // -1 if there is no such socket. - int sockIdx; + // nullptr if there is no such socket. + struct rasSocket* sock; // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. @@ -371,16 +403,18 @@ typedef enum { // Describes a socket implementing communication between two peers. struct rasSocket { + struct rasSocket* next; + struct rasSocket* prev; + struct ncclSocket sock; rasSocketStatus status; int pfd; // Index in the rasPfds array. - // Index of the corresponding entry in the rasConns array. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. 
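Since `rasLink` now keeps its conns as a linked list ordered by preference (primary first, fallbacks after, de-facto sorted by `peerIdx`), the usual pointer-to-pointer insertion idiom applies. A reduced sketch that ignores the wraparound and `peerIdx == -1` cases the real code handles:

```
#include <stdio.h>
#include <stdlib.h>

// Simplified stand-in for rasLinkConn: a singly linked list kept sorted.
struct linkConn {
  struct linkConn* next;
  int peerIdx;
};

// Insert keeping ascending peerIdx order. Walking a pointer-to-pointer
// handles the empty-list and insert-at-head cases without special-casing.
static void insertOrdered(struct linkConn** head, struct linkConn* conn) {
  struct linkConn** pp = head;
  while (*pp && (*pp)->peerIdx < conn->peerIdx) pp = &(*pp)->next;
  conn->next = *pp;
  *pp = conn;
}

int main(void) {
  struct linkConn* head = NULL;
  int peers[] = {7, 2, 5};
  for (int i = 0; i < 3; i++) {
    struct linkConn* c = calloc(1, sizeof(*c));
    c->peerIdx = peers[i];
    insertOrdered(&head, c);
  }
  for (struct linkConn* c = head; c; c = c->next) printf("%d ", c->peerIdx);  // 2 5 7
  printf("\n");
  while (head) { struct linkConn* n = head->next; free(head); head = n; }
  return 0;
}
```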
- // -1 if there is no connection (normal condition on the accept side before the connInit message). - int connIdx; + // Pointer to the corresponding entry in the rasConns array. + // nullptr if there is no connection (a normal condition on the accept side before the connInit message). + struct rasConnection* conn; int64_t createTime; int64_t lastSendTime; @@ -404,7 +438,10 @@ typedef enum { // Describes a RAS client. struct rasClient { - int sock; + struct rasClient* next; + struct rasClient* prev; + + int sock; // File descriptor rasClientStatus status; @@ -420,7 +457,7 @@ struct rasClient { int64_t timeout; // State stored during asynchronous operations such as collectives. - int collIdx; // Index to the onging rasCollective. + struct rasCollective* coll; }; @@ -440,31 +477,33 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent); ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed); ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock); -void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone); ncclResult_t rasGetNewPollEntry(int* index); // rasnet.cc extern struct rasLink rasNextLink, rasPrevLink; -extern struct rasConnection* rasConns; -extern int nRasConns; -extern struct rasSocket *rasSockets; -extern int nRasSockets; +extern struct rasConnection* rasConnsHead; +extern struct rasConnection* rasConnsTail; +extern struct rasSocket *rasSocketsHead; +extern struct rasSocket *rasSocketsTail; ncclResult_t getNewConnEntry(struct rasConnection** pConn); -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx); -int rasConnFind(const union ncclSocketAddress* addr); +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn); +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr); void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasConnDisconnect(const union ncclSocketAddress* addr); ncclResult_t rasNetAcceptNewSocket(); void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0, bool retry = true); -void rasSockEventLoop(int sockIdx, int pollIdx); +void rasSockEventLoop(struct rasSocket* sock, int pollIdx); void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup); ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock); -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false, - bool insert = false, bool pretend = false, int* pLinkIdx = nullptr); +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn); +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx); +void rasNetTerminate(); + // peers.cc extern struct rasPeerInfo* rasPeers; @@ -483,29 +522,35 @@ ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr); bool rasPeerIsDead(const union ncclSocketAddress* addr); int ncclSocketsCompare(const void* p1, const void* p2); bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2); +void rasPeersTerminate(); // collectives.cc -extern struct rasCollective* rasCollectives; +extern struct rasCollective* rasCollectivesHead; +extern struct 
rasCollective* rasCollectivesTail; void rasCollReqInit(struct rasCollRequest* req); -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr, - int* pCollIdx = nullptr, int fromConnIdx = -1); +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone = nullptr, + struct rasCollective** pColl = nullptr, struct rasConnection* fromConn = nullptr); ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock); ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock); -void rasCollsPurgeConn(int connIdx); +void rasCollsPurgeConn(struct rasConnection* conn); void rasCollFree(struct rasCollective* coll); void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup); +void rasCollectivesTerminate(); + // client_support.cc extern int rasClientListeningSocket; -extern struct rasClient* rasClients; -extern int nRasClients; +extern struct rasClient* rasClientsHead; +extern struct rasClient* rasClientsTail; + ncclResult_t rasClientInitSocket(); ncclResult_t rasClientAcceptNewSocket(); ncclResult_t rasClientResume(struct rasCollective* coll); -void rasClientEventLoop(int clientIdx, int pollIdx); +void rasClientEventLoop(struct rasClient* client, int pollIdx); const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size); +void rasClientSupportTerminate(); #endif // !NCCL_RAS_CLIENT diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 441ad19..43aa042 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -13,90 +13,106 @@ struct rasLink rasNextLink = {1}, rasPrevLink = {-1}; // Connections on the RAS network. -struct rasConnection* rasConns; -int nRasConns; +struct rasConnection* rasConnsHead; +struct rasConnection* rasConnsTail; // Sockets implementing the RAS network. -struct rasSocket *rasSockets; -int nRasSockets; +struct rasSocket *rasSocketsHead; +struct rasSocket *rasSocketsTail; // Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but // I didn't want to use -1 because it has a special meaning for us. 
#define POLL_FD_IGNORE -2 +static void freeConnEntry(struct rasConnection* conn); static void rasConnOpen(struct rasConnection* conn); static ncclResult_t rasConnPrepare(struct rasConnection* conn); static void rasConnTerminate(struct rasConnection* conn); static ncclResult_t getNewSockEntry(struct rasSocket** pSock); +static void freeSockEntry(struct rasSocket* sock); static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup); -static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup); +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup); static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false); -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx); static void rasConnResume(struct rasConnection* conn); static void rasLinkSanitizeFallbacks(struct rasLink* link); -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1); -static int rasLinkFindConn(const struct rasLink* link, int connIdx); +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend = false, + int* pLinkIdx = nullptr, struct rasLinkConn** pLinkConn = nullptr, + bool insert = true); +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx); +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external = false); +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx = nullptr); /////////////////////////////////////////////// // Functions related to the RAS connections. // /////////////////////////////////////////////// -// Allocates an entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasConnections list. ncclResult_t getNewConnEntry(struct rasConnection** pConn) { struct rasConnection* conn; - int i; - for (i = 0; i < nRasConns; i++) - if (!rasConns[i].inUse) - break; - if (i == nRasConns) { - NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT)); - nRasConns += RAS_INCREMENT; - } - conn = rasConns+i; - memset(conn, '\0', sizeof(*conn)); - conn->inUse = true; - conn->sockIdx = -1; + NCCLCHECK(ncclCalloc(&conn, 1)); + ncclIntruQueueConstruct(&conn->sendQ); conn->travelTimeMin = INT64_MAX; conn->travelTimeMax = INT64_MIN; + if (rasConnsHead) { + rasConnsTail->next = conn; + conn->prev = rasConnsTail; + rasConnsTail = conn; + } else { + rasConnsHead = rasConnsTail = conn; + } + *pConn = conn; return ncclSuccess; } +// Frees an entry from the rasConns list. +static void freeConnEntry(struct rasConnection* conn) { + if (conn == nullptr) + return; + + if (conn == rasConnsHead) + rasConnsHead = rasConnsHead->next; + if (conn == rasConnsTail) + rasConnsTail = rasConnsTail->prev; + if (conn->prev) + conn->prev->next = conn->next; + if (conn->next) + conn->next->prev = conn->prev; + free(conn); +} + // Creates a new RAS network connection to a remote peer address. -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn) { ncclResult_t ret = ncclSuccess; - struct rasConnection* conn = nullptr; + struct rasConnection* conn; // First check if a connection entry for this peer already exists. 
- int connIdx = rasConnFind(addr); - if (connIdx != -1) { - conn = rasConns+connIdx; - } + conn = rasConnFind(addr); - if (conn && conn->sockIdx != -1) { + if (conn && conn->sock) { // An entry exists and has a socket associated with it -- nothing left for us to do. - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; goto exit; } - if (!conn) { + if (conn == nullptr) { NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); memcpy(&conn->addr, addr, sizeof(conn->addr)); // We are establishing a new connection -- start the timeout. conn->startRetryTime = clockNano(); - connIdx = conn - rasConns; } - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; rasConnOpen(conn); @@ -107,7 +123,7 @@ exit: // Opens a connection to a remote peer. static void rasConnOpen(struct rasConnection* conn) { ncclResult_t ret; // Not used. - struct rasSocket* sock; + struct rasSocket* sock = nullptr; bool closeSocketOnFail = false; int ready; @@ -120,10 +136,8 @@ static void rasConnOpen(struct rasConnection* conn) { NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures - // we don't need to clean them up. - conn->sockIdx = sock-rasSockets; - sock->connIdx = conn-rasConns; + conn->sock = sock; + sock->conn = conn; rasPfds[sock->pfd].fd = sock->sock.fd; // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because @@ -141,6 +155,7 @@ exit: fail: if (closeSocketOnFail) (void)ncclSocketClose(&sock->sock); + freeSockEntry(sock); goto exit; } @@ -166,16 +181,13 @@ static ncclResult_t rasConnPrepare(struct rasConnection* conn) { } // Searches through rasConns for a connection with a provided address. -int rasConnFind(const union ncclSocketAddress* addr) { - // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way) - // so binary search won't do... - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) - return i; +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) + return conn; } - return -1; + return nullptr; } // Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled @@ -184,58 +196,56 @@ int rasConnFind(const union ncclSocketAddress* addr) { // This is also where we declare peers as dead, etc. // Invoked from the main RAS event loop. void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int connIdx = 0; connIdx < nRasConns; connIdx++) { - struct rasConnection* conn = rasConns+connIdx; - - if (!conn->inUse) - continue; - - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + if (conn->sock) { bool sockTerminated = false; // Retry the socket connections that have been refused. 
- if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) { - if (now - sock->lastSendTime > RAS_CONNECT_RETRY) { + if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) { + if (now - conn->sock->lastSendTime > RAS_CONNECT_RETRY) { int ready; - if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + if (ncclSocketReady(&conn->sock->sock, &ready) != ncclSuccess) { INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s", - ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true); + ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true); // We will retry below in the same loop. sockTerminated = true; } else { // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations. - sock->lastSendTime = clockNano(); - if (!ready && sock->sock.state == ncclSocketStateConnecting) - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + conn->sock->lastSendTime = clockNano(); + if (!ready && conn->sock->sock.state == ncclSocketStateConnecting) + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); else - rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop. + rasPfds[conn->sock->pfd].fd = conn->sock->sock.fd; // Enable the handling via the main loop. } // if (ncclSocketReady) } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); } - } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) + } // if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) // For connections that have data to send but that we've been unable to send a message on for a while, // consider their sockets lost and terminate them. - if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) { - if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { + if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) { + if (now - std::max(conn->sock->lastSendTime, + ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s", - (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / - CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT); + (now - std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / + CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/false, RAS_STUCK_TIMEOUT); // We will retry below in the same loop. 
} else { - *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, - ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT); + *nextWakeup = std::min(*nextWakeup, + std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+ + RAS_STUCK_TIMEOUT); } - } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (!ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) // For connections that are being (re-)established, irrespective of whether there's a valid socket associated - // with them (conn->startIdx != -1), we need to check if any connection-level timeout has expired. + // with them, we need to check if any connection-level timeout has expired. if (conn->startRetryTime) { + bool connTerminated = false; // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead // so that we don't try again. if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { @@ -248,82 +258,83 @@ void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { rasCollReqInit(&bCast); bCast.type = RAS_BC_DEADPEER; memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); - (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + (void)rasNetSendCollReq(&bCast); - continue; + connTerminated = true; } else { *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); } // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via - // the conn->sockIdx == -1 test). + // the conn->sock == nullptr test). - // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try - // to establish fallback connections. - if (now - conn->startRetryTime > RAS_CONNECT_WARN) { - if (!conn->experiencingDelays) { - INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", - (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + if (!connTerminated) { + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); - // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback - // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish - // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. - conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns may have been reallocated by the above calls. - conn = rasConns+connIdx; + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); - // Stop collectives from waiting for a response over it. 
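The connection-level timeouts in this hunk form an escalation ladder: reopen the socket every `RAS_CONNECT_RETRY`, warn and start fallbacks after `RAS_CONNECT_WARN`, and declare the peer dead after `RAS_PEER_DEAD_TIMEOUT`; in parallel, `nextWakeup` is lowered to the earliest pending deadline so the event loop never sleeps past it. A sketch of that deadline bookkeeping (constants are illustrative, not NCCL's actual values):

```
#include <stdint.h>
#include <stdio.h>

#define CONNECT_RETRY   1000  // Illustrative deadlines, not NCCL's values.
#define CONNECT_WARN    5000
#define PEER_DEAD      60000

static int64_t minI64(int64_t a, int64_t b) { return a < b ? a : b; }

int main(void) {
  int64_t now = 10000, startRetryTime = 6000, lastRetryTime = 9500;
  int64_t nextWakeup = INT64_MAX;

  if (now - startRetryTime > PEER_DEAD) {
    printf("declare peer dead\n");
  } else {
    nextWakeup = minI64(nextWakeup, startRetryTime + PEER_DEAD);
    if (now - startRetryTime > CONNECT_WARN) printf("warn, add fallbacks\n");
    else nextWakeup = minI64(nextWakeup, startRetryTime + CONNECT_WARN);
    if (now - lastRetryTime > CONNECT_RETRY) printf("reopen socket\n");
    else nextWakeup = minI64(nextWakeup, lastRetryTime + CONNECT_RETRY);
  }
  printf("sleep until %lld\n", (long long)nextWakeup);  // 10500: earliest deadline.
  return 0;
}
```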
-          rasCollsPurgeConn(connIdx);
-        } // if (!conn->experiencingDelays)
-      } else {
-        *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN);
-      }
-
-      // If a socket was terminated (or never opened, due to some error), try to open it now.
-      // We retry once a second.
-      if (conn->sockIdx == -1) {
-        if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) {
-          INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)",
-               ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays,
-               (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0));
-          rasConnOpen(conn);
+            // Stop collectives from waiting for a response over it.
+            rasCollsPurgeConn(conn);
+          } // if (!conn->experiencingDelays)
+        } else {
+          *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN);
         }
-        if (conn->sockIdx == -1)
-          *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY);
-      }
+
+        // If a socket was terminated (or never opened, due to some error), try to open it now.
+        // We retry once a second.
+        if (conn->sock == nullptr) {
+          if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) {
+            INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)",
+                 ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays,
                 (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0));
+            rasConnOpen(conn);
+          }
+          if (conn->sock == nullptr)
+            *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY);
+        }
+      } // if (!connTerminated)
     } // if (conn->startRetryTime)
-  } // for (connIdx)
+
+    conn = connNext;
+  } // for (conn)
 }
 
 // Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the
 // RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead.
 void rasConnDisconnect(const union ncclSocketAddress* addr) {
-  int connIdx = rasConnFind(addr);
-  if (connIdx != -1) {
-    (void)rasLinkAddFallback(&rasNextLink, connIdx);
-    (void)rasLinkAddFallback(&rasPrevLink, connIdx);
-    rasLinkDropConn(&rasNextLink, connIdx);
-    rasLinkDropConn(&rasPrevLink, connIdx);
+  struct rasConnection* conn = rasConnFind(addr);
+  if (conn) {
+    (void)rasLinkAddFallback(&rasNextLink, conn);
+    (void)rasLinkAddFallback(&rasPrevLink, conn);
+    rasLinkConnDrop(&rasNextLink, conn);
+    rasLinkConnDrop(&rasPrevLink, conn);
 
-    rasConnTerminate(rasConns+connIdx);
+    rasConnTerminate(conn);
  }
 }
 
 // Terminates a connection and frees the rasConns entry.
 static void rasConnTerminate(struct rasConnection* conn) {
-  int connIdx = conn - rasConns;
   // Make sure there are no lingering rasSockets pointing to it.
-  for (int i = 0; i < nRasSockets; i++) {
-    struct rasSocket* sock = rasSockets+i;
-    if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx)
+  for (struct rasSocket* sock = rasSocketsHead; sock;) {
+    struct rasSocket* sockNext = sock->next;
+    if (sock->conn == conn)
       rasSocketTerminate(sock, /*finalize*/true);
+    sock = sockNext;
   }
 
   // Also check any ongoing collectives.
-  rasCollsPurgeConn(connIdx);
+  rasCollsPurgeConn(conn);
 
   while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) {
     free(meta);
@@ -331,8 +342,7 @@ static void rasConnTerminate(struct rasConnection* conn) {
   INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine));
 
-  conn->inUse = false;
-  conn->sockIdx = -1; // Should be that way already, but just to be extra sure...
+  freeConnEntry(conn);
 }
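A note on the loops above and in rasConnTerminate: the refactor replaces index-based arrays with linked lists, so any traversal that may free the element it currently stands on snapshots the next pointer first (sockNext, connNext). A minimal standalone sketch of that discipline; Node and its dead flag are illustrative stand-ins, not NCCL types:

```
#include <cstdlib>

struct Node { Node* next; bool dead; };

// Sweep a singly-linked list, freeing flagged nodes. 'next' is read before
// any action, because free(n) would invalidate n->next.
static void sweep(Node** head) {
  Node* prev = nullptr;
  for (Node* n = *head; n;) {
    Node* next = n->next;                  // Must be saved first.
    if (n->dead) {
      (prev ? prev->next : *head) = next;  // Unlink, then free.
      free(n);
    } else {
      prev = n;
    }
    n = next;
  }
}

int main() {
  Node* a = (Node*)calloc(1, sizeof(Node));
  Node* b = (Node*)calloc(1, sizeof(Node));
  a->next = b; b->dead = true;
  Node* head = a;
  sweep(&head);  // b is unlinked and freed; a remains.
  free(a);
  return 0;
}
```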
@@ -344,7 +354,7 @@ static void rasConnTerminate(struct rasConnection* conn) {
 // corresponding rasConnection can't be established without knowing the peer's address.
 ncclResult_t rasNetAcceptNewSocket() {
   ncclResult_t ret = ncclSuccess;
-  struct rasSocket* sock;
+  struct rasSocket* sock = nullptr;
   int ready;
   bool socketInitialized = false;
   NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail);
@@ -370,91 +380,98 @@ exit:
 fail:
   if (socketInitialized)
     NCCLCHECK(ncclSocketClose(&sock->sock));
+  freeSockEntry(sock);
   goto exit;
 }
 
-// Returns the index of the first available entry in the rasConns array, enlarging the array if necessary.
+// Allocates a new entry in the rasSockets list.
 static ncclResult_t getNewSockEntry(struct rasSocket** pSock) {
   struct rasSocket* sock;
-  int i;
-  for (i = 0; i < nRasSockets; i++)
-    if (rasSockets[i].status == RAS_SOCK_CLOSED)
-      break;
-  if (i == nRasSockets) {
-    NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT));
-    nRasSockets += RAS_INCREMENT;
-  }
-  sock = rasSockets+i;
-  memset(sock, '\0', sizeof(*sock));
+  NCCLCHECK(ncclCalloc(&sock, 1));
+
   sock->pfd = -1;
-  sock->connIdx = -1;
   sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano();
+
+  if (rasSocketsHead) {
+    rasSocketsTail->next = sock;
+    sock->prev = rasSocketsTail;
+    rasSocketsTail = sock;
+  } else {
+    rasSocketsHead = rasSocketsTail = sock;
+  }
+
   *pSock = sock;
   return ncclSuccess;
 }
 
+// Frees an entry from the rasSockets list.
+static void freeSockEntry(struct rasSocket* sock) {
+  if (sock == nullptr)
+    return;
+
+  if (sock == rasSocketsHead)
+    rasSocketsHead = rasSocketsHead->next;
+  if (sock == rasSocketsTail)
+    rasSocketsTail = rasSocketsTail->prev;
+  if (sock->prev)
+    sock->prev->next = sock->next;
+  if (sock->next)
+    sock->next->prev = sock->prev;
+  free(sock);
+}
+
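getNewSockEntry/freeSockEntry above replace the old realloc'ed rasSockets array with a doubly-linked list anchored by head and tail pointers. A reduced sketch of the same bookkeeping, with hypothetical names (Entry, listHead, listTail) standing in for the real structures:

```
#include <cstdlib>

struct Entry { Entry* prev; Entry* next; };
static Entry* listHead;
static Entry* listTail;

static Entry* allocEntry() {
  Entry* e = (Entry*)calloc(1, sizeof(Entry));
  if (e == nullptr) return nullptr;
  if (listHead) {            // Non-empty list: append after the current tail.
    listTail->next = e;
    e->prev = listTail;
    listTail = e;
  } else {                   // Empty list: e becomes both head and tail.
    listHead = listTail = e;
  }
  return e;
}

static void freeEntry(Entry* e) {
  if (e == nullptr) return;
  if (e == listHead) listHead = e->next;  // Fix up both anchors first,
  if (e == listTail) listTail = e->prev;
  if (e->prev) e->prev->next = e->next;   // then splice out of the chain.
  if (e->next) e->next->prev = e->prev;
  free(e);
}

int main() {
  Entry* a = allocEntry();
  Entry* b = allocEntry();
  freeEntry(a);  // Head advances to b.
  freeEntry(b);  // List is empty again.
  return 0;
}
```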
 // Invoked from the main RAS event loop to handle RAS socket timeouts.
 void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
-  for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
-    struct rasSocket* sock = rasSockets+sockIdx;
+  for (struct rasSocket* sock = rasSocketsHead; sock;) {
+    struct rasSocket* sockNext = sock->next;
 
-    if (sock->status == RAS_SOCK_CLOSED)
-      continue;
-
-    // For socket connections that are still being established, give up on the ones that take too long to initialize.
     if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) {
+      // For socket connections that are still being established, give up on the ones that take too long to initialize.
       if (now - sock->createTime > RAS_STUCK_TIMEOUT) {
-        if (sock->connIdx == -1) {
+        if (sock->conn == nullptr) {
           INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s",
                (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
         } else {
-          struct rasConnection* conn = rasConns+sock->connIdx;
           INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s "
                "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)",
                (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine),
-               conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0),
-               sock->status);
+               sock->conn->experiencingDelays,
+               (sock->conn->startRetryTime ? (now-sock->conn->startRetryTime)/1e9 : 0.0), sock->status);
         }
         rasSocketTerminate(sock, /*finalize*/true);
         // We may retry later.
-        continue;
       } else {
         *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT);
       }
-    } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE)
-
-    // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long.
-    if (sock->status == RAS_SOCK_TERMINATING) {
+    } else if (sock->status == RAS_SOCK_TERMINATING) {
+      // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long.
       if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) {
         INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s",
              (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&sock->sock.addr, rasLine));
         rasSocketTerminate(sock, /*finalize*/true);
         // This socket is presumably already being re-established, if needed.
-        continue;
       } else {
         *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT);
       }
-    } // if (sock->status == RAS_SOCK_TERMINATING)
-
-    // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything
-    // important due to shorter timeouts on RAS network connections, but in case of weird situations like process
-    // suspend, rasSocketTerminate will do additional checking.
-    if (sock->status == RAS_SOCK_READY) {
+    } else if (sock->status == RAS_SOCK_READY) {
+      // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything
+      // important due to shorter timeouts on RAS network connections, but in case of weird situations like process
+      // suspend, rasSocketTerminate will do additional checking.
       if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) {
         INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s",
              (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&sock->sock.addr, rasLine));
         rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false);
-        continue;
         // The RAS network timeout handler will terminate the conn it was associated with, if any.
       } else {
         *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT);
       }
     } // if (sock->status == RAS_SOCK_READY)
-  } // for (sockIdx)
+
+    sock = sockNext;
+  } // for (sock)
 }
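Every branch of rasSocksHandleTimeouts follows the same scheduling pattern: an expired deadline triggers an action, and an unexpired one lowers *nextWakeup so the main loop's poll() wakes up no later than the earliest pending deadline. A compact sketch of that pattern, with illustrative names rather than the RAS API:

```
#include <algorithm>
#include <cstdint>
#include <cstdio>

// If 'since + timeout' has passed, the caller acts on it; otherwise
// *nextWakeup is pulled forward so the event loop sleeps just long enough.
static bool expired(int64_t now, int64_t since, int64_t timeout, int64_t* nextWakeup) {
  if (now - since > timeout) return true;
  *nextWakeup = std::min(*nextWakeup, since + timeout);
  return false;
}

int main() {
  int64_t now = 1000, nextWakeup = INT64_MAX;
  if (!expired(now, /*since*/900, /*timeout*/500, &nextWakeup))
    printf("sleep until %ld\n", (long)nextWakeup);  // Prints 1400.
  return 0;
}
```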
 // Handles the termination of a RAS socket.
@@ -464,19 +481,19 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
 // For not fully established sockets, we can terminate immediately as there's no useful data to extract.
 void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) {
   assert(sock->status != RAS_SOCK_CLOSED);
-  if (sock->connIdx != -1) {
-    struct rasConnection* conn = rasConns+sock->connIdx;
-    // If the sockIdx of the connection points back to us, it means that we are the current socket of this
+  if (sock->conn) {
+    struct rasConnection* conn = sock->conn;
+    // If the sock of the connection points back to us, it means that we are the current socket of this
     // connection, so we have additional work to do before we can terminate it.
-    if (conn->sockIdx == sock-rasSockets) {
+    if (conn->sock == sock) {
       // Reset it to indicate there's no valid socket associated with that connection anymore.
-      conn->sockIdx = -1;
+      conn->sock = nullptr;
 
       // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably
       // deliberately closed them. Make an exception for sockets that are part of the RAS network links.
       if ((retry && clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) <
            RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) ||
-          rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) {
+          rasLinkConnFind(&rasNextLink, sock->conn) || rasLinkConnFind(&rasPrevLink, sock->conn)) {
         // For connections that were fine until now, the connection-level timeout starts at termination, and possibly
         // even earlier, depending on what event triggered the termination -- if it was another timeout expiring, then
         // we need to include that timeout as well.
@@ -507,11 +524,11 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
       } // if (retry)
 
       // Stop collectives from waiting for a response over this connection.
-      rasCollsPurgeConn(sock->connIdx);
-    } // if (conn->sockIdx == sock-rasSockets)
-  } // if (sock->connIdx != -1)
+      rasCollsPurgeConn(sock->conn);
+    } // if (conn->sock == sock)
+  } // if (sock->conn)
 
-  if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
+  if (sock->status != RAS_SOCK_CONNECTING && sock->conn && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
     if (sock->status != RAS_SOCK_TERMINATING) {
       // The receiving side is still open -- close just the sending side.
       (void)ncclSocketShutdown(&sock->sock, SHUT_WR);
@@ -525,20 +542,15 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
   } else {
     // Either the caller requested finalization or we cannot receive on it.
     (void)ncclSocketClose(&sock->sock);
-    sock->status = RAS_SOCK_CLOSED;
     rasPfds[sock->pfd].fd = -1;
     rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0;
-    sock->pfd = sock->connIdx = -1;
-    sock->recvOffset = sock->recvLength = 0;
     free(sock->recvMsg);
-    sock->recvMsg = nullptr;
+    freeSockEntry(sock);
   }
 }
 
 // Handles a ready socket FD from the main event loop.
-void rasSockEventLoop(int sockIdx, int pollIdx) {
-  struct rasSocket* sock = rasSockets+sockIdx;
-
+void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
   if (sock->status == RAS_SOCK_CONNECTING) {
     int ready;
     // Socket is not yet fully established. Continue the OS or NCCL-level handshake.
@@ -554,15 +566,15 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
       (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano();
       sock->status = RAS_SOCK_HANDSHAKE;
       if (connectSide) {
-        assert(sock->connIdx != -1);
-        if (rasConns[sock->connIdx].sockIdx == sockIdx) {
-          if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) {
+        assert(sock->conn);
+        if (sock->conn->sock == sock) {
+          if (rasConnPrepare(sock->conn) != ncclSuccess) {
             INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s",
                  ncclSocketToString(&sock->sock.addr, rasLine));
             rasSocketTerminate(sock);
             // We may retry further down.
           }
-        } else {
+        } else { // sock->conn->sock != sock
           // The connection this socket is associated with no longer considers it to be the current one.
           // This could possibly happen due to a race condition. Simply terminate it.
          INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!",
@@ -581,10 +593,9 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
     if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) {
       int closed = 0;
       bool allSent = false;
-      assert(sock->connIdx != -1);
-      struct rasConnection* conn = rasConns+sock->connIdx;
-      assert(conn->sockIdx == sockIdx);
-      if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) {
+      assert(sock->conn);
+      assert(sock->conn->sock == sock);
+      if (rasConnSendMsg(sock->conn, &closed, &allSent) != ncclSuccess) {
         INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s",
              ncclSocketToString(&sock->sock.addr, rasLine));
         rasSocketTerminate(sock);
@@ -612,9 +623,9 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
           // We may retry further down.
         } else if (closed) {
           const char* socketType;
-          if (sock->connIdx == -1)
+          if (sock->conn == nullptr)
             socketType = "incoming";
-          else if (rasConns[sock->connIdx].sockIdx != sockIdx)
+          else if (sock->conn->sock != sock)
             socketType = "old";
           else if (sock->status == RAS_SOCK_HANDSHAKE)
             socketType = "new";
@@ -624,25 +635,21 @@ void rasSockEventLoop(struct rasSocket* sock, int pollIdx) {
                socketType, ncclSocketToString(&sock->sock.addr, rasLine));
           rasSocketTerminate(sock, /*finalize*/true);
           // We may retry further down.
-        } else {
+        } else { // !closed
           sock->lastRecvTime = clockNano();
           if (msg) {
             (void)rasMsgHandle(msg, sock);
             free(msg);
-            // Message handlers can terminate a socket in certain cases; we need to check for
-            // that here so that we don't try to receive from a closed socket.
-            // No handlers are currently believed to create new sockets but better to be safe than sorry
-            // and re-init the sock variable.
-            sock = rasSockets+sockIdx;
-            if (sock->status == RAS_SOCK_CLOSED)
+            // Message handlers can terminate a socket in various cases. We re-check rasPfds.events to ensure that
+            // this hasn't happened here (rasSocketTerminate will reset it when finalizing a socket).
+            if (!(rasPfds[pollIdx].revents & POLLIN))
               break;
           }
-          if (sock->connIdx != -1) {
-            struct rasConnection* conn = rasConns+sock->connIdx;
-            if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays))
-              rasConnResume(conn);
+          if (sock->conn) {
+            if (sock->conn->sock == sock && (sock->conn->startRetryTime || sock->conn->experiencingDelays))
+              rasConnResume(sock->conn);
           }
-        }
+        } // !closed
       } while (msg);
     } // if (POLLIN)
   } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING
@@ -658,109 +665,95 @@ void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) {
   // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each
   // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish
   // connections that are part of a link from those that are not.
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
   (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup);
   (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup);
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
-    struct rasConnection* conn = rasConns+connIdx;
-    if (conn->inUse && !conn->linkFlag) {
+  for (struct rasConnection* conn = rasConnsHead; conn;) {
+    struct rasConnection* connNext = conn->next;
+    if (!conn->linkFlag) {
       // The connection is not part of any link. Check if it should be terminated.
-      if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) {
+      if (conn->sock == nullptr && ncclIntruQueueEmpty(&conn->sendQ))
         rasConnTerminate(conn);
-        continue;
-      }
     }
+    conn = connNext;
  }
 }
 
 // Checks for and handles timeouts at the link level; primarily the keep-alives for link connections.
 static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1) {
-      if (!rasConns[linkConn->connIdx].linkFlag) {
-        rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup);
-        // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here.
-        // For the same reason we re-init linkConn.
-        linkConn = link->conns+i;
-        rasConns[linkConn->connIdx].linkFlag = true;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn) {
+      if (!linkConn->conn->linkFlag) {
+        rasConnHandleNetTimeouts(linkConn->conn, now, nextWakeup);
+        linkConn->conn->linkFlag = true;
       }
-    } else if (i == 0 && link->lastUpdatePeersTime != 0) {
+    } else if (linkConn == link->conns && link->lastUpdatePeersTime != 0) {
       // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address
       // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action.
       if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) {
         INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s",
              (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
-        if (linkConn->connIdx != -1) {
-          rasConns[linkConn->connIdx].linkFlag = true;
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn));
+        if (linkConn->conn) {
+          linkConn->conn->linkFlag = true;
         }
-        // We used to connect to the first fallback but I think trying to connect to the calculated primary first
-        // in this case is more intuitive.
-        //(void)rasLinkTryFallback(link, -1);
         link->lastUpdatePeersTime = 0;
       } else {
         *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN);
       }
-    } // if (i == 0 && link->lastUpdatePeersTime != 0)
-  } // for (i)
+    } // if (linkConn == link->conns && link->lastUpdatePeersTime != 0)
+  } // for (linkConn)
   return ncclSuccess;
 }
 
 // Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links.
-static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) {
-  struct rasConnection* conn = rasConns+connIdx;
-  if (conn->sockIdx != -1) {
-    struct rasSocket* sock = rasSockets+conn->sockIdx;
-
-    if (sock->status == RAS_SOCK_READY) {
+static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup) {
+  if (conn->sock) {
+    if (conn->sock->status == RAS_SOCK_READY) {
       // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued.
       if (ncclIntruQueueEmpty(&conn->sendQ)) {
-        if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) {
+        if (now - conn->sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) {
           rasConnSendKeepAlive(conn);
         } else {
-          *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL);
+          *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_KEEPALIVE_INTERVAL);
        }
      }
 
       // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections.
-      if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) {
+      if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) {
         if (!conn->experiencingDelays) {
           INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s",
-               (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+               (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine));
 
           // At this point, it's mostly just a precaution; we will continue with the primary connection until
           // RAS_PEER_DEAD_TIMEOUT expires.
           conn->experiencingDelays = true;
-          (void)rasLinkAddFallback(&rasNextLink, connIdx);
-          (void)rasLinkAddFallback(&rasPrevLink, connIdx);
-          // rasConns and rasSockets may have been reallocated by the above calls.
-          conn = rasConns+connIdx;
-          sock = rasSockets+conn->sockIdx;
+          (void)rasLinkAddFallback(&rasNextLink, conn);
+          (void)rasLinkAddFallback(&rasPrevLink, conn);
 
-          // Stop collectives from waiting for a response over it.
-          rasCollsPurgeConn(connIdx);
+          // Stop ongoing collectives from waiting for a response over this connection.
+          rasCollsPurgeConn(conn);
         }
       } else {
-        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN);
+        *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN);
      }
 
       // For long timeouts we need to act.
-      if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) {
+      if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) {
         INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s",
-             (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
-        rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR);
+             (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine));
+        rasSocketTerminate(conn->sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR);
         *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait.
       } else {
-        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
+        *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
       }
-    } // if (sock->status == RAS_SOCK_READY)
-  } // if (conn->sockIdx != -1)
+    } // if (conn->sock->status == RAS_SOCK_READY)
+  } // if (conn->sock)
 }
 
 // Sends a keep-alive message to a peer on the RAS network.
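rasConnHandleNetTimeouts above is effectively a three-rung ladder driven by send and receive silence. A condensed model of the decision logic; note that only the 5s warning threshold is stated in this patch's comments, so the interval and error values below are placeholders, not the real RAS constants:

```
#include <cstdint>

enum KeepAliveAction { KA_NONE, KA_SEND, KA_WARN_AND_FALLBACK, KA_TERMINATE };

// Purely illustrative thresholds (nanoseconds).
constexpr int64_t kInterval = 1000000000LL;   // send silence -> emit keep-alive
constexpr int64_t kWarn     = 5000000000LL;   // recv silence -> warn, open fallbacks
constexpr int64_t kError    = 20000000000LL;  // prolonged recv silence -> drop socket

static KeepAliveAction keepAliveCheck(int64_t now, int64_t lastSend, int64_t lastRecv,
                                      bool sendQEmpty, bool experiencingDelays) {
  if (now - lastRecv > kError) return KA_TERMINATE;
  // The experiencingDelays latch makes the warning fire only once per outage.
  if (now - lastRecv > kWarn && !experiencingDelays) return KA_WARN_AND_FALLBACK;
  if (sendQEmpty && now - lastSend > kInterval) return KA_SEND;
  return KA_NONE;
}
```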
@@ -768,17 +761,17 @@ static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
   struct rasMsg* msg = nullptr;
   int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
   if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
-    int linkIdx;
+    struct rasLinkConn* linkConn;
     msg->type = RAS_MSG_KEEPALIVE;
     msg->keepAlive.peersHash = rasPeersHash;
     msg->keepAlive.deadPeersHash = rasDeadPeersHash;
     msg->keepAlive.nack = (nack ? 1 : 0);
 
-    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
-    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasNextLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
-    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
-    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasPrevLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
 
     (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
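As a reading aid for the linkMask logic above: bit value 2 marks the sender's rasNextLink and bit value 1 its rasPrevLink, and because one side's next link is the other side's prev link, the receiver tests the bits mirrored. A toy encode/decode, assuming this interpretation:

```
#include <cstdio>

enum { LINK_PREV = 1, LINK_NEXT = 2 };

int main() {
  // Sender: this connection is on its next link only.
  int linkMask = 0;
  bool onMyNextLink = true, onMyPrevLink = false;
  if (onMyNextLink) linkMask |= LINK_NEXT;  // Becomes the peer's prev link.
  if (onMyPrevLink) linkMask |= LINK_PREV;  // Becomes the peer's next link.

  // Receiver: mirrored decode.
  bool partOfMyNextLink = (linkMask & LINK_PREV) != 0;  // Their prev == my next.
  bool partOfMyPrevLink = (linkMask & LINK_NEXT) != 0;  // Their next == my prev.
  printf("next %d prev %d\n", (int)partOfMyNextLink, (int)partOfMyPrevLink);
  return 0;
}
```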
@@ -793,46 +786,51 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
   int64_t travelTime;
   int peerIdx;
 
-  assert(sock->connIdx != -1);
-  struct rasConnection* conn = rasConns+sock->connIdx;
+  assert(sock->conn);
 
   SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
   travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
                (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
 
-  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
-    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  if (msg->keepAlive.peersHash != sock->conn->lastRecvPeersHash) {
+    sock->conn->lastRecvPeersHash = msg->keepAlive.peersHash;
   }
-  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
-    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  if (msg->keepAlive.deadPeersHash != sock->conn->lastRecvDeadPeersHash) {
+    sock->conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
   }
 
   // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
   // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
   // needed).
-  peerIdx = rasPeerFind(&conn->addr);
+  peerIdx = rasPeerFind(&sock->conn->addr);
   // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
   // the peers update.
-  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
-  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  if (msg->keepAlive.linkMask & 1)
+    (void)rasLinkConnAddExternal(&rasNextLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasNextLink, sock->conn, /*external*/true);
+  if (msg->keepAlive.linkMask & 2)
+    (void)rasLinkConnAddExternal(&rasPrevLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasPrevLink, sock->conn, /*external*/true);
 
   // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
   // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
-  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
-  // will have wiped any external fallbacks, so anything that remains must be needed.
+  // and because we stopped sending the keep-alives, our peer doesn't know about it. The rasLinkConnDrop calls
+  // above will have wiped any external fallbacks, so anything that remains must be needed.
   if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) {
-    if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) {
+    if (rasLinkConnFind(&rasNextLink, sock->conn) == nullptr && rasLinkConnFind(&rasPrevLink, sock->conn) == nullptr) {
       // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the
       // special nack flag in the message to distinguish it from regular keep-alives.
-      rasConnSendKeepAlive(conn, /*nack*/true);
+      rasConnSendKeepAlive(sock->conn, /*nack*/true);
     }
   }
 
-  if (conn->travelTimeMin > travelTime)
-    conn->travelTimeMin = travelTime;
-  if (conn->travelTimeMax < travelTime)
-    conn->travelTimeMax = travelTime;
-  conn->travelTimeSum += travelTime;
-  conn->travelTimeCount++;
+  if (sock->conn->travelTimeMin > travelTime)
+    sock->conn->travelTimeMin = travelTime;
+  if (sock->conn->travelTimeMax < travelTime)
+    sock->conn->travelTimeMax = travelTime;
+  sock->conn->travelTimeSum += travelTime;
+  sock->conn->travelTimeCount++;
 
   if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) {
     // This could happen due to a short-lived race condition between the peers propagation
@@ -842,7 +840,7 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
     INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)",
          ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash);
     INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash);
-    NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers));
+    NCCLCHECK(rasConnSendPeersUpdate(sock->conn, rasPeers, nRasPeers));
   }
   return ncclSuccess;
 }
@@ -857,100 +855,104 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
 // External connections are generally ignored by this whole process: in particular, we don't add fallbacks for
 // timing out external connections. However, we will use an active external connection if it would be a better
 // option than whatever we can come up with.
-static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) {
-  int peerIdx = -1;
-  int linkIdx = -1;
+ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn) {
+  struct rasLinkConn* foundLinkConn = nullptr;
+  struct rasLinkConn* firstExtLinkConn = nullptr;
   int firstExtLinkIdx = -1;
-  int newPeerIdx;
+  int newPeerIdx, i;
 
   // First check if the connection is part of this link. In the process also check if any of the link's connections
   // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out.
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) {
     if (linkConn->peerIdx == -1) {
-      // Such elements are always at the very end of the array and we can't use them so we can just as well break.
+      // Such elements are always at the end and we can't use them so we can just as well break.
       break;
    }
 
     // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing
     // delays).
-    if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) {
-      struct rasConnection* conn = rasConns+linkConn->connIdx;
-      if (!conn->experiencingDelays) {
-        if (!linkConn->external)
+    if (linkConn->conn && linkConn->conn != conn) {
+      if (!linkConn->conn->experiencingDelays) {
+        if (!linkConn->external) {
           goto exit; // We don't need to do anything if there's a non-external connection.
-        else if (linkConn->peerIdx != -1) {
+        } else if (linkConn->peerIdx != -1) {
           // Record the location of the first potentially viable external connection in the chain; we may prefer it
           // over anything we can come up with.
-          if (firstExtLinkIdx == -1)
+          if (firstExtLinkConn == nullptr) {
+            firstExtLinkConn = linkConn;
             firstExtLinkIdx = i;
-          if (linkIdx != -1)
+          }
+          if (foundLinkConn)
             break; // Break out of the loop if we already have all the data we might need.
         } // linkConn->external && linkConn->peerIdx != -1
-      } // if (!conn->experiencingDelays)
-    } // if (linkConn->connIdx != -1)
+      } // if (!linkConn->conn->experiencingDelays)
+    } // if (linkConn->conn && linkConn->conn != conn)
 
-    if (linkConn->connIdx == connIdx) {
+    if (linkConn->conn == conn) {
       if (linkConn->external)
         goto exit; // We don't add fallbacks for external connections...
-      peerIdx = linkConn->peerIdx;
-      linkIdx = i;
+      foundLinkConn = linkConn;
       // We are not breaking out of the loop here because we want to check for active connections on *all* potentially
       // viable elements (in particular, there could be some external ones beyond this one).
     }
  }
 
-  if (linkIdx == -1)
+  if (foundLinkConn == nullptr)
     goto exit;
 
   // We found an existing element so the connection is part of the link. No existing non-external connections of this
   // link are active, so a fallback is needed.
-  assert(peerIdx != -1);
-  newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0);
+  assert(foundLinkConn->peerIdx != -1);
+  newPeerIdx = rasLinkCalculatePeer(link, foundLinkConn->peerIdx, /*isFallback*/(foundLinkConn != link->conns));
 
   // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists
   // and is also experiencing delays, we need to keep iterating.
   while (newPeerIdx != -1) {
-    int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr);
+    struct rasConnection* newConn = rasConnFind(&rasPeers[newPeerIdx].addr);
+    int linkIdx;
+    struct rasLinkConn* newLinkConn;
 
     // If we previously found a potential external fallback connection, check if it's better than what we just found.
-    if (firstExtLinkIdx != -1) {
+    if (firstExtLinkConn) {
       linkIdx = -1;
       // Calculate the index that the newly found fallback would have (pretend mode).
-      NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true,
-                                  &linkIdx));
+      NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/true, &linkIdx));
       assert(linkIdx != -1);
       if (firstExtLinkIdx < linkIdx) {
         // The external connection *is* better -- use it as a fallback instead and be done.
-        link->conns[firstExtLinkIdx].external = false;
+        firstExtLinkConn->external = false;
         goto exit;
       }
     }
 
-    NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false,
-                                &linkIdx));
-    if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx)
-      firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index.
+    NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/false, &linkIdx, &newLinkConn));
+    if (firstExtLinkConn && linkIdx <= firstExtLinkIdx)
+      firstExtLinkIdx++; // Adjust if we inserted a new entry ahead of this one.
 
     INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s",
-         link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"),
+         link->direction, (newConn == nullptr ? "opening new" : "calculated existing"),
          linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine));
 
     // Note that we don't follow here our convention of "lower address is the one establishing connections" --
     // that convention is for optimizing regular operations, but we don't want to take chances during fault
     // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those.
-    if (newConnIdx == -1)
-      NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx));
+    if (newConn == nullptr) {
+      NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &newConn));
+      newLinkConn->conn = newConn;
+    }
 
-    struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx;
     // If the fallback connection is also experiencing delays, we need to keep trying.
-    if (!conn->experiencingDelays)
+    if (!newConn->experiencingDelays)
       break;
 
     INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d",
-         conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0),
-         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+         newConn->experiencingDelays, (newConn->startRetryTime ? (clockNano()-newConn->startRetryTime)/1e9 : 0.0),
+         (newConn->sock ? newConn->sock->status : -1));
 
     newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true);
   }
-  if (newPeerIdx == -1)
-    INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns);
+  if (newPeerIdx == -1) {
+    int nConns = 0;
+    for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next)
+      nConns++;
+    INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (total %d)", link->direction, nConns);
+  }
 
 exit:
   return ncclSuccess;
 }
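The loop above walks candidate peers in ring order, reusing an existing connection where one exists and opening a new one otherwise, and keeps going while the candidate itself is experiencing delays. A toy reduction of that walk; the 4-peer table and helpers are hypothetical stand-ins for rasConnFind/rasConnCreate/rasLinkCalculatePeer:

```
#include <cstdio>

struct Conn { bool experiencingDelays; };

// Toy 4-peer ring: peer 1 already has a connection, but it is delayed.
static Conn conns[4] = {{false}, {true}, {false}, {false}};
static bool exists[4] = {false, true, false, false};

static int nextPeer(int peer) { return peer < 3 ? peer + 1 : -1; }  // Stand-in for rasLinkCalculatePeer.

static Conn* addFallback(int fromPeer) {
  for (int peer = nextPeer(fromPeer); peer != -1; peer = nextPeer(peer)) {
    Conn* conn = exists[peer] ? &conns[peer] : nullptr;              // "rasConnFind"
    if (conn == nullptr) { exists[peer] = true; conn = &conns[peer]; }  // "rasConnCreate"
    if (!conn->experiencingDelays) return conn;  // Viable fallback; stop here.
    // This candidate is struggling too; keep walking the ring.
  }
  return nullptr;  // Candidates exhausted.
}

int main() {
  printf("fallback found: %d\n", addFallback(0) != nullptr ? 1 : 0);  // Lands on peer 2.
  return 0;
}
```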
@@ -958,7 +960,7 @@ exit:
 // Invoked when we receive a message over a connection that was just activated or was experiencing delays.
 // Cleans up the fallbacks, timers, etc, as appropriate.
 static void rasConnResume(struct rasConnection* conn) {
-  if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) {
+  if (conn->sock && conn->sock->status == RAS_SOCK_READY) {
     INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)",
          (conn->experiencingDelays && conn->startRetryTime == 0 ? "recovered" : "established"),
          ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "),
@@ -972,218 +974,362 @@ static void rasConnResume(struct rasConnection* conn) {
     rasLinkSanitizeFallbacks(&rasPrevLink);
 
     if (!ncclIntruQueueEmpty(&conn->sendQ))
-      rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT;
+      rasPfds[conn->sock->pfd].events |= POLLOUT;
   }
 }
 
 // Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed).
 static void rasLinkSanitizeFallbacks(struct rasLink* link) {
-  if (link->nConns > 0 && link->conns[0].connIdx != -1) {
-    struct rasConnection* conn = rasConns+link->conns[0].connIdx;
-    if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) {
+  if (link->conns && link->conns->conn) {
+    struct rasConnection* conn = link->conns->conn;
+    if (conn->sock && conn->sock->status == RAS_SOCK_READY && !conn->experiencingDelays) {
       // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the
       // keepAlive messages).
-      for (int i = 1; i < link->nConns; i++) {
+      int i = 1;
+      for (struct rasLinkConn* linkConn = link->conns->next; linkConn; i++) {
+        struct rasLinkConn* linkConnNext = linkConn->next;
         INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
-             link->direction, (link->conns[i].external ? "external " : ""), i,
-             ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine));
+             link->direction, (linkConn->external ? "external " : ""), i,
+             ncclSocketToString(&linkConn->conn->addr, rasLine));
+        free(linkConn);
+        linkConn = linkConnNext;
       }
-      link->nConns = 1;
+      link->conns->next = nullptr;
       link->lastUpdatePeersTime = 0;
     }
  }
 }
 
-// Attempt to drop a connection from a link.
-static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) {
-  if (linkIdx == -1)
-    linkIdx = rasLinkFindConn(link, connIdx);
-  if (linkIdx != -1) {
-    if (linkIdx == 0) {
-      INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s",
-           link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine));
-    } else {
-      INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
-           link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx,
-           ncclSocketToString(&rasConns[connIdx].addr, rasLine));
-    }
-    memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns));
-    if (link->nConns > 1)
-      link->nConns--;
-    else {
-      link->conns[0].peerIdx = link->conns[0].connIdx = -1;
-    }
-
-    if (linkIdx == 0) {
-      // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if
-      // the remote peer loses interest in it).
-      link->conns[0].external = false;
-      if (link->conns[0].connIdx != -1) {
-        INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary",
-             link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine));
-      }
-      rasLinkSanitizeFallbacks(link);
-    }
-  }
-}
-
-// Checks if a given connection is a member of this link and if so, returns its entry index.
-// Returns -1 if connection not found.
-static int rasLinkFindConn(const struct rasLink* link, int connIdx) {
-  for (int i = 0; i < link->nConns; i++) {
-    if (link->conns[i].connIdx == connIdx)
-      return i;
-  }
-  return -1;
-}
-
-// Note: the behavior of this function has become super-complex and so it should be considered for refactoring.
-// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is
-// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also
-// be -1 (the latter are stored at the end).
-// external provides an updated value for the entry's external field. A false value, if requested, is always set;
-// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry
-// already exists and the function is invoked with external == true, the new value will be ignored.
-// If insert is set, it will, if necessary, insert a new entry if one is not already there.
-// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate.
-// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored.
-// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external).
-// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed
-// (the entry's external must match the argument external for it to be removed).
-ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert,
-                               bool pretend, int* pLinkIdx) {
+// Adds an entry to a RAS network link (or updates one, if it already exists).
+// conn can be nullptr if the connection doesn't exist (yet).
+// peerIdx *cannot* be -1 when this function is invoked.
+// If pretend is true, the function will not modify the list and will just set *pLinkIdx and *pLinkConn as appropriate.
+// pLinkIdx and pLinkConn are (optional) pointers to the results; the index/address of the added/updated entry are
+// stored there.
+// insert (true by default) determines whether this is an "add" function (as implied by the name) or an "update" --
+// if set to false, it will refuse to add a new entry (but will update an existing one as needed).
+// Note: there is some code duplication between this function and rasLinkConnAddExternal so changes to one of them
+// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the
+// logic was extremely difficult to follow then.
+static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend,
+                                   int* pLinkIdx, struct rasLinkConn** pLinkConn, bool insert) {
+  struct rasLinkConn* oldLinkConn = nullptr;
+  struct rasLinkConn* linkConnPrev = nullptr;
   int i, oldLinkIdx = -1;
 
-  if (external && connIdx != -1)
-    insert = true;
+  assert(peerIdx != -1);
+  if (conn) {
+    // Start by checking if we already have an element with this conn.
+    oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx);
+    if (oldLinkConn) {
+      if (pLinkConn)
+        *pLinkConn = oldLinkConn;
+      if (oldLinkConn->peerIdx != -1) {
+        assert(oldLinkConn->peerIdx == peerIdx);
 
-  if (connIdx != -1) {
-    // Start by checking if we already have an element with this connIdx.
-    oldLinkIdx = rasLinkFindConn(link, connIdx);
-    if (oldLinkIdx != -1) {
-      struct rasLinkConn* linkConn = link->conns+oldLinkIdx;
-      if (linkConn->peerIdx != -1)
-        assert(linkConn->peerIdx == peerIdx);
-
-      if (linkConn->peerIdx == peerIdx) {
-        if (!external && !pretend)
-          linkConn->external = false; // Ensure that external is cleared if so requested.
+        if (!pretend)
+          oldLinkConn->external = false; // Ensure that external is cleared.
         if (pLinkIdx)
           *pLinkIdx = oldLinkIdx;
-        goto exit; // Nothing more to do if both connIdx and peerIdx are up to date.
-      }
+        goto exit; // Nothing more to do if both conn and peerIdx are up to date.
+      } // if (oldLinkConn->peerIdx != -1)
-      // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong
-      // place in the array -- we need to find the right spot. linkConn->peerIdx == -1 can only happen for external
-      // connections.
-      assert(external);
-    }
-  }
+      // Otherwise oldLinkConn->peerIdx == -1. The oldLinkConn is in a wrong place in the list -- we need to find
+      // the right spot. This can happen only for external connections.
+    } // if (oldLinkConn)
+  } // if (conn)
 
-  if (peerIdx != -1) {
-    // Search for the right spot in the conns array.
-    for (i = 0; i < link->nConns; i++) {
-      struct rasLinkConn* linkConn = link->conns+i;
-      if (peerIdx != -1 && linkConn->peerIdx == peerIdx) {
-        // The exact conn element already exists.
-        if (connIdx == -1 && !insert) {
-          // Drop the connection from the link.
-          if (linkConn->external == external) {
-            if (!pretend)
-              rasLinkDropConn(link, linkConn->connIdx, i);
-            else if (pLinkIdx)
-              *pLinkIdx = i;
-          }
-        } else { // connIdx != -1 || insert
-          if (!pretend) {
-            if (linkConn->connIdx != -1)
-              assert(linkConn->connIdx == connIdx);
-            else
-              linkConn->connIdx = connIdx;
-            if (!external)
-              linkConn->external = false; // Ensure that external is cleared if so requested.
-            if (i == 0) {
-              // We received a connection from the remote peer that matches the primary connection we've been
-              // waiting for.
-              rasLinkSanitizeFallbacks(link);
-            }
-          } // if (!pretend)
-          if (pLinkIdx)
-            *pLinkIdx = i;
-        } // connIdx != -1 || insert
-        goto exit;
-      } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx)
-      if (!insert)
-        continue;
-      // Ensure that the i-1 index is also valid.
-      if (i == 0)
-        continue;
-      // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them.
-      if (peerIdx != -1 && linkConn->peerIdx == -1)
-        break;
-      // Detect a roll-over and handle it specially.
-      if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) {
-        if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 ||
-            link->direction * (peerIdx - linkConn->peerIdx) < 0)
-          break;
-      } else { // Regular, monotonic case with the peerIdx value between two existing elements.
-        if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 &&
-            link->direction * (peerIdx - linkConn->peerIdx) < 0)
-          break;
-      }
-    } // for (i)
-  } else {
-    // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections.
+  // Search for the right spot in the conns list.
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (linkConn->peerIdx == peerIdx) {
+      // The exact linkConn element already exists.
+      if (linkConn->conn)
+        assert(linkConn->conn == conn);
+      if (!pretend) {
+        if (linkConn->conn == nullptr)
+          linkConn->conn = conn;
+        linkConn->external = false; // Ensure that external is cleared.
+        if (linkConn == link->conns) {
+          // We received a connection from the remote peer that matches the primary connection we've been
+          // waiting for.
+          rasLinkSanitizeFallbacks(link);
+        }
+      } // if (!pretend)
+      if (pLinkIdx)
+        *pLinkIdx = i;
+      if (pLinkConn)
+        *pLinkConn = linkConn;
+      goto exit;
+    } // if (linkConn->peerIdx == peerIdx)
+
+    // Ensure that the previous element is valid.
+    if (linkConnPrev == nullptr)
+      continue;
+    // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done.
+    if (linkConn->peerIdx == -1)
+      break;
+    // Detect a roll-over and handle it specially.
+    if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) {
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 ||
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 &&
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    }
+  } // for (linkConn)
-    assert(external && oldLinkIdx == -1);
-    i = link->nConns;
-  }
-  if (!insert)
-    goto exit;
 
-  // i holds the index at which to insert a new element.
-  if (pretend) {
-    if (pLinkIdx)
-      *pLinkIdx = i;
-    goto exit;
-  }
-
-  if (oldLinkIdx == -1) {
-    struct rasLinkConn* linkConn;
-    if (link->nConns == link->connsSize) {
-      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
-      link->connsSize += RAS_INCREMENT;
-    }
-    linkConn = link->conns+i;
-    // Shift existing conns with indices >= i to make room for the new one.
-    memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns));
-    linkConn->peerIdx = peerIdx;
-    linkConn->connIdx = connIdx;
-    linkConn->external = external;
-    if (external) {
-      INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i,
-           ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine));
-    }
-    link->nConns++;
-  }
-  else { // oldLinkIdx > -1
-    // We already have the conn, we just need to move it to a new spot.
-    struct rasLinkConn* linkConn = link->conns+i;
-    assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1.
-    if (i != oldLinkIdx) {
-      struct rasLinkConn tmp;
-      struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler.
-      // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns
-      // with indices in the range [i, oldLinkIdx).
-      memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp));
-      memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn));
-      memcpy(linkConn, &tmp, sizeof(*linkConn));
-    }
-    if (!external)
-      linkConn->external = false; // Ensure that external is cleared if so requested.
-  } // oldLinkIdx > -1
+  // The new element should be inserted after linkConnPrev (which is at index i-1).
   if (pLinkIdx)
     *pLinkIdx = i;
+  if (pretend)
+    goto exit;
+
+  if (oldLinkConn) {
+    if (i != oldLinkIdx) {
+      // We already have the entry, but we need to move it to a new spot (which must be earlier in the list).
+      assert(i < oldLinkIdx);
+      // Remove oldLinkConn from its old spot.
+      for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) {
+        if (linkConn->next == oldLinkConn) {
+          linkConn->next = oldLinkConn->next;
+          break;
+        }
+      } // for (linkConn)
+      // Insert it at its new spot.
+      oldLinkConn->next = linkConnPrev->next;
+      linkConnPrev->next = oldLinkConn;
+    } // if (i != oldLinkIdx)
+    oldLinkConn->peerIdx = peerIdx;
+    oldLinkConn->external = false;
+  } else if (insert) {
+    struct rasLinkConn* linkConn;
+    NCCLCHECK(ncclCalloc(&linkConn, 1));
+    if (linkConnPrev) {
+      linkConn->next = linkConnPrev->next;
+      linkConnPrev->next = linkConn;
+    } else {
+      assert(link->conns == nullptr); // We never add an element that would replace an existing primary.
+      link->conns = linkConn;
+      // linkConn->next is already nullptr.
+    }
+    linkConn->peerIdx = peerIdx;
+    linkConn->conn = conn;
+    linkConn->external = false;
+    if (pLinkConn)
+      *pLinkConn = linkConn;
+  } // oldLinkConn == nullptr && insert
 
 exit:
   return ncclSuccess;
 }
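The ordering test inside the search loop above is the subtlest part of rasLinkConnAdd: entries are sorted by peerIdx in the link's direction, but indices wrap around the ring, so a descending step between two neighbors marks the wrap point and inverts the membership test. A self-contained sketch of that predicate with a few checks; insertBetween is a hypothetical helper, not part of the sources:

```
#include <cassert>

// Should a new entry with peer index 'newPeer' be inserted between two
// adjacent entries 'prevPeer' and 'nextPeer'? 'dir' is the link direction
// (+1 or -1), matching link->direction in the code above.
static bool insertBetween(int prevPeer, int nextPeer, int newPeer, int dir) {
  if (dir * (prevPeer - nextPeer) > 0) {
    // Roll-over between prev and next: newPeer fits if it lies beyond prev
    // or before next in the direction of travel.
    return dir * (newPeer - prevPeer) > 0 || dir * (newPeer - nextPeer) < 0;
  }
  // Regular, monotonic case: newPeer must lie strictly between the two.
  return dir * (newPeer - prevPeer) > 0 && dir * (newPeer - nextPeer) < 0;
}

int main() {
  assert(insertBetween(3, 5, 4, +1));   // 3 < 4 < 5: fits.
  assert(!insertBetween(3, 5, 7, +1));  // 7 lies past 5: keep scanning.
  assert(insertBetween(7, 1, 0, +1));   // Wrap: 7 -> 0 -> 1 on a small ring.
  return 0;
}
```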
+
+// Adds an external entry to a RAS network link (or updates one, if one already exists).
+// conn *cannot* be nullptr when this function is invoked.
+// peerIdx can be -1 if unknown (possible in case of a race condition between keepAlive and peers update).
+// Note: there is some code duplication between this function and rasLinkConnAdd so changes to one of them
+// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the
+// logic was extremely difficult to follow then.
+static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx) {
+  struct rasLinkConn* oldLinkConn = nullptr;
+  struct rasLinkConn* linkConnPrev = nullptr;
+  int i, oldLinkIdx = -1;
+
+  assert(conn);
+  oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx);
+  if (oldLinkConn) {
+    if (oldLinkConn->peerIdx != -1)
+      assert(oldLinkConn->peerIdx == peerIdx);
+
+    if (oldLinkConn->peerIdx == peerIdx)
+      goto exit; // Nothing more to do if both conn and peerIdx are up to date. Note that we neither check nor
+                 // update the value of external here.
+
+    // Otherwise (oldLinkConn->peerIdx == -1 && peerIdx != -1) oldLinkConn, due to its -1 peerIdx, is in
+    // a wrong place in the list -- we need to find the right spot. oldLinkConn->peerIdx == -1 can only happen for
+    // external connections.
+  } // if (oldLinkConn)
+
+  // Search for the right spot in the conns list.
+  i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (peerIdx == -1) {
+      // We simply want to find the end of the list so that we can insert a new entry with -1 peerIdx there.
+      continue;
+    }
+    if (linkConn->peerIdx == peerIdx) {
+      // The exact linkConn element already exists.
+      if (linkConn->conn)
+        assert(linkConn->conn == conn);
+      if (linkConn->conn == nullptr)
+        linkConn->conn = conn;
+      if (linkConn == link->conns) {
+        // We received a connection from the remote peer that matches the primary connection we've been
+        // waiting for. This shouldn't trigger for external connections (rasLinkConnUpdate should be invoked first,
+        // which will update the entry's conn, so rasLinkConnFind invoked at the top of this function should succeed),
+        // but better safe than sorry...
+        rasLinkSanitizeFallbacks(link);
+      }
+      goto exit;
+    } // if (linkConn->peerIdx == peerIdx)
+
+    // Ensure that the previous element is valid.
+    if (linkConnPrev == nullptr)
+      continue;
+    // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done.
+    if (linkConn->peerIdx == -1)
+      break;
+    // Detect a roll-over and handle it specially.
+    if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) {
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 ||
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+      if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 &&
+          link->direction * (peerIdx - linkConn->peerIdx) < 0)
+        break;
+    }
+  } // for (linkConn)
+
+  // The new element should be inserted after linkConnPrev (which is at index i-1).
+  if (oldLinkConn) {
+    if (i != oldLinkIdx) {
+      // We already have the entry, but we need to move it to a new spot (which must be earlier in the list).
+      assert(i < oldLinkIdx);
+      INFO(NCCL_RAS, "RAS link %d: moving %sfallback connection with %s from %d to %d", link->direction,
+           (oldLinkConn->external ? "external " : ""), ncclSocketToString(&conn->addr, rasLine), oldLinkIdx, i);
+      // Remove oldLinkConn from its old spot.
+      for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) {
+        if (linkConn->next == oldLinkConn) {
+          linkConn->next = oldLinkConn->next;
+          break;
+        }
+      } // for (linkConn)
+      // Insert it at its new spot.
+      oldLinkConn->next = linkConnPrev->next;
+      linkConnPrev->next = oldLinkConn;
+    } // if (i != oldLinkIdx)
+    oldLinkConn->peerIdx = peerIdx;
+    oldLinkConn->external = false;
+  } else { // oldLinkConn == nullptr
+    struct rasLinkConn* linkConn;
+    NCCLCHECK(ncclCalloc(&linkConn, 1));
+    if (linkConnPrev) {
+      INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i,
+           ncclSocketToString(&conn->addr, rasLine));
+      linkConn->next = linkConnPrev->next;
+      linkConnPrev->next = linkConn;
+      linkConn->external = true;
+    } else {
+      INFO(NCCL_RAS, "RAS link %d: adding external fallback with %s as a new primary connection", link->direction,
+           ncclSocketToString(&conn->addr, rasLine));
+      linkConn->next = link->conns;
+      link->conns = linkConn;
+      linkConn->external = false; // Primary connections are never external.
+    }
+    linkConn->peerIdx = peerIdx;
+    linkConn->conn = conn;
+  } // oldLinkConn == nullptr
+
+exit:
+  return ncclSuccess;
+}
+
+// Updates an existing entry in a RAS network link, if any.
+// Basically an easy-to-use variant of rasLinkConnAdd.
+// For this function, conn cannot be a nullptr and peerIdx cannot be -1.
+ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx) {
+  assert(conn && peerIdx != -1);
+
+  NCCLCHECK(rasLinkConnAdd(link, conn, peerIdx, /*pretend*/false, /*pLinkIdx*/nullptr, /*pLinkConn*/nullptr,
+                           /*insert*/false));
+  return ncclSuccess;
+}
+
+// Attempts to drop a connection from a link.
+// If the optional external argument is true, it will drop a connection only if its external flag is set
+// (otherwise the flag is ignored and a connection is always dropped if found).
+static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external) {
+  struct rasLinkConn* linkConnPrev = nullptr;
+  int i = 0;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) {
+    if (linkConn->conn == conn && (!external || linkConn->external)) {
+      if (linkConnPrev) {
+        INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
+             link->direction, (linkConn->external ? "external " : ""), i,
+             ncclSocketToString(&conn->addr, rasLine));
+        linkConnPrev->next = linkConn->next;
+        free(linkConn);
+      } else { // linkConnPrev == nullptr
+        INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s",
+             link->direction, ncclSocketToString(&conn->addr, rasLine));
+        if (linkConn->next) {
+          link->conns = linkConn->next;
+          // Ensure that the conn becoming the primary is not marked as external (we don't want to lose it if
+          // the remote peer loses interest in it).
+          link->conns->external = false;
+          if (link->conns->conn)
+            INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary",
+                 link->direction, ncclSocketToString(&link->conns->conn->addr, rasLine));
+          rasLinkSanitizeFallbacks(link);
+          free(linkConn);
+        } else { // linkConn->next == nullptr
+          // We prefer the primary entry to always be present, even if empty.
+ linkConn->peerIdx = -1; + linkConn->conn = nullptr; + } // linkConn->next == nullptr + } // linkConnPrev == nullptr + break; + } // if (linkConn->conn == conn) + } // for (linkConn) +} + +// Checks if a given connection is a member of this link and if so, returns its link entry. +// Optionally returns the position of the connection in the conns list. +// Returns nullptr if connection not found. +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx) { + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { + if (linkConn->conn == conn) { + if (pLinkIdx) + *pLinkIdx = i; + return linkConn; + } + } + if (pLinkIdx) + *pLinkIdx = -1; + return nullptr; +} + +// Invoked during RAS termination to release all the allocated resources. +void rasNetTerminate() { + for (struct rasLinkConn* linkConn = rasNextLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + for (struct rasLinkConn* linkConn = rasPrevLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + rasNextLink.conns = rasPrevLink.conns = nullptr; + rasNextLink.lastUpdatePeersTime = rasPrevLink.lastUpdatePeersTime = 0; + + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + rasConnTerminate(conn); + conn = connNext; + } + // rasConnsHead and rasConnsTail are taken care of by rasConnTerminate(). + + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + rasSocketTerminate(sock); + sock = sockNext; + } + // rasSocketsHead and rasSocketsTail are taken care of by rasSocketTerminate(). +} diff --git a/src/register/register.cc b/src/register/register.cc index 9e8f6ea..930367a 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -92,8 +92,8 @@ static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) { } } if (reg->state & NVLS_REG_COMPLETE) { - if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) { - WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize); + if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize) != ncclSuccess) { + WARN("rank %d deregister NVLS buffer %p dev %d ucsize %ld mcsize %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize); } reg->regAddr = (CUdeviceptr)NULL; } diff --git a/src/transport.cc b/src/transport.cc index 5629ce7..f98b77a 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -11,11 +11,12 @@ #include "timer.h" #include "transport.h" -struct ncclTransport* ncclTransports[NTRANSPORTS] = { +struct ncclTransport* ncclTransports[NTRANSPORTS+1] = { &p2pTransport, &shmTransport, &netTransport, - &collNetTransport + &collNetTransport, + &profilerTransport // Not really used for transport, only to create proxy ops polling on profiler counters. 
}; template @@ -111,12 +112,14 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* gettimeofday(&timeStart, NULL); timeLast = timeStart; // struct copy bool timeReported = false; + cudaStream_t hostStream, deviceStream; NCCLCHECK(ncclCalloc(&data, maxPeers)); NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail); NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); @@ -195,7 +198,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -214,7 +217,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. 
*/ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -286,8 +289,9 @@ exit: if (sendData) free(sendData); if (recvData) free(recvData); - NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); + NCCLCHECK(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); return ret; fail: goto exit; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 6718012..c1ccfca 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -103,7 +103,7 @@ struct sendResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; @@ -124,7 +124,7 @@ struct recvResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; @@ -143,9 +143,19 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG return ncclSuccess; } +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + struct setupReq { int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; @@ -168,8 +178,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -192,8 +202,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? 
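The switch from `ncclStrongStreamWaitStream` to `ncclStreamWaitStream` with a shared `scratchEvent` presumably reduces to the standard CUDA record-and-wait idiom. A self-contained sketch of that idiom using the plain CUDA runtime (not NCCL's strong-stream wrappers):

```
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t hostStream, deviceStream;
  cudaEvent_t scratchEvent;
  cudaStreamCreate(&hostStream);
  cudaStreamCreate(&deviceStream);
  cudaEventCreateWithFlags(&scratchEvent, cudaEventDisableTiming);

  // ... work enqueued on hostStream, e.g. the cudaMemcpyAsync calls above ...

  cudaEventRecord(scratchEvent, hostStream);          // mark end of hostStream work
  cudaStreamWaitEvent(deviceStream, scratchEvent, 0); // deviceStream may not pass it

  cudaStreamSynchronize(deviceStream); // deviceStream now drains after hostStream
  printf("done\n");

  cudaEventDestroy(scratchEvent);
  cudaStreamDestroy(hostStream);
  cudaStreamDestroy(deviceStream);
  return 0;
}
```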
"/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -454,6 +464,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big"); @@ -505,16 +516,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -525,10 +537,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; @@ -574,16 +594,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -600,7 +621,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { @@ -737,7 +765,7 @@ static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -779,7 +807,7 @@ static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, stru } static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); recvParts.mhandle = recvMhandle; @@ -796,7 +824,7 @@ static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = 
args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -835,7 +863,7 @@ static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, } static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); sendParts.mhandle = sendMhandle; @@ -1150,6 +1178,7 @@ struct collnetRegInfo { static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; + int gdrEnable = -1; if (regRecord) { if (regRecord->state & COLLNET_REG_COMPLETE) { // reuse previous registration @@ -1165,6 +1194,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + gdrEnable = 1; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; @@ -1174,7 +1204,8 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } } else { - WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); + gdrEnable = 0; + goto fail; } } } @@ -1183,6 +1214,7 @@ exit: fail: *outRegBufFlag = 0; *outHandle = NULL; + INFO(NCCL_REG, "rank %d - COLLNET failed to register userbuff %p, buffSize %ld, type %s, GDR %d", comm->rank, userbuff, buffSize, type == collNetRecv ? 
"Recv" : "Send", gdrEnable); goto exit; } @@ -1268,17 +1300,20 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1301,17 +1336,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1600,4 +1638,4 @@ struct ncclTransport collNetTransport = { canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; \ No newline at end of file +}; diff --git a/src/transport/net.cc b/src/transport/net.cc index 8760b42..40d334f 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -92,7 +92,7 @@ struct sendNetResources { int tpLocalRank; int tpRemoteRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; @@ -123,7 +123,7 @@ struct recvNetResources { int tpRemoteRank; int tpRemoteProxyRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; int maxRecvs; @@ -168,7 +168,7 @@ struct setupReq { int tpRemoteRank; int shared; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; int channelId; int connIndex; @@ -180,6 +180,16 @@ static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large // Forward 
declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { @@ -204,11 +214,14 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + proxyRank, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); @@ -247,18 +260,19 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? 
"/Shared" : ""); return ncclSuccess; } -static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) { - NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); +static ncclResult_t netMapShm(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct connectMapMem* mem) { + NCCLCHECK(ncclShmImportShareableBuffer(comm, proxyConn->rank, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); return ncclSuccess; } static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) { - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); + NCCLCHECK(ncclShmAllocateShareableBuffer(mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); return ncclSuccess; } @@ -292,6 +306,7 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; + int trafficClass; }; struct netRecvConnectArgs { @@ -315,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); + args.trafficClass = comm->config.trafficClass; NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -343,7 +359,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { - if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM)); + if (!map->sameProcess) NCCLCHECK(netMapShm(comm, &send->proxyConn, map->mems + NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank, @@ -692,9 +708,11 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; + commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass; NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers @@ -714,15 +732,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); + if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -748,7 +766,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); + NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { @@ -765,7 +783,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); @@ -820,7 +838,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
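On the application side, the value that arrives in `req->trafficClass` above is taken from the communicator config in `sendConnect`. A minimal sketch, assuming an NCCL build whose `ncclConfig_t` exposes the `trafficClass` field (as the `comm->config.trafficClass` reference suggests); the value 105 is an arbitrary, plugin-defined example:

```
#include <nccl.h>

// Hypothetical helper: create a communicator that requests a specific QoS
// level from the network plugin.
ncclResult_t initCommWithQos(ncclComm_t* comm, int nranks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.trafficClass = 105; // meaningful values are plugin-defined
  return ncclCommInitRankConfig(comm, nranks, id, rank, &config);
}
```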
NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path @@ -904,7 +922,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); + NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { @@ -915,14 +933,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (proxyState->allocP2pNetLLBuffers) { - NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*devMem*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } @@ -964,7 +982,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path @@ -1175,11 +1193,12 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. 
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); sub->transSize += size; sub->transmitted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; @@ -1280,6 +1299,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; + void* phandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; int postedStepId = sub->posted; @@ -1323,6 +1343,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; + phandles[subCount] = sub; subCount++; } } @@ -1332,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; - NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; @@ -1341,6 +1362,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } @@ -1558,7 +1580,7 @@ exit: return ret; fail: *outRegBufFlag = 0; - WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + INFO(NCCL_REG, "rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); goto exit; } @@ -1639,7 +1661,7 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, 
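Note that NCCL passes the proxy sub-op pointer (`sub`) to the plugin as the opaque `pHandle`. A plugin that wants network-defined profiler events threads it back into the callback received at init time. A hedged sketch with stub types; the callback signature below is mirrored from the usage in this patch, not from an official header:

```
#include <cstddef>

// Stub declarations standing in for the real NCCL net/profiler headers.
typedef int ncclResult_t;
const ncclResult_t ncclSuccess = 0;
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle,
                                               int pluginId, void* extData);

static ncclProfilerCallback_t gProfilerCb; // saved from init(logFunction, profFunction)

struct myEventDescr { int kind; size_t bytes; }; // plugin-defined event payload
const int kMyPluginId = 0x1234;                  // hypothetical plugin id

// Start a plugin-defined event tied to NCCL's opaque per-operation handle.
// The returned eHandle must be kept so the completion path can stop the event
// with gProfilerCb(&eHandle, 1, NULL, 0, NULL).
static ncclResult_t startSendEvent(void* pHandle, size_t size, void** eHandle) {
  *eHandle = nullptr;
  if (gProfilerCb == nullptr || pHandle == nullptr) return ncclSuccess;
  myEventDescr descr = { /*kind=*/0, /*bytes=*/size };
  return gProfilerCb(eHandle, /*start=*/0, pHandle, kMyPluginId, &descr);
}
```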
(CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; @@ -1673,7 +1695,7 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bc54133..bfff6e5 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -11,6 +11,7 @@ #include "graph.h" #include "utils.h" #include "param.h" +#include "profiler/net_ib.h" #include #include @@ -85,6 +86,11 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; +#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) + +#define NCCL_IB_SL_DEFAULT 0 +#define NCCL_IB_TC_DEFAULT 0 + NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); @@ -92,8 +98,8 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); -NCCL_PARAM(IbSl, "IB_SL", 0); -NCCL_PARAM(IbTc, "IB_TC", 0); +NCCL_PARAM(IbSl, "IB_SL", -1); +NCCL_PARAM(IbTc, "IB_TC", -1); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); @@ -327,6 +333,9 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, close(fd); if (ret == -1) { + // In containerized environments, read could return EINVAL if the GID index is not mapped to the + // container sysfs. In this case return ncclSuccess and let the caller move to next GID index. 
+ if (errno == EINVAL) return ncclSuccess; WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } @@ -359,7 +368,7 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } int usrRoceVer = roceVer; - int gidRoceVerNum, gidRoceVerNumCandidate; + int gidRoceVerNum, gidRoceVerNumCandidate = -1; const char* deviceName = wrap_ibv_get_device_name(context->device); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum)); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate)); @@ -530,8 +539,8 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclIbDev* dev = ncclIbDevs + props->devs[i]; if (dev->link != dev0->link) { - WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. Try only selecting NICs with one type of link using NCCL_IB_HCA", - dev0->devName, dev0->link, dev->devName, dev->link); + WARN("NET/IB : Attempted to merge incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link)); return ncclInvalidUsage; } } @@ -548,8 +557,11 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return res; } -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { ncclResult_t ret = ncclSuccess; + ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } @@ -571,7 +583,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { struct ibv_device** devices; // Check if user defined which IB device:port to use - char* userIbEnv = getenv("NCCL_IB_HCA"); + const char* userIbEnv = ncclGetEnv("NCCL_IB_HCA"); if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; @@ -634,7 +646,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
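A standalone sketch of that container-tolerant sysfs read: EINVAL from `read()` is reported to the caller as "attribute not present" rather than a hard failure (a hypothetical helper, not the NCCL function):

```
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>

// Read a sysfs attribute into buf. Sets *found = 0 and returns success when
// read() fails with EINVAL, so callers can skip unmapped entries.
static int readSysfsAttr(const char* path, char* buf, size_t len, int* found) {
  *found = 0;
  int fd = open(path, O_RDONLY);
  if (fd == -1) return -1;
  ssize_t n = read(fd, buf, len - 1);
  int savedErrno = errno; // close() below may clobber errno
  close(fd);
  if (n == -1) {
    if (savedErrno == EINVAL) return 0; // e.g. a GID index not mapped into the container
    fprintf(stderr, "read %s failed: %s\n", path, strerror(savedErrno));
    return -1;
  }
  buf[n] = '\0';
  *found = 1;
  return 0;
}
```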
"IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); @@ -666,7 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d = 0; d < ncclNIbDevs; d++) { snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + ncclIbDevs[d].portNum, NCCL_IB_LLSTR(ncclIbDevs[d].link)); } char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", @@ -832,6 +844,8 @@ struct ncclIbConnectionMetadata { char devName[MAX_MERGED_DEV_NAME]; uint64_t fifoAddr; int ndevs; + int tc; + int sl; }; enum ncclIbCommState { @@ -873,12 +887,23 @@ struct ncclIbGidInfo { #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; +#define MAX_QPS_PER_REQ 8 +struct ncclProfilerInfo { + void* qpEventHandles[MAX_QPS_PER_REQ]; + int qpIndex[MAX_QPS_PER_REQ]; + int nEventHandles; + ncclProfilerNetIbDescr_v1_t data; +}; + struct ncclIbRequest { struct ncclIbNetCommBase* base; int type; struct ncclSocket* sock; int events[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; +#ifdef NCCL_ENABLE_NET_PROFILING + struct ncclProfilerInfo pInfo[NCCL_NET_IB_MAX_RECVS]; +#endif int nreqs; union { struct { @@ -1084,7 +1109,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc, int tc, int sl) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -1100,7 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? 
ncclParamIbFifoTc() : tc; } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1122,10 +1147,10 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.hop_limit = 255; } } - qpAttr.ah_attr.sl = ncclParamIbSl(); + qpAttr.ah_attr.sl = sl; qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; - TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u sl: %d tc: %d", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port, qpAttr.ah_attr.sl, qpAttr.ah_attr.grh.traffic_class); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1164,12 +1189,13 @@ fail: goto exit; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; + uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; @@ -1199,7 +1225,7 @@ ib_connect_check: // IB Setup struct ncclIbMergedDev* mergedDev; if (dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", dev); return ncclInternalError; } @@ -1305,8 +1331,17 @@ ib_recv_dev_list: devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; + if (link_layer != devInfo->link_layer) { + int ibDev0 = comm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + commDev->base.ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } meta.fifoAddr = (uint64_t)comm->fifo; + meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; + meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; @@ -1332,13 +1367,16 @@ ib_connect: comm->base.nRemDevs = remMeta.ndevs; - int link_layer; - link_layer = remMeta.devs[0].link_layer; - for (int i = 1; i < remMeta.ndevs; i++) { - if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't connect net devices with different link_layer. 
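The `meta.sl`/`meta.tc` expressions above encode a three-level precedence: an explicit `NCCL_IB_SL`/`NCCL_IB_TC` environment setting wins, then the communicator's trafficClass, then the compiled-in default. The same rule isolated below; the constants are stand-ins for the NCCL definitions:

```
#include <cstdio>

const int kTrafficClassUndef = -1; // stand-in for NCCL_NET_TRAFFIC_CLASS_UNDEF
const int kQosDefault = 0;         // stand-in for NCCL_IB_SL_DEFAULT / NCCL_IB_TC_DEFAULT

static int resolveQos(int envValue, int commTrafficClass) {
  if (envValue != -1) return envValue;          // NCCL_IB_SL / NCCL_IB_TC was set
  if (commTrafficClass != kTrafficClassUndef)
    return commTrafficClass;                    // communicator trafficClass
  return kQosDefault;                           // neither was set
}

int main() {
  printf("%d\n", resolveQos(-1, 105));                // 105: config value used
  printf("%d\n", resolveQos(41, 105));                // 41: env var overrides config
  printf("%d\n", resolveQos(-1, kTrafficClassUndef)); // 0: default
  return 0;
}
```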
i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d",
- i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer);
- return ncclInternalError;
+ // Ensure that the remote devices have the same link layer as the local devices used in the connection.
+ if (comm->base.vProps.ndevs > 0) {
+ int ibDev0 = comm->devs[0].base.ibDevN;
+ link_layer = ncclIbDevs[ibDev0].portAttr.link_layer;
+ for (int i = 0; i < remMeta.ndevs; i++) {
+ if (remMeta.devs[i].link_layer != link_layer) {
+ WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA",
+ NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer));
+ return ncclInternalError;
+ }
}
}
@@ -1373,7 +1411,7 @@ ib_connect:
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu);
- NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
+ NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false, remMeta.tc, remMeta.sl), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
}
@@ -1459,6 +1497,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
struct ncclIbCommStage* stage = &lComm->stage;
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
int ready;
+ int link_layer = IBV_LINK_LAYER_UNSPECIFIED;
*recvComm = NULL;
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
@@ -1497,7 +1536,7 @@ ib_recv_dev_list:
ncclNetVDeviceProps_t remoteVProps;
memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t));
if (lComm->dev >= ncclNMergedIbDevs) {
- WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev);
+ WARN("NET/IB : Trying to use non-existent virtual device %d", lComm->dev);
return ncclInternalError;
}
@@ -1555,6 +1594,13 @@ ib_recv:
ibDev = ncclIbDevs + ibDevN;
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail);
+ if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = ibDev->portAttr.link_layer;
+ if (link_layer != ibDev->portAttr.link_layer) {
+ int ibDev0 = rComm->devs[0].base.ibDevN;
+ WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA",
+ ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer));
+ return ncclInternalError;
+ }
}
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
@@ -1562,6 +1608,12 @@ ib_recv:
rComm->base.remDevs[i] = remMeta.devs[i];
rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id;
rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix;
+ if (remMeta.devs[i].link_layer != link_layer) {
+ int ibDev0 = rComm->devs[0].base.ibDevN;
+ WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. 
Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Stripe QP creation across merged devs @@ -1598,7 +1650,7 @@ ib_recv: meta.qpInfo[q].ece_supported = 0; } - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } @@ -1629,7 +1681,7 @@ ib_recv: devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail); } @@ -1646,6 +1698,8 @@ ib_recv: meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; + meta.sl = remMeta.sl; + meta.tc = remMeta.tc; for (int q = 0; q < rComm->base.nqps; q++) { meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; @@ -1842,7 +1896,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1860,6 +1914,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { wr->wr.rdma.remote_addr = slots[r].addr; wr->next = wr + 1; wr_id += (reqs[r] - comm->base.reqs) << (r*8); +#ifdef NCCL_ENABLE_NET_PROFILING + reqs[r]->pInfo[0].nEventHandles = 0; +#endif } // Write size as immediate data. 
In the case of multi-send, only write @@ -1929,6 +1986,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { } struct ibv_send_wr* bad_wr; +#ifdef NCCL_ENABLE_NET_PROFILING + // QP profiling loop + for (int r=0; rpInfo[0].nEventHandles; + reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + reqs[r]->pInfo[0].data.type = ncclProfileQp; + reqs[r]->pInfo[0].data.qp.device = devIndex; + reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; + reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; + reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; + reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + reqs[r]->pInfo[0].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; rbase.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2018,7 +2093,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot)); + NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2109,7 +2184,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2121,6 +2196,9 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->base.sock; req->nreqs = n; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int r = 0; r < n && phandles; r++) req->pInfo[r].nEventHandles = 0; +#endif for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; @@ -2141,6 +2219,19 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* for (int i = 0; i < nqps; i++) { struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); +#ifdef NCCL_ENABLE_NET_PROFILING + // Start a QP event for every request in the multirecv and every qp + for (int r = 0; r < n && phandles; r++) { + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + req->pInfo[r].data.type = ncclProfileQp; + req->pInfo[r].data.qp.device = qp->devIndex; + req->pInfo[r].data.qp.wr_id = wr.wr_id; + req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + req->pInfo[r].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps; 
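The profiling hooks added around the verbs calls bracket each work request: before `ibv_post_send`/`ibv_post_recv` an event is started per QP and its slot recorded (the `qpIndex` ring indexed by `nEventHandles % MAX_QPS_PER_REQ`), and on completion the `qp_num` from the CQ entry is mapped back to that slot (`getReqQpIndex`, further below) so the event can be stopped with a type-1 callback. A standalone sketch of the bookkeeping, with stand-in names:

```
#define MAX_QPS_PER_REQ 8

struct qpEventRing {
  void* handles[MAX_QPS_PER_REQ]; // one profiler event per posted QP
  int   qpIndex[MAX_QPS_PER_REQ]; // which QP each slot was posted on
  int   n;                        // events started so far
};

// Post side: remember the event handle and QP index in the next ring slot
// (wrapping at MAX_QPS_PER_REQ, as the indexing above does).
static void trackQpEvent(qpEventRing* ring, int qpIndex, void* eHandle) {
  ring->qpIndex[ring->n % MAX_QPS_PER_REQ] = qpIndex;
  ring->handles[ring->n % MAX_QPS_PER_REQ] = eHandle;
  ring->n++;
}

// Completion side: a CQ entry only carries qp_num, so scan the recorded slots
// for the QP with that number (qpNumOf stands in for base->qps[i].qp->qp_num).
static int findEventSlot(const qpEventRing* ring, unsigned qpNum,
                         unsigned (*qpNumOf)(int qpIndex)) {
  for (int i = 0; i < MAX_QPS_PER_REQ; i++)
    if (qpNumOf(ring->qpIndex[i]) == qpNum) return i;
  return 0; // mirror the fallback in getReqQpIndex below
}
```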
   }
@@ -2196,6 +2287,16 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**

 #define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)

+#ifdef NCCL_ENABLE_NET_PROFILING
+static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
+  for (int i = 0; i < MAX_QPS_PER_REQ; i++) {
+    int qpIndex = req->pInfo[request].qpIndex[i];
+    if (req->base->qps[qpIndex].qp->qp_num == qpNumber) return i;
+  }
+  return 0;
+}
+#endif
+
 ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
   *done = 0;
@@ -2205,11 +2306,24 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
       TRACE(NCCL_NET, "r=%p done", r);
       *done = 1;
       if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
-        for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i];
+        for (int i=0; i<r->nreqs; i++) {
+          sizes[i] = r->recv.sizes[i];
+#ifdef NCCL_ENABLE_NET_PROFILING
+          for (int j = 0; j < r->pInfo[i].nEventHandles; j++) {
+            NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL));
+          }
+#endif
+        }
       }
       if (sizes && r->type == NCCL_NET_IB_REQ_SEND) {
         sizes[0] = r->send.size;
+#ifdef NCCL_ENABLE_NET_PROFILING
+        for (int j = 0; j < r->pInfo[0].nEventHandles; j++) {
+          NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL));
+        }
+#endif
       }
+      // Stop all remaining QP events for this request
       NCCLCHECK(ncclIbFreeRequest(r));
       return ncclSuccess;
     }
@@ -2264,6 +2378,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
             return ncclInternalError;
           }
           sendReq->events[i]--;
+#ifdef NCCL_ENABLE_NET_PROFILING
+          // Stop QP event for sendReq
+          NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL));
+#endif
         }
       } else {
         if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
@@ -2276,6 +2394,12 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
           }
         }
         req->events[i]--;
+#ifdef NCCL_ENABLE_NET_PROFILING
+        // Stop QP event for workFifo
+        for (int j = 0; j < req->nreqs; j++) {
+          NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL));
+        }
+#endif
       }
     }

     // Once the IB fatal event is reported in the async thread, we want to propagate this error
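The net_ib.cc changes above all follow one pairing: a QP event is opened (state 0) when work is posted to a queue pair, and closed (state 1) when the matching completion is polled in `ncclIbTest`. The sketch below illustrates that contract with simplified stand-in types; the real `ncclProfilerCallback_t` and event-descriptor definitions live in NCCL's profiler headers and differ in detail.

```
/* Sketch only: stand-in types, not NCCL's real definitions. */
#include <stdio.h>

/* Shape of the callback as used above: state 0 starts an event and returns a
 * handle through eHandle; state 1 stops the event identified by eHandle. */
typedef int (*profilerCallback_t)(void** eHandle, int state, void* pHandle,
                                  int pluginId, void* extData);

static int demoCallback(void** eHandle, int state, void* pHandle,
                        int pluginId, void* extData) {
  (void)pHandle; (void)pluginId; (void)extData;
  if (state == 0) { *eHandle = (void*)0x1; printf("event started\n"); }
  else            { printf("event stopped\n"); }
  return 0;
}

int main(void) {
  profilerCallback_t cb = demoCallback;
  void* qpEvent = NULL;
  void* pHandle = NULL;                      /* handle NCCL passed to isend/irecv */
  cb(&qpEvent, 0, pHandle, /*pluginId=*/0, /*descr=*/NULL); /* before posting  */
  /* ... post the work request, poll the completion queue ... */
  cb(&qpEvent, 1, NULL, 0, NULL);            /* after the completion is seen   */
  return 0;
}
```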
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 235dee8..8034d95 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -9,6 +9,7 @@
 #include "socket.h"
 #include "net.h"
 #include "param.h"
+#include "profiler/net_socket.h"

 #include <...>
 #include <...>
@@ -35,7 +36,10 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) {
   return ncclSuccess;
 }

-ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
+static ncclProfilerCallback_t ncclProfilerFunction;
+
+ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
+  ncclProfilerFunction = profFunction;
   if (ncclNetIfs == -1) {
     pthread_mutex_lock(&ncclNetSocketLock);
     if (ncclNetIfs == -1) {
@@ -158,6 +162,11 @@ struct ncclNetSocketTask {
   ncclResult_t result;
 };

+struct ncclProfilerInfo {
+  void* eHandle;
+  void* pHandle;
+};
+
 struct ncclNetSocketRequest {
   int op;
   void* data;
@@ -168,6 +177,7 @@ struct ncclNetSocketRequest {
   struct ncclNetSocketComm* comm;
   struct ncclNetSocketTask* tasks[MAX_SOCKETS];
   int nSubs;
+  struct ncclProfilerInfo pInfo;
 };

 struct ncclNetSocketTaskQueue {
@@ -180,6 +190,7 @@ struct ncclNetSocketThreadResources {
   struct ncclNetSocketTaskQueue threadTaskQueue;
   int stop;
   struct ncclNetSocketComm* comm;
+  struct ncclProfilerInfo* pInfo;
   pthread_mutex_t threadLock;
   pthread_cond_t threadCond;
 };
@@ -210,6 +221,9 @@ void* persistentSocketThread(void *args_) {
   struct ncclNetSocketComm* comm = resource->comm;
   struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
   int nSocksPerThread = comm->nSocks / comm->nThreads;
+#ifdef NCCL_ENABLE_NET_PROFILING
+  void* eHandle[MAX_REQUESTS*MAX_SOCKETS] = { 0 };
+#endif
   while (1) {
     int idle = 1;
     int mark = myQueue->next; // mark newest task seen
@@ -220,13 +234,33 @@ void* persistentSocketThread(void *args_) {
       for (int j=0; j<nSocksPerThread; j++) {
         struct ncclNetSocketTask* r = myQueue->tasks+i+j;
         if (r != NULL && r->used == 1 && r->offset < r->size) {
+#ifdef NCCL_ENABLE_NET_PROFILING
+          if (!eHandle[i+j]) {
+            ncclProfilerNetSockDescr_v1_t data;
+            data.type = ncclProfileSocket;
+            data.sock.fd = r->sock->fd;
+            data.sock.op = r->op;
+            data.sock.length = r->size;
+            ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data);
+          }
+#endif
           r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
           if (r->result != ncclSuccess) {
+#ifdef NCCL_ENABLE_NET_PROFILING
+            ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL);
+            eHandle[i+j] = NULL;
+#endif
             WARN("NET/Socket : socket progress error");
             return NULL;
           }
           idle = 0;
           if (r->offset < r->size) repeat = 1;
+#ifdef NCCL_ENABLE_NET_PROFILING
+          if (repeat == 0) {
+            ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL);
+            eHandle[i+j] = NULL;
+          }
+#endif
         }
       }
     } while (repeat);
@@ -326,7 +360,7 @@ fail:
   goto exit;
 }

-ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
     return ncclInternalError;
   }
@@ -444,7 +478,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi
   return ncclInternalError;
 }

-ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) {
+ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclProfilerInfo* pInfo, int op, void* data, int size, struct ncclNetSocketTask** req) {
   int tid = comm->nextSock % comm->nThreads;
   struct ncclNetSocketThreadResources* res = comm->threadResources+tid;
   struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue;
@@ -457,6 +491,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void*
     NCCLCHECK(ncclCalloc(&queue->tasks, queue->len));
     queue->next = 0;
     res->comm = comm;
+#ifdef NCCL_ENABLE_NET_PROFILING
+    res->pInfo = pInfo;
+#endif
     pthread_mutex_init(&res->threadLock, NULL);
     pthread_cond_init(&res->threadCond, NULL);
     PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create");
@@ -520,7 +557,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
       int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
       while (chunkOffset < r->size) {
         int chunkSize = std::min(taskSize, r->size-chunkOffset);
-        NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
         chunkOffset += chunkSize;
       }
     }
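In the helper-thread path above, events are keyed by task slot (`i+j`): an event is opened lazily the first time a task makes progress and closed either on error or once the task has fully progressed. A toy model of that bookkeeping follows; all names and values are invented for illustration.

```
/* Toy model of the lazy start/stop slot table; all names invented. */
#include <stdio.h>

#define NSLOTS 3

static void* eHandle[NSLOTS];  /* one potential event per task slot */

int main(void) {
  int size[NSLOTS]   = {4, 2, 3};   /* bytes each fake task must move */
  int offset[NSLOTS] = {0, 0, 0};
  int done = 0;
  while (done < NSLOTS) {
    for (int s = 0; s < NSLOTS; s++) {
      if (offset[s] >= size[s]) continue;
      if (!eHandle[s]) {            /* lazy start, as in the diff */
        eHandle[s] = (void*)1;
        printf("slot %d: start event\n", s);
      }
      offset[s]++;                  /* stands in for ncclSocketProgress() */
      if (offset[s] == size[s]) {   /* completed: stop and clear the slot */
        eHandle[s] = NULL;
        printf("slot %d: stop event\n", s);
        done++;
      }
    }
  }
  return 0;
}
```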
@@ -544,6 +581,16 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
       }
     }
   } else { // progress request using main thread
+#ifdef NCCL_ENABLE_NET_PROFILING
+    if (!r->pInfo.eHandle) {
+      ncclProfilerNetSockDescr_v1_t data;
+      data.type = ncclProfileSocket;
+      data.sock.fd = r->ctrlSock->fd;
+      data.sock.op = r->op;
+      data.sock.length = r->size;
+      ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data);
+    }
+#endif
     if (r->offset < r->size) {
       NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
     }
@@ -551,6 +598,10 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
       if (size) *size = r->size;
       *done = 1;
       r->used = 0;
+#ifdef NCCL_ENABLE_NET_PROFILING
+      ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL);
+      r->pInfo.eHandle = NULL;
+#endif
     }
   }
 }
@@ -562,16 +613,26 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v
 }
 ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }

-ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) {
   struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
   NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request));
+#ifdef NCCL_ENABLE_NET_PROFILING
+  // NCCL core profiler callback
+  struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request;
+  req->pInfo.pHandle = phandle;
+#endif
   return ncclSuccess;
 }

-ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
   struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
   if (n != 1) return ncclInternalError;
   NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request));
+#ifdef NCCL_ENABLE_NET_PROFILING
+  // NCCL core profiler callback
+  struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request;
+  if (phandles) req->pInfo.pHandle = phandles[0];
+#endif
   return ncclSuccess;
 }
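For plugin authors, the two socket entry points above show the minimal v10 obligation: accept the extra `phandle`/`phandles` argument and either stash it for later profiler calls or ignore it. Note that `phandles` may be NULL and must be guarded. A do-nothing pair with the same shape is sketched below; the type names are simplified stand-ins, not the real plugin declarations.

```
/* Sketch: v10-shaped entry points that ignore profiling. Type names are
 * simplified stand-ins for the real NCCL plugin declarations. */
#include <stddef.h>

typedef int result_t;  /* stands in for ncclResult_t */

static result_t myIsend(void* sendComm, void* data, size_t size, int tag,
                        void* mhandle, void* phandle, void** request) {
  (void)sendComm; (void)data; (void)size; (void)tag; (void)mhandle;
  (void)phandle;       /* no profiler events: simply ignore the handle */
  *request = NULL;     /* NULL request means "call me again later" */
  return 0;
}

static result_t myIrecv(void* recvComm, int n, void** data, size_t* sizes,
                        int* tags, void** mhandles, void** phandles,
                        void** request) {
  (void)recvComm; (void)n; (void)data; (void)sizes; (void)tags; (void)mhandles;
  (void)phandles;      /* may be NULL, as the guards in net_socket.cc show */
  *request = NULL;
  return 0;
}

int main(void) {
  void* req;
  myIsend(NULL, NULL, 0, 0, NULL, NULL, &req);
  myIrecv(NULL, 1, NULL, NULL, NULL, NULL, NULL, &req);
  return 0;
}
```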
diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
index 3fe25a3..d99f7cb 100644
--- a/src/transport/nvls.cc
+++ b/src/transport/nvls.cc
@@ -108,29 +108,29 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
   return ncclSuccess;
 }

-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
-  CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size));
-  CUCHECK(cuMemUnmap(ptr, size));
-  CUCHECK(cuMemAddressFree(ptr, size));
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) {
+  CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize));
+  CUCHECK(cuMemUnmap(ptr, mcsize));
+  CUCHECK(cuMemAddressFree(ptr, mcsize));
   CUCHECK(cuMemRelease(*mcHandler));
-  INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size);
+  INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d ucsize %ld mcsize %ld", comm->rank, (void*)ptr, dev, ucsize, mcsize);
   return ncclSuccess;
 }

-ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) {
-  INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr);
+ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr, CUmemGenericAllocationHandle* ucHandle, size_t mcsize, void* mcptr, CUmemGenericAllocationHandle* mcHandle) {
+  INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) ucsize %zu MC handle 0x%llx(%p) mcsize %zd", *ucHandle, ucptr, ucsize, *mcHandle, mcptr, mcsize);
   // Release the UC memory and mapping
   if (ucptr) {
-    CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
-    CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
+    CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, ucsize));
+    CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, ucsize));
     CUCHECK(cuMemRelease(*ucHandle));
   }

   // Release the MC memory and mapping
   if (mcptr) {
-    CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
-    CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
+    CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, mcsize));
+    CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, mcsize));
     CUCHECK(cuMemRelease(*mcHandle));
   }

@@ -197,25 +197,27 @@ fail:
   goto exit;
 }

-static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) {
+static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc* desc, size_t size, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr, size_t* ucsizePtr, size_t* mcsizePtr) {
   char shareableHandle[NVLS_HANDLE_SIZE];
   CUmulticastObjectProp mcprop;
   CUmemAllocationProp ucprop;
   ncclResult_t ret = ncclSuccess;
-  size_t size = *sizePtr;
-  size_t originSize = size;
+  size_t mcsize;
+  size_t ucsize;
   size_t ucgran, mcgran;
   int allocMcHandle = 0;

+  mcsize = ucsize = size;
   *ucptr = *mcptr = NULL;
+  memset(shareableHandle, '\0', sizeof(shareableHandle));
   memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
   mcprop.numDevices = comm->localRanks;
   mcprop.handleTypes = ncclCuMemHandleType;
   mcprop.flags = 0;
   mcprop.size = size;
-  CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail);
-  ALIGN_SIZE(size, mcgran);
-  *sizePtr = mcprop.size = size;
+  CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+  ALIGN_SIZE(mcsize, mcgran);
+  mcprop.size = mcsize;

   if (comm->localRank == 0) {
     NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail);
@@ -235,26 +237,29 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
   ucprop.location.id = comm->cudaDev;
   ucprop.requestedHandleTypes = ncclCuMemHandleType;
   CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
-  // Map a VA for UC memory
-  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail);
+  ALIGN_SIZE(ucsize, ucgran);
+  // Map a VA for UC memory with MC alignment and size
+  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), ret, fail);

   // Alloc local physical mem for this NVLS group
-  CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail);
-  CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail);
-  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail);
-  CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail);
+  CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail);
+  CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail);
+  CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail);

   // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort
   NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
   // Bind physical memory to the Multicast group
   // NB: It will block until all ranks have been added to the Group
-  CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail);
+  CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail);

   // Map mc virtual address
-  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail);
-  CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail);
-  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail);
-  INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize);
+  CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail);
+  CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, mcsize, 0, *mcHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, mcsize, desc, 1), ret, fail);
+  *ucsizePtr = ucsize;
+  *mcsizePtr = mcsize;
+  INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld ucsize %ld mcsize %ld (inputsize %ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, ucsize, mcsize, size);

 exit:
   return ret;
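The allocation path now rounds the requested size twice, once to the UC allocation granularity and once to the MC granularity, and reports both results instead of overwriting the caller's size. A worked example of that arithmetic follows; the granularity values are invented, and the macro mirrors the rounding `ALIGN_SIZE` performs above.

```
/* Worked example of the two-size rounding; granularity values are invented. */
#include <stdio.h>
#include <stddef.h>

#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);

int main(void) {
  size_t size   = 5u << 20;  /* caller asks for 5 MiB */
  size_t ucgran = 2u << 20;  /* say UC granularity is 2 MiB */
  size_t mcgran = 4u << 20;  /* say MC granularity is 4 MiB */
  size_t ucsize = size, mcsize = size;
  ALIGN_SIZE(ucsize, ucgran);  /* 5 MiB -> 6 MiB */
  ALIGN_SIZE(mcsize, mcgran);  /* 5 MiB -> 8 MiB */
  printf("ucsize=%zu MiB mcsize=%zu MiB\n", ucsize >> 20, mcsize >> 20);
  return 0;
}
```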
@@ -273,6 +278,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
   size_t nvlsTotalSize = 0;
   struct ncclNvlsSharedRes* resources = NULL;
   int nChannels = -1;
+  cudaStream_t deviceStream, hostStream;

   if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess;
   // initialize after checking comm->nvlsSupport
@@ -288,10 +294,10 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
   INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize);

-  NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail);
-  resources->buffSize = nvlsTotalSize;
+  NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail);

-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail);
   for (int h = 0; h < nHeads; h++) {
     int nvlsPeer = comm->nRanks + 1 + h;
     for (int c = 0; c < nChannels; c++) {
@@ -306,15 +312,16 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
       peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize;
       peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize;

-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
     }
   }

-  NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail);
-  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail);

   // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer
   NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail);
   comm->nvlsResources->inited = true;
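The `ncclStreamWaitStream(deviceStream, hostStream, scratchEvent)` call that replaces `ncclStrongStreamWaitStream` presumably reduces to the classic event-based stream ordering. In plain CUDA runtime terms the pattern is:

```
/* The generic CUDA pattern behind "stream A waits for stream B": record an
 * event at the producer's tail, then make the consumer wait on it. Error
 * checking omitted; this illustrates the synchronization pattern and is not
 * NCCL's implementation. */
#include <cuda_runtime.h>

void streamWaitStream(cudaStream_t consumer, cudaStream_t producer,
                      cudaEvent_t scratch) {
  cudaEventRecord(scratch, producer);         /* mark producer's current tail */
  cudaStreamWaitEvent(consumer, scratch, 0);  /* consumer blocks until then   */
}
```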
@@ -374,6 +381,7 @@ setup:
   size_t memSize = 64;
   size_t creditSize = nChannels * 2 * memSize * nHeads;
   int nvlsStepSize = comm->nvlsChunkSize;
+  cudaStream_t hostStream, deviceStream;

   NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
   comm->nvlsResources->inited = false;
@@ -398,11 +406,11 @@ setup:
   resources->accessDesc.location.id = comm->cudaDev;
   resources->dev = comm->cudaDev;

-  NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail);
-  resources->creditSize = creditSize;
+  NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit, &resources->creditUCSize, &resources->creditMCSize), res, fail);

   // Set up head and tail only for now
-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail);
   for (int h = 0; h < nHeads; h++) {
     int nvlsPeer = comm->nRanks + 1 + h;
     for (int c = 0; c < nChannels; c++) {
@@ -440,14 +448,15 @@ setup:
       peer->send[0].conn.stepSize = nvlsStepSize;
       peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;

-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail);
     }
   }

-  NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail);
-  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail);
+  NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail);
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail);
 }

 // MNNVL does not support NVLS buffer registration
@@ -488,13 +497,13 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));

   if (resources->ucCredit || resources->mcCredit) {
-    NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
-    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
+    NCCLCHECK(nvlsGroupUnbind(comm, resources->creditUCSize, &resources->mcCreditHandle));
+    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditUCSize, resources->ucCredit, &resources->ucCreditHandle, resources->creditMCSize, resources->mcCredit, &resources->mcCreditHandle));
   }

   if (comm->nvlsResources->inited) {
-    NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle));
-    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle));
+    NCCLCHECK(nvlsGroupUnbind(comm, resources->buffUCSize, &resources->mcBuffHandle));
+    NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffUCSize, resources->ucBuff, &resources->ucBuffHandle, resources->buffMCSize, resources->mcBuff, &resources->mcBuffHandle));
   }

   free(resources);
   comm->nvlsResources = NULL;
@@ -513,7 +522,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   size_t minSize = SIZE_MAX;
   struct localRegData* regData = NULL;
   cudaPointerAttributes attr;
-  size_t ucgran, mcgran;
+  size_t ucgran, mcgran, ucsize, mcsize;

   NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);

@@ -538,13 +547,12 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
     CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
     CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);

-    if (regSize % mcgran == 0) {
-      regRecord->regSize = regSize;
-    } else {
-      regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr);
-    }
-
-    if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) {
+    if (regRecord->addr % ucgran == 0) {
+      if (regSize % ucgran != 0) {
+        regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran);
+      } else {
+        regRecord->regUCSize = regSize;
+      }
       regRecord->state |= NVLS_REG_POSSIBLE;
       memcpy(&regData[comm->localRank].reg, regRecord, sizeof(struct ncclReg));
       regData[comm->localRank].offset = userBuff - regRecord->addr;
@@ -564,13 +572,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
       goto fail;
     }
     /* get minimal reg size of nvls buffers */
-    if (minSize > regData[i].reg.regSize)
-      minSize = regData[i].reg.regSize;
+    if (minSize > regData[i].reg.regUCSize)
+      minSize = regData[i].reg.regUCSize;
   }

   /* start registration */
+  mcsize = ucsize = minSize;
   mcprop.size = minSize;
   CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+  ALIGN_SIZE(mcsize, mcgran);
+  mcprop.size = mcsize;
+
   if (comm->localRank == 0) {
     NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail);
     NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
@@ -583,16 +595,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   // Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked
   // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
   // coverity[var_deref_op]
-  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
+  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail);

   // Create a VA for the NVLS
-  CUCHECKGOTO(cuMemAddressReserve(&regPtr, minSize, mcgran, 0U, 0), ret, fail);
+  CUCHECKGOTO(cuMemAddressReserve(&regPtr, mcsize, mcgran, 0U, 0), ret, fail);

   // Map the VA locally
-  CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail);
-  CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
+  CUCHECKGOTO(cuMemMap(regPtr, mcsize, 0, mcHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess(regPtr, mcsize, &comm->nvlsResources->accessDesc, 1), ret, fail);

   regRecord->regAddr = regPtr;
-  regRecord->regSize = minSize;
+  regRecord->regUCSize = ucsize;
+  regRecord->regMCSize = mcsize;
   regRecord->dev = comm->nvlsResources->dev;
   regRecord->mcHandle = mcHandle;
   regRecord->state |= NVLS_REG_COMPLETE;
@@ -706,7 +719,7 @@ exit:
   return ncclSuccess;
 fail:
   regBufUsed = 0;
-  WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize);
+  INFO(NCCL_REG, "rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize);
   goto exit;
 }

@@ -843,7 +856,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
   return ncclSuccess;
 }

-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) {
   return ncclSuccess;
 }
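Registration-side sizing in `tryRegisterBuffer` follows the same two-size scheme: each local rank reports a UC-aligned size, the group takes the minimum, binds the multicast object over that UC size, and reserves and maps the VA over the MC-aligned size. A small worked example with invented numbers:

```
/* Worked example: min-reduce the per-rank UC sizes, then MC-align the result.
 * All values are invented for illustration. */
#include <stdio.h>
#include <stddef.h>

#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);

int main(void) {
  size_t regUCSize[3] = {6u << 20, 4u << 20, 8u << 20}; /* per-rank UC sizes */
  size_t minSize = (size_t)-1;
  for (int i = 0; i < 3; i++)
    if (regUCSize[i] < minSize) minSize = regUCSize[i];  /* -> 4 MiB */
  size_t mcgran = 3u << 20;            /* example MC granularity */
  size_t ucsize = minSize;             /* used for cuMulticastBindAddr */
  size_t mcsize = minSize;
  ALIGN_SIZE(mcsize, mcgran);          /* -> 6 MiB, used for reserve/map */
  printf("ucsize=%zu MiB mcsize=%zu MiB\n", ucsize >> 20, mcsize >> 20);
  return 0;
}
```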
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index dac7621..aed84c5 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -407,6 +407,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
          comm->peerInfo[intermediateRank].nvmlDev, useReadStr);
   }

+  memset(&req, '\0', sizeof(req));
   req.size = sendSize;
   req.refcount = 0;
   if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
@@ -466,6 +467,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     info->rank = intermediateRank;
   }

+  memset(&req, '\0', sizeof(req));
   req.size = recvSize;
   req.refcount = 0;
   if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
@@ -527,7 +529,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn

   if (useMemcpy) {
     // Attach to peer's SHM segment
-    NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));
+    NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));

     recv->conn.tail = &resources->devShm->recvMem.tail;
     recv->conn.head = &resources->devShm->sendMem.head;
@@ -634,7 +636,7 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
     // Create a SHM segment for the peer to attach to
     shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
-    NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
+    NCCLCHECK(ncclShmAllocateShareableBuffer(shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
     NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));

     memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
@@ -805,7 +807,7 @@ static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size
   ncclResult_t ret = ncclSuccess;
   struct ncclIpcRegInfo* newInfo = NULL;
   uintptr_t* peerRmtAddrs = NULL;
-  bool legacyIpcCap = false;
+  int legacyIpcCap = 0;
   size_t baseSize = 0;
   void* baseAddr = NULL;
   bool needUpdate = false;
@@ -916,13 +918,16 @@ ncclResult_t ret = ncclSuccess;
   if (type == NCCL_IPC_COLLECTIVE) {
     // for collective, store registered remote buffers into dev memory for future reference
     if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
-      NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+      cudaStream_t hostStream, deviceStream;
+      NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
+      NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
       if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
-        NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+        NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
       if (needUpdate)
-        NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
-      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
+        NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
+      NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), ret, fail);
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail);
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), ret, fail);
     }
     peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
   } else {
@@ -941,7 +946,7 @@ fail:
   *offsetOut = 0;
   *peerRmtAddrsOut = NULL;
   if (newInfo) free(newInfo);
-  WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc);
+  INFO(NCCL_REG, "rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %d type %s", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc ? *isLegacyIpc : -1, ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR ? "POSIX_FD" : "FABRIC");
   goto exit;
 }
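The shm signature changes threading through p2p.cc (allocation no longer takes a proxy rank, import now does) work because the exporter's rank travels inside the connect info, as the shm.cc hunks further below show. A toy version of that handshake, with all names invented:

```
/* Toy version of the flow: the exporter stamps its rank into the connect
 * info; the importer passes that rank to the import call so the fd
 * conversion request reaches the right proxy. All names invented. */
#include <stdio.h>

struct toyConnectInfo { int rank; int handle; };

static void exporterSetup(struct toyConnectInfo* info, int myRank) {
  info->rank = myRank;    /* mirrors `info->rank = comm->rank;` in shm.cc */
  info->handle = 42;      /* stands in for the shareable buffer descriptor */
}

static void importerConnect(const struct toyConnectInfo* info) {
  /* mirrors ncclShmImportShareableBuffer(comm, info->rank, &info->desc, ...) */
  printf("import handle %d via proxy of rank %d\n", info->handle, info->rank);
}

int main(void) {
  struct toyConnectInfo info;
  exporterSetup(&info, 3);
  importerConnect(&info);
  return 0;
}
```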
diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc
new file mode 100644
index 0000000..3e32843
--- /dev/null
+++ b/src/transport/profiler.cc
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "transport.h"
+#include "proxy.h"
+#include "profiler.h"
+
+static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  connection->proxyAppendPtr = &connection->proxyAppend;
+  connection->shared = 1;
+  return ncclSuccess;
+}
+
+// The following ncclProxySubArgs are overloaded by the profiler progress function:
+// - base       : is set to the current value of workCounter[channelId]
+// - posted     : is set to sub->nsteps to indicate that the profiler has started the event
+// - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event
+static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
+  if (args->state == ncclProxyOpReady) {
+    for (int s = 0; s < args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs + s;
+      sub->base = sub->workCounter;
+      sub->posted = sub->transmitted = 0;
+    }
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    for (int s = 0; s < args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs + s;
+      uint64_t* workStarted = (uint64_t *)sub->sendbuff;
+      uint64_t* workCompleted = (uint64_t *)sub->recvbuff;
+      if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) {
+        ncclProfilerStartKernelChEvent(args, s);
+        sub->posted = sub->nsteps;
+        continue; // allow events on every channel to start
+      }
+      if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) {
+        ncclProfilerStopKernelChEvent(args, s);
+        sub->transmitted = sub->nsteps;
+        args->done++;
+      }
+    }
+    if (args->done == args->nsubs) args->state = ncclProxyOpNone;
+  }
+  return ncclSuccess;
+}
+
+struct ncclTransport profilerTransport = {
+  "Prof",
+  NULL,
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL }
+};
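`profilerProxyProgress` above reads two monotonic per-channel counters that the kernel advances (`workStarted` via `sub->sendbuff` and `workCompleted` via `sub->recvbuff`) and fires the start and stop of a kernel-channel event once each counter catches up with the `base` snapshot taken when the op became ready. A reduced model of that polling loop, with invented values:

```
/* Reduced model of profilerProxyProgress: two monotonically increasing
 * per-channel counters written by the kernel (started/completed), polled
 * against the baseline captured when the op became ready. Values invented. */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  uint64_t base = 7;                 /* sub->base = workCounter at ready time */
  uint64_t workStarted = 6, workCompleted = 5;
  int posted = 0, transmitted = 0;
  while (!transmitted) {
    workStarted++; workCompleted++;  /* the kernel makes progress */
    if (!posted && base <= workStarted) { posted = 1; printf("start event\n"); }
    if (!transmitted && base <= workCompleted) { transmitted = 1; printf("stop event\n"); }
  }
  return 0;
}
```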
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index d2d6906..aa3e6c4 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -18,6 +18,7 @@ struct shmBuffInfo {
 };

 struct shmConnectInfo {
+  int rank;
   ncclShmIpcDesc_t desc;
   struct shmBuffInfo buf;
 };
@@ -120,6 +121,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn));
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
+  info->rank = comm->rank;

   resources->hostMem = (struct ncclSendMem*)info->buf.hptr;
   resources->devHostMem = (struct ncclSendMem*)info->buf.dptr;
@@ -150,6 +152,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn));
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
+  info->rank = comm->rank;

   resources->hostMem = (struct ncclRecvMem*)info->buf.hptr;
   resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr;
@@ -163,7 +166,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
   struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
   char* buff;

-  NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));

   buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ ... @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
-  NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));

   buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ ... @@ static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, st
-  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
   memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
   connection->transportResources = proxyInfo;
 exit:
@@ -485,7 +488,7 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st

   struct shmProxyInfo* proxyInfo;
   NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
   memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
   connection->transportResources = proxyInfo;
 exit:
@@ -517,9 +520,9 @@ static void initCeOperation() {
   }
 }

-ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
-  if (desc == NULL || hptr == NULL || tpProxyRank < -1) {
-    WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank);
+ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
+  if (desc == NULL || hptr == NULL) {
+    WARN("Invalid argument desc %p, hptr %p", desc, hptr);
     return ncclInvalidArgument;
   }
 #if CUDART_VERSION >= 12020
@@ -532,7 +535,6 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
     if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
       // Return the native cuMem handle for later Export/Import via UDS
       memcpy(&desc->shmci.data, &handle, sizeof(handle));
-      desc->shmci.tpProxyRank = tpProxyRank;
     } else {
       CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0));
     }
@@ -560,7 +562,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
   return ncclSuccess;
 }

-ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
   if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) {
     WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut);
     return ncclInvalidArgument;
@@ -584,7 +586,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_
     // UDS fd support
     int fd = -1;
     // Send cuMem handle to remote for conversion to an fd
-    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd));
+    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, proxyRank, &desc->shmci.data, &fd));
     CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
     (void) close(fd);
   } else {
@@ -625,7 +627,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_
     descOut->shmci.ptr = *hptr = (void *)hostptr;
     descOut->legacy = false;
     if (dptr) *dptr = (void *)hostptr;
-    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
+    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from proxyRank %d size %zi ptr %p, granularity %ld", proxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
   } else {
     char shmPath[SHM_PATH_MAX];
     snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);