Add local user buffer registration for NVLink SHARP. Add tuning plugin support. Increase net API to v7 to allow for device-side packet reordering; remove support for v4 plugins. Add support for RoCE ECE. Add support for C2C links. Better detect SHM allocation failures to avoid crash with Bus Error. Fix missing thread unlocks in bootstrap (Fixes #936). Disable network flush by default on H100. Move device code from src/collectives/device to src/device.
69 lines
3.4 KiB
C
69 lines
3.4 KiB
C
/*
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */
|
|
|
|
#ifndef NCCL_NET_V6_H_
|
|
#define NCCL_NET_V6_H_
|
|
|
|
/*
 * Per-device properties record of the v6 network API. This is the type
 * taken by the getProperties entry of ncclNet_v6_t.
 */
typedef struct {
  char* name;      // Used mostly for logging.
  char* pciPath;   // Path to the PCI device in /sys.
  uint64_t guid;   // Unique identifier for the NIC chip. Important for
                   // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport;  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;       // Port speed in Mbps.
  int port;        // Port number.
  float latency;   // Network latency
  int maxComms;    // Maximum number of comms we can create
  int maxRecvs;    // Maximum number of grouped receives.
} ncclNetProperties_v6_t;
|
|
|
|
typedef struct {
|
|
// Name of the network (mainly for logs)
|
|
const char* name;
|
|
// Initialize the network.
|
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
|
// Return the number of adapters.
|
|
ncclResult_t (*devices)(int* ndev);
|
|
// Get various device properties.
|
|
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
|
// Create a receiving object and provide a handle to connect to it. The
|
|
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
|
// between ranks to create a connection.
|
|
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
|
// Connect to a handle and return a sending comm object for that peer.
|
|
// This call must not block for the connection to be established, and instead
|
|
// should return successfully with sendComm == NULL with the expectation that
|
|
// it will be called again until sendComm != NULL.
|
|
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
|
// Finalize connection establishment after remote peer has called connect.
|
|
// This call must not block for the connection to be established, and instead
|
|
// should return successfully with recvComm == NULL with the expectation that
|
|
// it will be called again until recvComm != NULL.
|
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
|
/* DMA-BUF support */
|
|
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
|
// Asynchronous send to a peer.
|
|
// May return request == NULL if the call cannot be performed (or would block)
|
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
|
// Asynchronous recv from a peer.
|
|
// May return request == NULL if the call cannot be performed (or would block)
|
|
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
|
// visible to the GPU
|
|
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
|
// Test whether a request is complete. If size is not NULL, it returns the
|
|
// number of bytes sent/received.
|
|
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
|
// Close and free send/recv comm objects
|
|
ncclResult_t (*closeSend)(void* sendComm);
|
|
ncclResult_t (*closeRecv)(void* recvComm);
|
|
ncclResult_t (*closeListen)(void* listenComm);
|
|
} ncclNet_v6_t;
|
|
|
|
#endif // end include guard
|