Sylvain Jeaugey 6aae379278 2.24.3-1
Network user buffer support for collectives
 * Leverage user buffer registration to achieve zero-copy
   inter-node communications for Ring, NVLS and Collnet

Add RAS subsystem
 * Create a RAS thread keeping track of all NCCL communicators.
 * Add a ncclras tool contacting the RAS thread and getting a
   report.

Add fp8 support
 * Add support for e5m2 and e4m3 8-bit floating point operations.
 * Use Tree/PAT algorithms when possible for better numerical
   stability.

Add NIC fusion
 * Add a NET API to ask the network plugin to fuse a set of
   interfaces together.
 * Fuse multiple NICs under the same PCI switch as a single,
   larger NIC.

Socket connection failure retry
 * Retry in case of socket connection failure (unreachable host).
 * Avoid "Software caused connection abort" errors on retries.

QP connection failure retry
 * Retry in case of IB QP connection failure during ibv_modify_qp.

NET API improvements
 * Allow plugins to force a flush in case data and completion
   ordering is not guaranteed.
 * Indicate when completion is not needed (e.g. for the LL128
   protocol), allowing plugins to skip generating a completion.
 * Allow for full offload of allgather operations when using one
   GPU per node.

NCCL_ALGO/NCCL_PROTO strict enforcement
 * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify
   ALGO/PROTO filters for each collective operation.
 * Strictly enforce the ALGO/PROTO filters, no longer fall back
   on the ring algorithm when the filtering leaves no option and
   error out instead.

Enable CUMEM host allocations
 * Use cumem functions for host memory allocation by default.

Improved profiler plugin API
 * Avoid dependencies on NCCL includes.
 * Add information on whether the buffer is registered or not.

Adjust PAT tuning
 * Improve transition between PAT and ring at scale.

Fix hangs when running with different CPU architectures
 * Detect when we use a mix of GPU architectures
 * Ensure Algo/Proto decisions are made based on that unified
   state.

Fix FD leak in UDS
 * Fix a leak when mapping buffers intra-node with cumem IPCs.

Fix crash when mixing buffer registration and graph buffer registration.
 * Separate local and graph registration to avoid crashes when we free
   buffers.

Fix user buffer registration with dmabuf
 * Make ncclSend/ncclRecv communication with buffer registration functional
   on network plugins relying on dmabuf for buffer registration.

Fix crash in IB code caused by uninitialized fields.

Fix non-blocking ncclSend/ncclRecv
 * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking
   mode even though the operation was not enqueued onto the stream.
 * Issue #1495

Various compiler tweaks and fixes
 * PR #758

Fix typo in ncclTopoPrintGraph
 * Issue #1468
2025-01-07 02:01:15 -08:00

167 lines
5.9 KiB
C

/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef EVENT_H_
#define EVENT_H_
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include "profiler.h"
// Capacity limits for the fixed-size event arrays declared in this header.
#define MAX_CHANNELS 32 // per-channel dimension of the proxy operation arrays
#define MAX_STEPS 16 // proxyStep entries stored in each proxyOp
#define MAX_OPS 16 // Up to 64K ranks for PAT

// First enumerator of each proxy state range. The ncclProfilerProxy* names are
// not defined in this header (presumably they come from profiler.h).
#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted)
#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted)
#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait)
#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait)

// Number of states in each range, computed as (last - first + 1).
// NOTE(review): this assumes the enumerators of a range are contiguous and
// ascending from first to last -- confirm against the enum definition.
#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1)
#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1)
#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1)
#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1)

// Convert a state value into a zero-based index relative to the first state of
// its range. The argument is parenthesized so that expressions containing
// operators of lower precedence than '-' (for example a conditional
// expression) are evaluated as a whole before the offset is subtracted.
#define PROXY_OP_SEND_STATE_IDX(state) ((state) - PROXY_OP_SEND_STATE_OFFSET)
#define PROXY_OP_RECV_STATE_IDX(state) ((state) - PROXY_OP_RECV_STATE_OFFSET)
#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET)
#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET)

// Larger of the send/recv state counts; used to size the per-state arrays so
// that one array can hold either a send or a recv sequence.
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)

#define MAX_COMM_CLIQUES (32 * 8)
// Forward declaration: proxyStep points back to the proxyOp that contains it.
struct proxyOp;
// Event record for a single network transfer (step). Instances are stored in
// the step[] array of struct proxyOp.
struct proxyStep {
uint8_t type; // type of event: network transfer
int step; // network transfer id in given channel
int isSend; // send/recv channel operation
// One timestamp slot per step state; sized for the larger of the send and
// recv state ranges. Presumably indexed with PROXY_STEP_SEND_STATE_IDX /
// PROXY_STEP_RECV_STATE_IDX -- confirm in the implementation.
double timestamp[MAX_PROXY_STEP_STATES];
double startTs; // event start timestamp (time unit not visible in this header)
double stopTs; // event stop timestamp
struct proxyOp* parent; // parent proxy operation event
};
// Event record for one proxy operation on a channel. It embeds the records of
// up to MAX_STEPS network transfers and points to the task-level (collective
// or p2p) event it belongs to.
struct proxyOp {
uint8_t type; // type of event: proxy operation
uint8_t channelId; // channel id for this proxy operation
pid_t pid; // process id associated with this event (which process is not shown here)
int rank; // rank associated with this proxy operation
int peer; // peer rank for this proxy operation
int nSteps; // total number of network transfers for this proxy operation
int chunkSize; // chunk size for this proxy operation
int isSend; // send/recv channel operation
size_t transSize; // transfer data size for this proxy operation
// Per-state bookkeeping; sized for the larger of the send and recv state
// ranges. Presumably indexed with PROXY_OP_SEND_STATE_IDX /
// PROXY_OP_RECV_STATE_IDX -- confirm in the implementation.
struct {
int steps; // completed steps for this proxy operation state
double timestamp; // timestamp recorded for this state
} states[MAX_PROXY_OP_STATES];
double startTs; // event start timestamp
double stopTs; // event stop timestamp
int stepCount; // last processed network operation for this proxy operation
struct proxyStep step[MAX_STEPS]; // array of network transfer events
struct taskEventBase* parent; // parent event p2p/collective
};
// Forward declarations for types that are referenced by pointer before their
// full definitions appear further down in this header.
struct group;
struct context;
// Event record for a proxy control event. The meaning of the state values is
// defined outside this header.
struct proxyCtrl {
uint8_t type; // event type tag
struct context* ctx; // profiler context
double startTs; // event start timestamp
double stopTs; // event stop timestamp
int state; // current state of this control event
int appended; // appended proxy operations
};
// task level event base structure
// Embedded as the first member (`base`) of struct collective and struct p2p,
// and used as the element type of a group's task event queue (linked through
// `next`).
struct taskEventBase {
uint8_t type; // event type: collective/p2p
int rank; // rank of the operation in NCCL communicator
const char* name; // FIXME: unused
uint64_t commHash; // communicator identifier
const char* func; // ncclFunc*
int refCount; // number of references for this operation
struct group* parent; // parent event group
struct taskEventBase* next; // next top level event in group
double startTs; // event start timestamp
double stopTs; // event stop timestamp
};
// Task-level event record for a collective operation. It embeds the proxy
// operation events for the collective in fixed-size arrays whose first
// dimension is MAX_CHANNELS and second dimension is MAX_OPS.
struct collective {
struct taskEventBase base; // base structure for this event
uint64_t seqNumber; // sequence number for this collective in communicator
void const* sendBuff; // send buffer pointer of the collective
void* recvBuff; // receive buffer pointer of the collective
size_t count; // element count (unit not visible in this header)
size_t trafficBytes; // traffic size in bytes
int root; // root rank of the collective
const char* datatype; // datatype, stored as a string
uint8_t nMaxChannels; // channel count for this collective (exact meaning defined by the producer of the event)
const char* algo; // algorithm, stored as a string
const char* proto; // protocol, stored as a string
int nWarps; // warp count reported for this collective
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
// One counter per channel; presumably the number of proxy ops recorded for
// that channel -- confirm in the implementation.
int nProxyOps[MAX_CHANNELS];
};
// Task-level event record for a point-to-point operation. It holds one proxy
// operation slot per channel (MAX_CHANNELS entries).
struct p2p {
struct taskEventBase base; // base structure for this event
uint8_t func; // operation code (encoding defined outside this header)
void const* buff; // data buffer pointer of the operation
size_t count; // element count (unit not visible in this header)
const char* datatype; // datatype, stored as a string
int peer; // peer rank of the operation
struct proxyOp op[MAX_CHANNELS]; // proxy operation events, one slot per channel
};
// Group event record. It owns a queue of task-level events (eventHead /
// eventTail, chained through taskEventBase::next) and can itself be chained to
// another group through `next`.
struct group {
uint8_t type; // event type tag
struct context* ctx; // profiler context
int groupId; // identifier of this group
int refCount; // number of references to this group
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs; // event start timestamp
double stopTs; // event stop timestamp
struct group* next; // next group event in queue
};
// arrays for different event objects
// Profiler context: one pool per event kind (group, collective, p2p, proxy
// control). Each pool is described by a pointer to its storage plus Size, Base
// and Index integers.
// NOTE(review): the exact roles of Base and Index (for example, window start
// and next free slot) are not visible in this header -- confirm in the
// implementation.
struct context {
// group event pool
int groupPoolSize;
int groupPoolBase;
int groupPoolIndex;
struct group* groupPool;
// collective event pool
int collPoolSize;
int collPoolBase;
int collPoolIndex;
struct collective* collPool;
// p2p event pool
int p2pPoolSize;
int p2pPoolBase;
int p2pPoolIndex;
struct p2p* p2pPool;
// proxy control event pool
int proxyCtrlPoolSize;
int proxyCtrlPoolBase;
int proxyCtrlPoolIndex;
struct proxyCtrl* proxyCtrlPool;
};
// Task event queue helpers operating on a group. The definitions are not in
// this header; the descriptions below are inferred from the names and from the
// eventHead/eventTail members of struct group -- confirm in the implementation.

// Presumably returns non-zero when g has no queued task events.
int taskEventQueueEmpty(struct group* g);
// Presumably appends event to the task event queue of g.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
// Presumably returns the first queued task event of g without removing it.
struct taskEventBase* taskEventQueueHead(struct group* g);
// Presumably removes and returns the first queued task event of g.
struct taskEventBase* taskEventQueueDequeue(struct group* g);
#endif