/*
 * Release notes for this version:
 * Optimization for Tree allreduce on A100. Improve aggregation performance.
 * Use shared buffers for inter-node send/recv. Add NVTX profiling hooks.
 * Accelerate alltoall connections by merging communication for all channels.
 * Add support for one hop communication through NVLink, for faster send/recv
 * communication on cubemesh topologies like DGX-1. Improve alltoall
 * scheduling to better balance intra/inter node communication. Increase
 * send/recv parallelism by 8x, each warp sending or receiving to a different
 * peer. Net: move to v4. Net: make flush operation asynchronous to accelerate
 * alltoall. Net: define maximum number of requests. Fix hang when using LL128
 * protocol after 2^31 steps. Fix #379 : topology injection failing when using
 * less GPUs than described in the XML. Fix #394 : protocol mismatch causing
 * hangs or crashes when using one GPU per node.
 */
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_

#include "transport.h"
#include "p2p.h"
// cudaLaunchParams was introduced in CUDA 9.0 (for cooperative multi-device
// launches). Provide a layout-compatible definition when building against
// older toolkits so the rest of the code can use a single launch path.
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
  void *func;           // device function (kernel) entry point
  dim3 gridDim;         // grid dimensions for the launch
  dim3 blockDim;        // block dimensions for the launch
  void **args;          // array of pointers to the kernel arguments
  size_t sharedMem;     // dynamic shared memory size in bytes
  cudaStream_t stream;  // stream on which to launch
};
#endif
// Padding granularity used below to keep hot fields on separate cache lines
// (see ncclSendMem/ncclRecvMem).
#define CACHE_LINE_SIZE 128
// Alignment granularity for the control region of the send/recv structs.
#define MEM_ALIGN 4096
// 2 MiB -- presumably the minimum allocation size for CUDA IPC sharing;
// TODO(review): confirm against the allocation code that uses it.
#define CUDA_IPC_MIN 2097152UL

// Channels / LL tuning
// Per-protocol thread thresholds; presumably feed ncclComm::threadThresholds
// (indexed by algorithm x protocol) -- confirm against tuning code.
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
// Sender-side connection memory shared with a peer. The anonymous union
// reserves a full MEM_ALIGN-sized control region regardless of the actual
// control-field sizes, and the explicit pads keep `head` and `ptrExchange`
// on separate CACHE_LINE_SIZE lines (avoids false sharing between them).
struct ncclSendMem {
  union {
    struct {
      // Progress counter advanced by the consumer of this connection
      // (NOTE(review): exact producer/consumer roles inferred -- confirm).
      uint64_t head;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
      // Slot used to exchange a buffer pointer with the peer
      // (e.g. for direct peer-to-peer access) -- confirm against transport code.
      void* ptrExchange;
      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
    };
    // Forces the control region to occupy exactly MEM_ALIGN bytes so that
    // `buff` starts at a fixed, aligned offset.
    char pad3[MEM_ALIGN];
  };
  char buff[1]; // Actually larger than that
};
// Receiver-side connection memory shared with a peer. Mirrors ncclSendMem:
// a MEM_ALIGN-sized control region followed by the data buffer, with `tail`
// padded onto its own cache line to avoid false sharing with the FIFOs.
struct ncclRecvMem {
  union {
    struct {
      // Progress counter advanced by the producer of this connection
      // (NOTE(review): exact producer/consumer roles inferred -- confirm).
      uint64_t tail;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
      // Per-step FIFOs, one entry per outstanding step (NCCL_STEPS):
      // transfer sizes and buffer pointers exchanged with the proxy/peer.
      int sizesFifo[NCCL_STEPS];
      void* ptrsFifo[NCCL_STEPS];
    };
    // Forces the control region to occupy exactly MEM_ALIGN bytes so that
    // `buff` starts at a fixed, aligned offset.
    char pad4[MEM_ALIGN];
  };
  char buff[1]; // Actually larger than that
};
// Host-side state for one communicator on one rank. The device-visible
// portion lives in `devComm` (device copy) / `hostDevComm` (host mirror).
struct ncclComm {
  // Communication channels (rings/trees); only the first nChannels /
  // p2pnChannels entries are in use.
  struct ncclChannel channels[MAXCHANNELS];

  struct ncclPeerInfo* peerInfo;  // per-rank info gathered at init time
  struct ncclTopoSystem* topo;    // detected hardware topology

  void* bootstrap;  // opaque bootstrap-network state used during setup
  // Bitmasks for ncclTransportP2pSetup
  int connect;
  uint32_t* connectSend;
  uint32_t* connectRecv;

  int rank;      // my rank in the communicator
  int nRanks;    // number of GPUs in communicator
  int cudaDev;   // my cuda device index
  int64_t busId; // my PCI bus ID in int format

  // Inter-node layout of the communicator.
  int node;        // index of the node this rank belongs to
  int nNodes;      // total number of nodes
  int localRanks;  // number of ranks on this node

  // Kernel launch coordination for this rank's operations.
  enum { GROUP, PARALLEL } launchMode;
  cudaStream_t userStream;  // stream supplied by the caller for the current op
  bool userStreamSet;       // whether userStream has been captured for this launch
  cudaEvent_t doneEvent;    // event used to order NCCL work with user streams
                            // (NOTE(review): exact usage inferred -- confirm)
  bool checkPointers;       // enable extra validation of user buffers

  // Counter to make sure collectives match (needed for bcast/reduce
  // where syncs are not symmetric).
  uint64_t opCount;
  uint64_t lastOpCount;

  // Channels for collectives
  int nChannels;
  // Channels (per peer) for p2p
  int p2pnChannels;
  int p2pnChannelsPerPeer;
  int p2pChannels[MAXCHANNELS];

  // Buffer sizes, one per protocol.
  int buffSizes[NCCL_NUM_PROTOCOLS];

  // Algorithm/Protocols thresholds
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

  // An internal CUDA stream for NCCL kernel CGMD launches
  int groupCudaStream;
  cudaStream_t groupStream;

  // Whether there has been a fatal error in this communicator.
  ncclResult_t fatalError;

  // Flag to ask NCCL kernels to abort
  volatile uint32_t *abortFlag;

  // Device side of the communicator
  struct ncclDevComm *devComm;
  // Host copy of the devComm (to free CUDA allocs)
  struct ncclDevComm hostDevComm;

  // Intra-process sync (ranks sharing this process).
  int intraRank;      // my rank among ranks in this process
  int intraRanks;     // number of ranks in this process
  int* intraBarrier;  // shared barrier state for intra-process sync
  int intraPhase;     // current barrier phase for this rank

  // Storage for deferred intra-process launch
  struct cudaLaunchParams * intraParams;  // launch params for all intra ranks
  struct cudaLaunchParams *myParams;      // this rank's entry in intraParams
  int* intraCudaDevs;                     // CUDA device of each intra rank
  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
  struct ncclWorkElem args;  // kernel argument staging for the deferred launch
  void* argsptr;             // pointer handed to the launch (points at args)
                             // NOTE(review): inferred -- confirm against launch code

  // Global proxy thread
  pthread_t proxyThread;
  struct ncclProxyState proxyState;

  // Whether this communicator uses collNet
  int collNetSupport;

  // Store info of async operations
  struct ncclInfo* asyncOps;  // queued async collective descriptors
  int asyncOpCount;           // number of queued entries in asyncOps
  size_t asyncTotalSize;      // total bytes across queued async ops

  // List of async p2p operations queued under group semantics.
  struct ncclP2Plist* p2pSends;
  struct ncclP2Plist* p2pRecvs;
  int p2pSendCount;
  int p2pRecvCount;
};

#endif