nccl/src/include/comm.h

/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_

#include "transport.h"
#include "p2p.h"

// cudaLaunchParams is only defined by CUDA 9.0 and later; provide a
// compatible definition when building against older toolkits.
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
  void *func;
  dim3 gridDim;
  dim3 blockDim;
  void **args;
  size_t sharedMem;
  cudaStream_t stream;
};
#endif
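
// This parameter block mirrors what CUDA 9's cooperative multi-device
// launch consumes. A minimal sketch of filling one in (kernelFn,
// kernelArgs, nThreads and stream are hypothetical names, not NCCL's):
//
//   struct cudaLaunchParams params;
//   params.func = (void*)kernelFn;
//   params.gridDim = dim3(1);
//   params.blockDim = dim3(nThreads);
//   params.args = kernelArgs;   // array of pointers to kernel arguments
//   params.sharedMem = 0;
//   params.stream = stream;
//   cudaLaunchCooperativeKernelMultiDevice(&params, 1, 0);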

#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL

// Channels / LL tuning
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64

struct ncclSendMem {
  union {
    struct {
      uint64_t head;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
      void* ptrExchange;
      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
    };
    char pad3[MEM_ALIGN];
  };
  char buff[1]; // Actually larger than that
};

struct ncclRecvMem {
  union {
    struct {
      uint64_t tail;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
      int sizesFifo[NCCL_STEPS];
      void* ptrsFifo[NCCL_STEPS];
    };
    char pad4[MEM_ALIGN];
  };
  char buff[1]; // Actually larger than that
};
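
// Both structs end in a one-byte buff member but are allocated with extra
// trailing space, so buff really addresses the full communication buffer
// (the classic C "struct hack"). The pad fields keep the head/tail
// counters on separate cache lines so producer and consumer do not
// false-share. A minimal host-side sketch of the allocation idiom
// (bufferBytes is a hypothetical size, not the one NCCL computes):
//
//   size_t total = offsetof(struct ncclRecvMem, buff) + bufferBytes;
//   struct ncclRecvMem* mem = (struct ncclRecvMem*)calloc(1, total);
//   char* data = mem->buff;   // valid for bufferBytes bytes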

struct ncclComm {
  struct ncclChannel channels[MAXCHANNELS];

  struct ncclPeerInfo* peerInfo;
  struct ncclTopoSystem* topo;

  void* bootstrap;

  // Bitmasks for ncclTransportP2pSetup
  int connect;
  uint32_t* connectSend;
  uint32_t* connectRecv;

  int rank;      // my rank in the communicator
  int nRanks;    // number of GPUs in communicator
  int cudaDev;   // my cuda device index
  int64_t busId; // my PCI bus ID in int format

  int node;
  int nNodes;
  int localRanks;

  enum { GROUP, PARALLEL } launchMode;
  cudaStream_t userStream;
  bool userStreamSet;
  cudaEvent_t doneEvent;
  bool checkPointers;

  // Counter to make sure collectives match (needed for bcast/reduce
  // where syncs are not symmetric).
  uint64_t opCount;
  uint64_t lastOpCount;

  // Channels for collectives
  int nChannels;
  // Channels (per peer) for p2p
  int p2pnChannels;
  int p2pnChannelsPerPeer;
  int p2pChannels[MAXCHANNELS];

  // Buffer sizes
  int buffSizes[NCCL_NUM_PROTOCOLS];

  // Algorithm/Protocols thresholds
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

  // An internal CUDA stream for NCCL kernel CGMD launches
  int groupCudaStream;
  cudaStream_t groupStream;

  // Whether there has been a fatal error in this communicator.
  ncclResult_t fatalError;

  // Flag to ask NCCL kernels to abort
  volatile uint32_t *abortFlag;

  // Device side of the communicator
  struct ncclDevComm *devComm;
  // Host copy of the devComm (to free CUDA allocs)
  struct ncclDevComm hostDevComm;

  // Intra-process sync
  int intraRank;
  int intraRanks;
  int* intraBarrier;
  int intraPhase;

  // Storage for deferred intra-process launch
  struct cudaLaunchParams* intraParams;
  struct cudaLaunchParams* myParams;
  int* intraCudaDevs;
  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
  int* intraCC;     // Only used to check that all ranks have the same
                    // compute capability, and to disable CGMode if not
  struct ncclWorkElem args;
  void* argsptr;

  // Global proxy thread
  pthread_t proxyThread;
  struct ncclProxyState proxyState;

  // Whether this communicator uses collNet
  int collNetSupport;

  // Store info of async operations
  struct ncclInfo* asyncOps;
  int asyncOpCount;
  size_t asyncTotalSize;

  // List of async p2p operations queued with group semantics
  struct ncclP2Plist* p2pSends;
  struct ncclP2Plist* p2pRecvs;
  int p2pSendCount;
  int p2pRecvCount;
};
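
// The latencies/bandwidths tables above drive algorithm/protocol
// selection: for a given function, the predicted time of each
// (algorithm, protocol) pair is roughly latency plus bytes divided by
// bandwidth, and the fastest supported pair wins. A simplified sketch of
// that scan (field names match this struct; the loop is illustrative,
// not NCCL's exact tuning code):
//
//   float best = FLT_MAX; int bestAlgo = -1, bestProto = -1;
//   for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
//     for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
//       float bw = comm->bandwidths[func][a][p];
//       if (bw == 0) continue;   // pair not supported on this setup
//       float us = comm->latencies[func][a][p] + nBytes / (1000.0f*bw);
//       if (us < best) { best = us; bestAlgo = a; bestProto = p; }
//     }
//   }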
#endif