/*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "nccl.h"
#include "core.h"
#include "channel.h"
#include "param.h"
#include "nvmlwrap.h"
#include "rings.h"
#include "trees.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
#include "utils.h"
#include "net.h"
#include "checks.h"
#include "enqueue.h"
#include "topo.h"
#include "nvlink.h"
#include "cpuset.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>

#define STR2(v) #v
#define STR(v) STR2(v)

int ncclDebugLevel;
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
pthread_mutex_t ncclDebugOutputLock;
FILE *ncclDebugFile = stdout;

#ifdef ENABLE_TRACE
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif

#if CUDART_VERSION >= 9020
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 need to use an internal CUDA stream
#endif
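
// NCCL_PARAM(Name, "SUFFIX", default) (see param.h) defines ncclParamName(),
// which reads the NCCL_<SUFFIX> environment variable and falls back to the
// given default when it is unset. The two parameters below are thus
// controlled by NCCL_GROUP_CUDA_STREAM and NCCL_CHECK_POINTERS.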

NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);

NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);

ncclNet_t* ncclNet = NULL;

// We define this as weak to let tests redefine their own
#pragma weak ncclNvlinkGpu
ncclResult_t ncclNvlinkGpu(int* nvlink) {
  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
  *nvlink = getNvlinkGpu(busId, NULL);
  return ncclSuccess;
}
// We define this as weak to let tests redefine their own
#pragma weak ncclCudaCompCap
int ncclCudaCompCap() {
  int cudaDev;
  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
  int ccMajor;
  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
  return ccMajor;
}
int ncclCudaFullCompCap() {
  int cudaDev;
  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
  int ccMajor, ccMinor;
  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
  if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
  return ccMajor*10+ccMinor;
}

// Returns ncclInternalError if anything fails, causing that network to be ignored.
ncclResult_t initNet(ncclNet_t* net) {
  int ndev;
  if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
  if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
  if (ndev <= 0) return ncclSystemError;
  return ncclSuccess;
}

ncclResult_t initNetPlugin(ncclNet_t** net) {
  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
  if (netPluginLib == NULL) {
    // dlopen is not guaranteed to set errno, and dlerror only returns a
    // string, but checking errno anyway may let us provide a better error
    // message.
    if (errno == ENOENT) {
      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so).");
    } else {
      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
    }
    return ncclSuccess;
  }
  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
  if (extNet == NULL) {
    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
    goto cleanup;
  }
  if (initNet(extNet) == ncclSuccess) {
    *net = extNet;
    return ncclSuccess;
  }
cleanup:
  if (netPluginLib != NULL) dlclose(netPluginLib);
  return ncclSuccess;
}

ncclResult_t initNet() {
  // Always initialize bootstrap network
  NCCLCHECK(bootstrapNetInit());

  NCCLCHECK(initNetPlugin(&ncclNet));
  if (ncclNet != NULL) return ncclSuccess;
  // No plugin : fall back to the built-in transports, InfiniBand verbs first,
  // then sockets.
  if (initNet(&ncclNetIb) == ncclSuccess) {
    ncclNet = &ncclNetIb;
  } else {
    NCCLCHECK(initNet(&ncclNetSocket));
    ncclNet = &ncclNetSocket;
  }
  return ncclSuccess;
}

NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);

int ncclThreadThreshold(int minCompCap, int multiNode) {
  int threshold = ncclParamThreadThreshold();
  if (threshold == -2) { // user has not set this env variable
    threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
    // multiply by 2 if running on multiple nodes
    if (multiNode) {
      threshold *= 2;
    }
  }
  return threshold;
}

pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static ncclResult_t ncclInit() {
  if (initialized) return ncclSuccess;
  pthread_mutex_lock(&initLock);
  if (!initialized) {
    initEnv();
    initDebug();
    initNet();
    initialized = true;
  }
  pthread_mutex_unlock(&initLock);
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclGetVersion, int* version);
ncclResult_t ncclGetVersion(int* version) {
  if (version == NULL) return ncclInvalidArgument;
  *version = NCCL_VERSION_CODE;
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
  NCCLCHECK(ncclInit());
  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
  return bootstrapGetUniqueId(out);
}
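
// Typical usage of the API above together with ncclCommInitRank below, with
// one process per GPU. A minimal sketch, assuming an out-of-band broadcast
// (here MPI, purely illustrative) to distribute the unique id:
//
//   ncclUniqueId id;
//   if (myRank == 0) NCCLCHECK(ncclGetUniqueId(&id));
//   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
//   CUDACHECK(cudaSetDevice(localRank));
//   ncclComm_t comm;
//   NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));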

// Prevent compiler from optimizing out these operations
void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
  comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
}

static ncclResult_t commFree(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

  free(comm->peerInfo);

  if (comm->bootstrap)
    NCCLCHECK(bootstrapClose(comm->bootstrap));

  CUDACHECK(cudaFree(comm->hostDevComm.channels));
  CUDACHECK(cudaFree(comm->devComm));

  for (int channel=0; channel<comm->nChannels; channel++)
    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));

  if (comm->doneEvent != NULL)
    CUDACHECK(cudaEventDestroy(comm->doneEvent));

  if (comm->launchMode == ncclComm::GROUP) {
    CUDACHECK(cudaStreamDestroy(comm->groupStream));
  }

  // The last rank through the barrier frees the resources shared between threads
  int isLast;
  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
  if (isLast) {
    free(comm->intraBarrier);
    free(comm->intraParams);
    free(comm->intraCudaDevs);
    free(comm->intraCGMode);
    free(comm->intraCC);
  }
  CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
  CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));

  // Poison comm to try and catch a double free
  commPoison(comm);

  free(comm);
  return ncclSuccess;
}

static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  if (ndev < 1) {
    WARN("invalid device count (%d) requested", ndev);
    return ncclInvalidArgument;
  }
  if (rank >= ndev || rank < 0) {
    WARN("invalid rank %d (ndev=%d)", rank, ndev);
    return ncclInvalidArgument;
  }

  // Try to create a CUDA object right away. If there is something wrong with
  // the device we're on (failure cause #1), better to know it early.
  cudaEvent_t doneEvent;
  CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));

  struct ncclComm* comm;
  NCCLCHECK(ncclCalloc(&comm, 1));

  comm->rank = comm->hostDevComm.rank = rank;
  comm->nRanks = comm->hostDevComm.nRanks = ndev;
  cudaGetDevice(&comm->cudaDev);
  getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);

  comm->doneEvent = doneEvent;
  comm->llThreshold = ncclParamLlThreshold();
  comm->treeThreshold = ncclParamTreeThreshold();
  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9020
  comm->groupCudaStream = ncclParamGroupCudaStream();
#else
  // Don't allow the user to overload the default setting in older CUDA builds
  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif
  comm->fatalError = ncclSuccess;

  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
  *comm->fatalDevError = ncclDevSuccess;

  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
  *comm->abortFlag = 0;

  comm->argsptr = &comm->args;

  *comret = comm;
  return ncclSuccess;
}

static ncclResult_t devCommSetup(ncclComm_t comm) {
  // Duplicate the channels on the device
  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));

  // Copy userRanks and peers
  for (int r=0; r<comm->nChannels; r++) {
    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
  }

  // Duplicate the dev comm on the device
  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
  NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
  return ncclSuccess;
}

// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
static void showVersion() {
  static int shown = 0;
  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
    printf("%s\n", VERSION_STRING);
    fflush(stdout);
    if (ncclDebugFile != stdout)
      INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
    shown = 1;
  }
}

static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
  info->rank = rank;
  CUDACHECK(cudaGetDevice(&info->cudaDev));
  NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev));
  info->hostHash=getHostHash()+commHash;
  info->pidHash=getPidHash()+commHash;

  // Get the PCI Bus Id. We need to get the bus ID through CUDA first, since
  // cudaDev is a CUDA runtime device number which can differ from the NVML
  // device number. Then we get the busId from NVML to be sure it is
  // consistent with the NVML remote PCI bus Ids.
  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
  nvmlDevice_t nvmlDevice;
  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
  nvmlPciInfo_t pciInfo;
  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
  return ncclSuccess;
}

template <int type>
static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
  for (int t=0; t<NTRANSPORTS; t++) {
    struct ncclTransport *transport = ncclTransports+t;
    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
    ncclTvalue_t ret = 0;
    NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
    if (ret > 0) {
      connector->transportComm = transportComm;
      NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
      return ncclSuccess;
    }
  }
  WARN("No transport found!");
  return ncclInternalError;
}

// Integer floor of log2(n)
static int log2(int n) {
  int l = 0;
  while (n>>=1) l++;
  return l;
}
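
// The tree/ring crossover computed below follows from two linear cost models:
//   time_ring(s) = ringlat + s/ringbw
//   time_tree(s) = treelat + s/treebw
// The ring has a latency that grows with nranks but the higher bandwidth; the
// tree has lower latency but lower bandwidth. Equating the two and solving
// for the message size s gives the threshold used below:
//   s = (ringbw*treebw/(ringbw-treebw)) * (ringlat-treelat)
// Trees are used for sizes below the threshold, rings above it. The bandwidth
// and latency constants are rough hard-coded estimates (B/us and us).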

static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) {
  int nvlink;
  NCCLCHECK(ncclNvlinkGpu(&nvlink));
  float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us
  float ringlatinter = 6;
  float treelatintra = 4;
  float treelatinter = 15;
  float treebw;
  if (!nvlink) {
    treebw = ringbw * 2 / 3;
  } else {
    treebw = ringbw * 3 / 4;
    if (nnodes == 2) treebw *= 2;
  }
  float ringlat = ringlatinter*(nranks-1);
  float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1);
  if (nnodes < 2 || ringlat <= treelat)
    *treeThreshold = 0;
  else if (treebw > ringbw)
    *treeThreshold = 0x7fffffffffffffff;
  else
    *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat));
  return ncclSuccess;
}

static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) {
  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
  NCCLCHECK(initChannel(comm, channelId));

  struct ncclChannel* channel = comm->channels+channelId;
  struct ncclRing* ring = &channel->ring;

  // Reorganize ranks to start with rank.
  int shift;
  for (shift = 0; shift<nranks; shift++) {
    if (ringRanks[shift] == rank) {
      break;
    }
  }
  for (int i=0; i<nranks; i++) {
    ring->userRanks[i] = ringRanks[(i+shift)%nranks];
  }
  int prev = ring->prev = ring->userRanks[nranks-1];
  int next = ring->next = ring->userRanks[1];

  struct ncclTree* tree = &channel->tree;
  tree->up = -1;
  tree->down[0] = tree->down[1] = tree->down[2] = -1;

  //
  // Find per-node masters and connect them via a binary tree
  //

  int nMasters = 0;
  for (int r=0; r<nranks; r++) nMasters += treeMasters[r];
  if (nMasters == 0) {
    nMasters = 1;
    treeMasters[0] = 1;
  }

  if (comm->treeThreshold == -2)
    NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold));

  if (comm->treeThreshold > 0) {
    // Compute tree depth. Not an exact value but a good approximation in most
    // cases and consistent across nodes
    tree->depth = nranks/nMasters + log2(nMasters);

    // Find my master: go backwards in the ring to find my root
    int master = 0;
    for (int i = 0; i<nranks; i++) {
      int r = ring->userRanks[(nranks-i)%nranks];
      if (treeMasters[r]) {
        master = r;
        break;
      }
    }

    int* ranks;
    NCCLCHECK(ncclCalloc(&ranks, nMasters));
    int i = 0, masterIndex = -1;
    // Build binary tree
    for (int r=0; r<nranks; r++) {
      // Create index table
      if (r == master) masterIndex = i;
      if (treeMasters[r]) ranks[i++] = r;
    }
    int btreeUp, btreeDown0, btreeDown1;
    int u0, d0_0, d0_1, u1, d1_0, d1_1;
    NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
    if (channelId < DIVUP(comm->nChannels, 2)) {
      btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1;
    } else {
      btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1;
    }
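
    // ncclGetDtree returns two complementary binary trees over the masters,
    // so that a node which is a leaf in one tree is an inner node in the
    // other. The first half of the channels use the first tree and the
    // second half use the second, which balances the inter-node traffic.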

    //
    // Now build the full tree, combining the intra-node ring and the
    // inter-node binary tree.
    //

    if (rank == master) {
      int nDown = 0;
      if (btreeUp != -1) tree->up = ranks[btreeUp];
      if (treeMasters[next] == 0) tree->down[nDown++] = next;
      if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0];
      if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1];
    } else {
      tree->up = prev;
      if (treeMasters[next] == 0) tree->down[0] = next;
    }
    free(ranks);
  }

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
  return ncclSuccess;
}

static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
  for (int r=0; r<nranks; r++) {
    connectTransport[r] = -1;
    for (int t=0; t<NTRANSPORTS; t++) {
      NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r));
      if (connectValue[r] > 0) {
        connectTransport[r] = t;
        break;
      }
    }
  }
  return ncclSuccess;
}

#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
void dumpMatrix(int* connectMatrix, int nranks) {
  char line[STRLENGTH+1];
  line[STRLENGTH] = '\0';
  memset(line, ' ', STRLENGTH);
  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
  INFO(NCCL_INIT,"%s", line);
  for (int i=0; i<nranks; i++) {
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", i);
    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
    INFO(NCCL_INIT,"%s", line);
  }
}

void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
  char line[STRLENGTH+1];
  line[STRLENGTH] = '\0';
  memset(line, ' ', STRLENGTH);
  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
  INFO(NCCL_INIT,"%s", line);
  for (int i=0; i<nranks; i++) {
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", i);
    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
    INFO(NCCL_INIT,"%s", line);
  }
}

void dumpLine(int* values, int nranks, const char* prefix) {
  int prefixlen = strlen(prefix);
  char line[STRLENGTH+1];
  line[STRLENGTH] = '\0';
  memset(line, ' ', STRLENGTH);
  strncpy(line, prefix, PREFIXLEN);
  for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
  INFO(NCCL_INIT,"%s", line);
}

static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  for (int r=0; r<nrings; r++) {
    char prefix[30];
    /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
    dumpLine(prev+r*nranks, nranks, prefix);
    sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
    dumpLine(next+r*nranks, nranks, prefix);*/

    int current = rank;
    for (int i=0; i<nranks; i++) {
      rings[r*nranks+i] = current;
      current = next[r*nranks+current];
    }
    sprintf(prefix, "Channel %02d : ", r);
    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
    if (current != rank) {
      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
      return ncclInternalError;
    }
    // Check that all ranks are there
    for (int i=0; i<nranks; i++) {
      int found = 0;
      for (int j=0; j<nranks; j++) {
        if (rings[r*nranks+j] == i) {
          found = 1;
          break;
        }
      }
      if (found == 0) {
        WARN("Error : ring %d does not contain rank %d", r, i);
        return ncclInternalError;
      }
    }
  }
  return ncclSuccess;
}

void* waitForNonNullPtr(void* p) {
  volatile void** ptr = (volatile void**) p;
  while (*ptr == NULL) sched_yield();
  return (void*)*ptr;
}

ncclResult_t initParams(struct ncclComm* comm) {
  struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
  params->args = &comm->argsptr;
  params->stream = NULL;
  params->sharedMem = 0;
  params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
  params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
  return ncclSuccess;
}

// Allocate/Set Intra Process Structures and set CG options
ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
  comm->intraRank = rank;
  comm->intraRanks = ranks;
  comm->intraPhase = 0;

  // Alloc shared structures
  if (rank == 0) {
    assert(comm == comm0);
    int* bar;
    NCCLCHECK(ncclCalloc(&bar, 2));
    bar[0] = bar[1] = 0;
    comm->intraBarrier = bar;
    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
    int* CGMode;
    NCCLCHECK(ncclCalloc(&CGMode, 1));
    *CGMode = 0x11;
    comm->intraCGMode = CGMode;
    int* CC;
    NCCLCHECK(ncclCalloc(&CC, 1));
    *CC = ncclCudaFullCompCap();
    comm->intraCC = CC;
  } else {
    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
    comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
  }
  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
  NCCLCHECK(initParams(comm));

  int cgMdLaunch = 0;

  // Set CG Mode
  comm->launchMode = ncclComm::GROUP;
  char* str = getenv("NCCL_LAUNCH_MODE");
  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
    comm->launchMode = ncclComm::PARALLEL;
  }
  if (comm->launchMode == ncclComm::GROUP) {
    CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
#if CUDART_VERSION >= 9000
    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
      // Check whether the GPU supports Cooperative Group Multi Device Launch
      (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
    }
#endif
  }

  // Disable cgMdLaunch if any rank does not support it
  if (cgMdLaunch == 0) {
    *comm->intraCGMode = 0x10;
  }
  return ncclSuccess;
}
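
// Connect a channel's point-to-point links in two passes: every rank first
// sets up its local receive/send endpoints and sends the resulting connect
// handles to its peers over the bootstrap network, then receives its peers'
// handles and completes each connection. Posting all sends before any
// receive avoids a deadlock, since bootstrapSend/bootstrapRecv are blocking
// point-to-point exchanges.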

static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
  struct ncclConnect connect;
  struct ncclConnector* conn;
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1) continue;
    conn = &channel->peers[peer].recv;
    if (conn->connected) { ++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1) continue;
    conn = &channel->peers[peer].send;
    if (conn->connected) { ++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1) continue;
    conn = &channel->peers[peer].send;
    if (conn->connected) { ++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
    NCCLCHECK(conn->transportComm->connect(&connect, conn));
    conn->connected = 1;
  }
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1) continue;
    conn = &channel->peers[peer].recv;
    if (conn->connected) { ++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
    NCCLCHECK(conn->transportComm->connect(&connect, conn));
    conn->connected = 1;
  }
  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
  return ncclSuccess;
}

static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
  // We use 3 AllGathers
  // 1. { peerInfo, comm }
  // 2. ConnectTransport[nranks], ConnectValue[nranks]
  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }

  int rank = comm->rank;
  int nranks = comm->nRanks;
  uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));

  // AllGather1 - begin
  struct {
    struct ncclPeerInfo peerInfo;
    struct ncclComm* comm;
  } *allGather1Data;

  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
  allGather1Data[rank].comm = comm;
  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));

  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
  for (int i = 0; i < nranks; i++) {
    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
  }
  // AllGather1 data is used again below
  // AllGather1 - end

  // AllGather2 - begin
  size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
  void *allGather2Data;
  NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
  int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
  ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);

  NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));

  int* connectTransport;
  ncclTvalue_t* connectValue;
  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
  for (int i = 0; i < nranks; i++) {
    memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
    memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
  }
  free(allGather2Data);
  // AllGather2 - end

  //if (rank == 0) dumpMatrix(connectTransport, nranks);
  //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);

  // Get my rings
  int nrings;
  int* prev, *next, *treeIn, *treeOut;
  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
  comm->nThreads = getDefaultThreads();
  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
  TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
  assert(nrings <= MAXCHANNELS);
  free(connectTransport);
  free(connectValue);

  // AllGather3 - begin
  struct {
    int nThreads;
    int nrings;
    int cudaCompCap;
    int prev[MAXCHANNELS];
    int next[MAXCHANNELS];
  } *allGather3Data;

  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
  allGather3Data[rank].nThreads = comm->nThreads;
  allGather3Data[rank].nrings = nrings;
  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
  for (int r=0; r<nrings; r++) {
    allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
    allGather3Data[rank].next[r] = *(next+r*nranks+rank);
  }
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));

  // Find max nThreads
  for (int i=0; i<nranks; i++)
    comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);

  // Determine the minimum CUDA Compute capability of all GPUs
  int myCompCap = allGather3Data[rank].cudaCompCap;
  int minCompCap = myCompCap;
  for (int i = 0; i < nranks; i++)
    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);

  // Determine thread threshold across all GPUs
  int nnodes = 0;
  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
  comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);

  // Find min nrings across ranks
  for (int i=0; i<nranks; i++)
    nrings = std::min(allGather3Data[i].nrings, nrings);
  comm->nChannels = nrings;

  // Unpack the per ring prev/next arrays
  for (int i = 0; i < nranks; i++) {
    for (int r = 0; r < nrings; r++) {
      prev[r*nranks+i] = allGather3Data[i].prev[r];
      next[r*nranks+i] = allGather3Data[i].next[r];
    }
  }
  free(allGather3Data);
  // AllGather3 - end

  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
  NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
  free(prev);
  free(next);
  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);

  // Connect with prev/next for each ring
  struct ncclConnect *connect;
  NCCLCHECK(ncclCalloc(&connect, 2));
  for (int r=0; r<nrings; r++) {
    struct ncclChannel* channel = comm->channels+r;
    NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
    NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
  }
  if (comm->treeThreshold > 0) {
    char line[1024];
    line[0]='\0';
    for (int c=0; c<nrings; c++) {
      struct ncclTree* tree = &comm->channels[c].tree;
      snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
          c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
    }
    line[1023] = '\0';
    INFO(NCCL_INIT, "Trees%s", line);
  }
  if (rank == 0) {
    char treeline[64];
    snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
    INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
        comm->treeThreshold == 0 ? "disabled" :
        comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
        treeline);
  }

  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
  free(connect);
  free(rings);
  free(treeIn);
  free(treeOut);

  // Compute intra ranks (using AllGather1 data)
  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
  for (int i = 0; i < nranks; i++) {
    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
      if (intraRanks == 0) intraRank0 = i;
      if (i == rank) intraRank = intraRanks;
      intraRanks++;
    }
  }
  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
      rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
    return ncclInternalError;
  }
  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));

  // Done with AllGather1 data
  free(allGather1Data);

  if (nnodes) NCCLCHECK(transportCreateProxy(comm));

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
  return ncclSuccess;
}

static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
  CPU_ZERO_S(sizeof(cpu_set_t), mask);
  char* cudaPath;
  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
  char path[PATH_MAX];
  strncpy(path, cudaPath, PATH_MAX-1);
  snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
  path[PATH_MAX-1] = '\0';
  int fd;
  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
  char affinityStr[sizeof(cpu_set_t)*2 + 1];
  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
  if (r > 0) {
    affinityStr[r] = '\0';
    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
  }
  close(fd);
  free(cudaPath);
  return ncclSuccess;
}

NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);

static ncclResult_t setCpuAffinity(int cudaDev) {
  // Query the CPU affinity set we were provided
  cpu_set_t mask;
  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
  }
#endif

  // Find the CPUs that are local to the supplied GPU
  cpu_set_t gpuMask;
  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
  }
#endif

  cpu_set_t finalMask;
  if (ncclParamIgnoreCpuAffinity())
    // Ignore the CPU affinity set and use the GPU one instead
    finalMask = gpuMask;
  else
    // Use a subset of the GPU affinity set
    CPU_AND(&finalMask, &mask, &gpuMask);

  // If there is a non-empty set, use it to set affinity
  if (CPU_COUNT(&finalMask)) {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
  }
  return ncclSuccess;
}

ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
  cpu_set_t affinitySave;
  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);

  NCCLCHECK(wrapNvmlSymbols());
  NCCLCHECK(wrapNvmlInit());

  // Make sure all host memory allocations are close to the GPU
  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  NCCLCHECK(setCpuAffinity(cudaDev));
  ncclResult_t res;

  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);

  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);

  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);

  return ncclSuccess;
cleanup:
  *newcomm = NULL;
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
}
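
// If the NCCL_COMM_ID environment variable is set (an address the other
// ranks can reach, typically <ip>:<port>), rank 0 creates the bootstrap
// root at that address, so no out-of-band exchange of the unique id is
// needed.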

NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
  char* env = getenv("NCCL_COMM_ID");
  if (env && myrank == 0) {
    NCCLCHECK(bootstrapCreateRoot(&commId, true));
  }

  NCCLCHECK(ncclInit());
  if (myrank == 0) showVersion();

  // Make sure the CUDA runtime is initialized.
  CUDACHECK(cudaFree(NULL));

  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
  if (nranks < 1 || myrank < 0 || myrank >= nranks) {
    WARN("Invalid rank requested : %d/%d", myrank, nranks);
    return ncclInvalidArgument;
  }

  if (ncclAsyncMode()) {
    int cudaDev;
    CUDACHECK(cudaGetDevice(&cudaDev));
    return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
  } else {
    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
  }
}

static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
  struct ncclPeerInfo* allInfo;
  NCCLCHECK(ncclCalloc(&allInfo, nranks));
  for (int rank=0; rank<nranks; rank++) {
    CUDACHECK(cudaSetDevice(devs[rank]));
    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
  }

  int* connectTransport;
  ncclTvalue_t* connectValue;
  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
  for (int rank=0; rank<nranks; rank++)
    NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));

  int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
  int nrings = MAXCHANNELS;
  int nthreads=0;
  int myCompCap = ncclCudaCompCap();
  int minCompCap = myCompCap;
  for (int rank=0; rank<nranks; rank++) {
    CUDACHECK(cudaSetDevice(devs[rank]));
    int nringsRank;
    int nthreadsRank = getDefaultThreads();
    myCompCap = ncclCudaCompCap();
    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
    nrings = std::min(nrings, nringsRank);
    nthreads = std::max(nthreads, nthreadsRank);
    minCompCap = std::min(minCompCap, myCompCap);
    for (int ring=0; ring<nrings; ring++) {
      int index = ring*nranks+rank;
      prevFinal[index] = prev[index];
      nextFinal[index] = next[index];
    }
  }
  free(connectTransport);
  free(connectValue);
  free(prev);
  free(next);

  INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);

  int* rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
  NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
  free(prevFinal);
  free(nextFinal);

  // Determine thread threshold across all GPUs
  int threadThreshold = ncclThreadThreshold(minCompCap, 0);

  for (int rank=0; rank<nranks; rank++) {
    comms[rank]->nChannels = nrings;
    comms[rank]->nThreads = nthreads;
    comms[rank]->threadThreshold = threadThreshold;
  }

  struct ncclConnect* connect;
  NCCLCHECK(ncclCalloc(&connect, 2*nranks));
  for (int r=0; r<nrings; r++) {
    int* ringRanks = rings+r*nranks;
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECK(cudaSetDevice(devs[rank]));
      struct ncclChannel* channel = comms[rank]->channels+r;
      struct ncclRing *ring = &channel->ring;
      NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
      // Make sure we don't use trees; we cannot use them with initAll
      comms[rank]->treeThreshold = 0;
      int prev = channel->ring.prev = ring->userRanks[nranks-1];
      int next = channel->ring.next = ring->userRanks[1];
      struct ncclConnector* recv = &channel->peers[prev].recv;
      struct ncclConnector* send = &channel->peers[next].send;
      NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
      NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
    }
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECK(cudaSetDevice(devs[rank]));
      struct ncclChannel* channel = comms[rank]->channels+r;
      struct ncclRing *ring = &channel->ring;
      struct ncclConnector* recv = &channel->peers[ring->prev].recv;
      struct ncclConnector* send = &channel->peers[ring->next].send;
      NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
      NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
    }
  }
  free(connect);
  free(allInfo);
  free(rings);
  free(treeIn);
  free(treeOut);
  return ncclSuccess;
}
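
// ncclCommInitAll creates one communicator per GPU within a single process,
// so no unique id needs to be exchanged. A minimal usage sketch (devList
// being a hypothetical array of CUDA device ordinals):
//
//   ncclComm_t comms[4];
//   NCCLCHECK(ncclCommInitAll(comms, 4, devList)); // or NULL for devices 0..3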

NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  NCCLCHECK(ncclInit());
  NCCLCHECK(wrapNvmlSymbols());
  NCCLCHECK(wrapNvmlInit());
  showVersion();

  INFO(NCCL_INIT,"nranks %d", ndev);

  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
  if (ndev < 1) {
    WARN("Invalid device count requested : %d", ndev);
    return ncclInvalidArgument;
  }

  ncclResult_t res;
  int savedDevice;
  int rank, cudaDev;
  ncclComm_t comm = NULL;
  int* ncclDevList = NULL;
  NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
  for (int i=0; i<ndev; i++) {
    ncclDevList[i] = devlist ? devlist[i] : i;
  }

  CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);

  for(rank=0; rank<ndev; ++rank)
    comms[rank] = NULL;

  cpu_set_t affinitySave;
  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);

  for (rank=0; rank<ndev; ++rank) {
    cudaDev = ncclDevList[rank];
    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);

    NCCLCHECK(setCpuAffinity(cudaDev));

    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
    comms[rank] = comm;

    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
  }

  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);

  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);

  for(rank=0; rank<ndev; ++rank) {
    cudaDev = ncclDevList[rank];
    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
  }

  res = ncclSuccess;
  goto final;

cleanup:
  for(rank=0; rank<ndev; ++rank) {
    if(comms[rank] != NULL) {
      commFree(comms[rank]);
    }
  }

final:
  free(ncclDevList);
  if(wrapNvmlShutdown() != ncclSuccess)
    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
  cudaSetDevice(savedDevice);
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
}

static ncclResult_t commDestroy(ncclComm_t comm) {
  int savedDevice;
#ifdef ENABLE_TRACE
  int rank = comm->rank;
#endif
  CUDACHECK(cudaGetDevice(&savedDevice));
  int commDevice = comm->cudaDev;

  if (savedDevice != commDevice) {
    CUDACHECK(cudaSetDevice(commDevice));
  }

  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);

  CUDACHECK(cudaStreamSynchronize(comm->groupStream));
  NCCLCHECK(transportDestroyProxy(comm));
  NCCLCHECK(commFree(comm));

  if (savedDevice != commDevice)
    CUDACHECK(cudaSetDevice(savedDevice));

  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);

  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);

  // Try to prevent a double free of the comm struct (user error)
  if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
    WARN("comm %p has already been destroyed", comm);
    return ncclInvalidArgument;
  }

  return commDestroy(comm);
}

NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
ncclResult_t ncclCommAbort(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

  // Ask anything that might still be running on the device to quit
  *comm->abortFlag = 1;

  return commDestroy(comm);
}

NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
  switch (code) {
    case ncclSuccess : return "no error";
    case ncclUnhandledCudaError : return "unhandled cuda error";
    case ncclSystemError : return "unhandled system error";
    case ncclInternalError : return "internal error";
    case ncclInvalidArgument : return "invalid argument";
    case ncclInvalidUsage : return "invalid usage";
    default : return "unknown result code";
  }
}

NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
  NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));

  // Check device-reported error
  static ncclDevError_t printedDevErr = ncclDevSuccess;
  switch(*comm->fatalDevError) {
    case ncclDevSuccess :
      break;
    case ncclDevAssertedMismatch :
      if (printedDevErr != ncclDevAssertedMismatch) {
        WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
        printedDevErr = ncclDevAssertedMismatch;
      }
      if (comm->fatalError == ncclSuccess) {
        comm->fatalError = ncclInvalidUsage;
      }
      break;
    case ncclDevSuspectedMismatch :
      if (printedDevErr != ncclDevSuspectedMismatch) {
        WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
        printedDevErr = ncclDevSuspectedMismatch;
      }
      break;
    default:
      WARN("Unknown device error %d", *comm->fatalDevError);
      return ncclInternalError;
  }
  *asyncError = comm->fatalError;
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
  *count = comm->nRanks;
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
  *devid = comm->cudaDev;
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
  *rank = comm->rank;
  return ncclSuccess;
}