2.9.9-1
Fix crash when setting NCCL_MAX_P2P_NCHANNELS below nchannels. Fix hang during sendrecv dynamic NVB connection establishment on cubemesh topologies. Add environment variable to only use SHARP on communicators beyond a given number of ranks. Add debug subsystem to trace memory allocations. Fix compilation with TRACE=1. (Issue #505)
This commit is contained in:
parent
ca8485b0d0
commit
3fec2fa5ee
@ -1,6 +1,6 @@
|
|||||||
##### version
|
##### version
|
||||||
NCCL_MAJOR := 2
|
NCCL_MAJOR := 2
|
||||||
NCCL_MINOR := 9
|
NCCL_MINOR := 9
|
||||||
NCCL_PATCH := 8
|
NCCL_PATCH := 9
|
||||||
NCCL_SUFFIX :=
|
NCCL_SUFFIX :=
|
||||||
PKG_REVISION := 1
|
PKG_REVISION := 1
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -62,6 +62,8 @@ void ncclDebugInit() {
|
|||||||
mask = NCCL_TUNING;
|
mask = NCCL_TUNING;
|
||||||
} else if (strcasecmp(subsys, "ENV") == 0) {
|
} else if (strcasecmp(subsys, "ENV") == 0) {
|
||||||
mask = NCCL_ENV;
|
mask = NCCL_ENV;
|
||||||
|
} else if (strcasecmp(subsys, "ALLOC") == 0) {
|
||||||
|
mask = NCCL_ALLOC;
|
||||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||||
mask = NCCL_ALL;
|
mask = NCCL_ALL;
|
||||||
}
|
}
|
||||||
|
@ -133,7 +133,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
|
|||||||
// Because in cudaGraph mode the launch param needs to be determined
|
// Because in cudaGraph mode the launch param needs to be determined
|
||||||
// at capture time instead of launch time.
|
// at capture time instead of launch time.
|
||||||
if (!usingCudaGraph) {
|
if (!usingCudaGraph) {
|
||||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
|
||||||
|
for (int c=0; c<nChannels; c++) {
|
||||||
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
||||||
}
|
}
|
||||||
eqInfo->maxChannels = params->gridDim.x;
|
eqInfo->maxChannels = params->gridDim.x;
|
||||||
@ -169,8 +170,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
|
|||||||
// GDRCOPY support
|
// GDRCOPY support
|
||||||
uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
|
uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
|
||||||
uint64_t nelems = channel->workCount;
|
uint64_t nelems = channel->workCount;
|
||||||
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld last %ld nelems %zi",
|
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi",
|
||||||
channel->workFifo, channel->workFifoGdr, first, last, nelems);
|
channel->workFifo, channel->workFifoGdr, first, nelems);
|
||||||
|
|
||||||
for (int i = 0; i < nelems; i++) {
|
for (int i = 0; i < nelems; i++) {
|
||||||
int elem = (first+i) % NCCL_MAX_OPS;
|
int elem = (first+i) % NCCL_MAX_OPS;
|
||||||
@ -799,6 +800,14 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
|
|||||||
#if CUDART_VERSION >= 11030
|
#if CUDART_VERSION >= 11030
|
||||||
cudaStreamCaptureStatus captureStatus;
|
cudaStreamCaptureStatus captureStatus;
|
||||||
unsigned long long cudaGraphId;
|
unsigned long long cudaGraphId;
|
||||||
|
if (comm->driverVersion < 11030) {
|
||||||
|
CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
|
||||||
|
if (captureStatus != cudaStreamCaptureStatusNone) {
|
||||||
|
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
|
||||||
|
return ncclInvalidUsage;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
|
CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
|
||||||
if (captureStatus == cudaStreamCaptureStatusActive) {
|
if (captureStatus == cudaStreamCaptureStatusActive) {
|
||||||
if (cudaGraphId != comm->lastCudaGraphId) {
|
if (cudaGraphId != comm->lastCudaGraphId) {
|
||||||
|
@ -29,6 +29,8 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
|
|||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
|
||||||
|
|
||||||
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
||||||
if (baseNode->paths[baseNode->type] == NULL) {
|
if (baseNode->paths[baseNode->type] == NULL) {
|
||||||
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||||
@ -63,7 +65,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
|||||||
|
|
||||||
// allow routing through a GPU only as 1 hop
|
// allow routing through a GPU only as 1 hop
|
||||||
if (node != baseNode && node->type == GPU &&
|
if (node != baseNode && node->type == GPU &&
|
||||||
(link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
|
(ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
|
||||||
|
|
||||||
if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
|
if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
|
||||||
// Find reverse link
|
// Find reverse link
|
||||||
@ -529,3 +531,20 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
|||||||
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) {
|
||||||
|
int ngpus = system->nodes[GPU].count;
|
||||||
|
NCCLCHECK(ncclCalloc(ranks, ngpus));
|
||||||
|
int nvbGpus = 0;
|
||||||
|
for (int g=0; g<ngpus; g++) {
|
||||||
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||||
|
if (gpu->gpu.rank != rank) continue;
|
||||||
|
for (int p=0; p<ngpus; p++) {
|
||||||
|
if (gpu->paths[GPU][p].type == PATH_NVB) {
|
||||||
|
(*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*nranks = nvbGpus;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
@ -469,7 +469,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
|||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
||||||
}
|
}
|
||||||
ncclDebugNoWarn = 1;
|
ncclDebugNoWarn = NCCL_GRAPH;
|
||||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
||||||
if (index == -1) {
|
if (index == -1) {
|
||||||
if (path == NULL) getPciPath(busId, &path);
|
if (path == NULL) getPciPath(busId, &path);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -16,6 +16,7 @@ template <typename T>
|
|||||||
static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
|
static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
|
||||||
CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
|
CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
|
||||||
memset(*ptr, 0, nelem*sizeof(T));
|
memset(*ptr, 0, nelem*sizeof(T));
|
||||||
|
INFO(NCCL_ALLOC, "Cuda Host Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -33,6 +34,7 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
|||||||
}
|
}
|
||||||
memset(p, 0, nelem*sizeof(T));
|
memset(p, 0, nelem*sizeof(T));
|
||||||
*ptr = (T*)p;
|
*ptr = (T*)p;
|
||||||
|
INFO(NCCL_ALLOC, "Mem Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -45,6 +47,7 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
|
|||||||
CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||||
CUDACHECK(cudaStreamSynchronize(stream));
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
CUDACHECK(cudaStreamDestroy(stream));
|
CUDACHECK(cudaStreamDestroy(stream));
|
||||||
|
INFO(NCCL_ALLOC, "Cuda Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,6 +68,7 @@ static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
|
|||||||
if (ret != 0) return ncclSystemError;
|
if (ret != 0) return ncclSystemError;
|
||||||
memset(p, 0, size);
|
memset(p, 0, size);
|
||||||
*ptr = p;
|
*ptr = p;
|
||||||
|
INFO(NCCL_ALLOC, "Ib Alloc Size %ld pointer %p", size, *ptr);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,6 +159,7 @@ struct ncclComm {
|
|||||||
struct ncclQueueInfo* enqueueInfo;
|
struct ncclQueueInfo* enqueueInfo;
|
||||||
cudaGraphNode_t lastSetupNode;
|
cudaGraphNode_t lastSetupNode;
|
||||||
unsigned long long lastCudaGraphId;
|
unsigned long long lastCudaGraphId;
|
||||||
|
int driverVersion;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -26,6 +26,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
|||||||
void ncclTopoFree(struct ncclTopoSystem* system);
|
void ncclTopoFree(struct ncclTopoSystem* system);
|
||||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||||
|
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
||||||
|
|
||||||
// Query topology
|
// Query topology
|
||||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
|
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -19,7 +19,7 @@
|
|||||||
#define NCCL_NET_MAX_REQUESTS 8
|
#define NCCL_NET_MAX_REQUESTS 8
|
||||||
|
|
||||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
|
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||||
|
|
||||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||||
|
|
||||||
|
@ -60,4 +60,5 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
enum { collNetRecv=0, collNetSend=1 };
|
enum { collNetRecv=0, collNetSend=1 };
|
||||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
||||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||||
|
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
|
||||||
#endif
|
#endif
|
||||||
|
80
src/init.cc
80
src/init.cc
@ -248,7 +248,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
||||||
cudaGetDevice(&comm->cudaDev);
|
cudaGetDevice(&comm->cudaDev);
|
||||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
|
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
|
||||||
|
|
||||||
comm->doneEvent = doneEvent;
|
comm->doneEvent = doneEvent;
|
||||||
comm->intDoneEvent = intDoneEvent;
|
comm->intDoneEvent = intDoneEvent;
|
||||||
@ -277,6 +277,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
comm->lastSetupNode = NULL;
|
comm->lastSetupNode = NULL;
|
||||||
comm->lastCudaGraphId = -1;
|
comm->lastCudaGraphId = -1;
|
||||||
|
|
||||||
|
CUDACHECK(cudaDriverGetVersion(&comm->driverVersion));
|
||||||
|
|
||||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||||
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
|
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
|
||||||
@ -295,11 +297,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
|
|
||||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||||
// Duplicate the channels on the device
|
// Duplicate the channels on the device
|
||||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
|
int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
|
||||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
|
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, nChannels));
|
||||||
|
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels));
|
||||||
|
|
||||||
// Copy userRanks and peers
|
// Copy userRanks and peers
|
||||||
for (int r=0; r<comm->p2pnChannels; r++) {
|
for (int r=0; r<comm->nChannels; r++) {
|
||||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -459,6 +462,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
|||||||
|
|
||||||
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
|
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
|
||||||
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
|
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
|
||||||
|
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
|
||||||
|
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
|
||||||
|
|
||||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
|
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
|
||||||
// We use 2 AllGathers
|
// We use 2 AllGathers
|
||||||
@ -579,7 +584,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
|
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine CollNet support
|
// Determine local CollNet support before all-gather
|
||||||
if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
|
if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
|
||||||
if (intraRanks > 8) {
|
if (intraRanks > 8) {
|
||||||
if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
|
if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
|
||||||
@ -687,6 +692,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
|
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine CollNet support after all-gather now that we know nNodes
|
||||||
|
int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
|
||||||
|
if (comm->nNodes < collNetNodeThreshold) {
|
||||||
|
if (comm->collNetSupport == 1)
|
||||||
|
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
|
||||||
|
comm->collNetSupport = 0;
|
||||||
|
}
|
||||||
|
|
||||||
int *rings;
|
int *rings;
|
||||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph));
|
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph));
|
||||||
@ -727,6 +740,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
||||||
}
|
}
|
||||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
|
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
|
||||||
|
free(rings);
|
||||||
INFO(NCCL_INIT, "Connected all rings");
|
INFO(NCCL_INIT, "Connected all rings");
|
||||||
|
|
||||||
// Connect Trees
|
// Connect Trees
|
||||||
@ -759,27 +773,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
|
else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
|
||||||
collNetSetupFail = 1;
|
collNetSetupFail = 1;
|
||||||
}
|
}
|
||||||
|
// Verify CollNet setup across ranks after trying the first channel
|
||||||
|
if (c == 0) {
|
||||||
|
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
|
||||||
}
|
}
|
||||||
free(heads);
|
}
|
||||||
// Verify CollNet setup across ranks
|
// Verify CollNet setup across ranks after trying all channels
|
||||||
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
|
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
|
||||||
if (comm->collNetSupport) {
|
|
||||||
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
||||||
|
|
||||||
|
// Connect intra-node CollNet
|
||||||
for (int c=0; c<comm->nChannels; c++) {
|
for (int c=0; c<comm->nChannels; c++) {
|
||||||
struct ncclChannel* channelRecv = comm->channels+c;
|
struct ncclChannel* channelRecv = comm->channels+c;
|
||||||
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0));
|
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||||
}
|
}
|
||||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0));
|
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
|
||||||
for (int c=0; c<comm->nChannels; c++) {
|
for (int c=0; c<comm->nChannels; c++) {
|
||||||
struct ncclChannel* channelSend = comm->channels+c;
|
struct ncclChannel* channelSend = comm->channels+c;
|
||||||
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1));
|
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||||
}
|
}
|
||||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1));
|
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
|
||||||
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
|
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
|
||||||
|
|
||||||
|
collnet_cleanup:
|
||||||
|
free(heads);
|
||||||
|
if (ret != ncclSuccess) {
|
||||||
|
NCCLCHECK(ncclTransportCollNetFree(comm));
|
||||||
|
comm->collNetSupport = 0;
|
||||||
|
ret = ncclSuccess;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
|
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
|
||||||
free(rings);
|
|
||||||
|
|
||||||
// Compute time models for algorithm and protocol combinations
|
// Compute time models for algorithm and protocol combinations
|
||||||
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
||||||
@ -787,6 +811,32 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
// Compute nChannels per peer for p2p
|
// Compute nChannels per peer for p2p
|
||||||
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
||||||
|
|
||||||
|
if (ncclParamNvbPreconnect()) {
|
||||||
|
// Connect p2p when using NVB path
|
||||||
|
int nvbNpeers;
|
||||||
|
int* nvbPeers;
|
||||||
|
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
|
||||||
|
for (int r=0; r<nvbNpeers; r++) {
|
||||||
|
int peer = nvbPeers[r];
|
||||||
|
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||||
|
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||||
|
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||||
|
if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
|
||||||
|
comm->connectRecv[peer] |= (1<<channelId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||||
|
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||||
|
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||||
|
if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
|
||||||
|
comm->connectSend[peer] |= (1<<channelId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
|
||||||
|
free(nvbPeers);
|
||||||
|
}
|
||||||
|
|
||||||
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
|
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
|
||||||
|
|
||||||
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
|
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
|
||||||
@ -916,7 +966,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
|||||||
if (comm == NULL)
|
if (comm == NULL)
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
|
||||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
|
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
|
||||||
|
|
||||||
// Try and prevent a double free of the comm struct (user error)
|
// Try and prevent a double free of the comm struct (user error)
|
||||||
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
|
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||||
|
@ -237,10 +237,16 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
|
|||||||
free(allGatherFailures);
|
free(allGatherFailures);
|
||||||
if (collNetSetupFail) {
|
if (collNetSetupFail) {
|
||||||
if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
|
if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
|
||||||
// Free collNet resources
|
// Free collNet resources
|
||||||
for (int r=0; r<comm->nChannels; r++) {
|
for (int r=0; r<comm->nChannels; r++) {
|
||||||
struct ncclChannel* channel = comm->channels+r;
|
struct ncclChannel* channel = comm->channels+r;
|
||||||
struct ncclPeer* peer = channel->peers+nranks;
|
struct ncclPeer* peer = channel->peers+comm->nRanks;
|
||||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||||
struct ncclConnector* send = peer->send + b;
|
struct ncclConnector* send = peer->send + b;
|
||||||
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
||||||
@ -252,8 +258,5 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
|
|||||||
recv->transportResources = NULL; // avoid double free
|
recv->transportResources = NULL; // avoid double free
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Set support to 0
|
|
||||||
comm->collNetSupport = 0;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
@ -371,7 +371,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
// Data is ready, try to send.
|
// Data is ready, try to send.
|
||||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
|
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
|
||||||
if (sub->requests[buffSlot] != NULL) {
|
if (sub->requests[buffSlot] != NULL) {
|
||||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||||
sizesFifo[buffSlot] = -1;
|
sizesFifo[buffSlot] = -1;
|
||||||
// Make sure size is reset to zero before we update the head.
|
// Make sure size is reset to zero before we update the head.
|
||||||
__sync_synchronize();
|
__sync_synchronize();
|
||||||
@ -388,7 +388,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||||
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
|
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
|
||||||
if (done) {
|
if (done) {
|
||||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", sub->done, buffSlot, sub->requests[buffSlot]);
|
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||||
sub->done += args->sliceSteps;
|
sub->done += args->sliceSteps;
|
||||||
|
|
||||||
if (resources->shared == 0) {
|
if (resources->shared == 0) {
|
||||||
@ -447,7 +447,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
|||||||
}
|
}
|
||||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
|
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
|
||||||
if (sub->requests[buffSlot] != NULL) {
|
if (sub->requests[buffSlot] != NULL) {
|
||||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
|
TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
|
||||||
sub->posted += args->sliceSteps;
|
sub->posted += args->sliceSteps;
|
||||||
args->idle = 0;
|
args->idle = 0;
|
||||||
continue;
|
continue;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user