2.9.9-1
Fix crash when setting NCCL_MAX_P2P_NCHANNELS below nchannels. Fix hang during sendrecv dynamic NVB connection establishment on cubemesh topologies. Add environment variable to only use SHARP on communicators beyond a given number of ranks. Add debug subsystem to trace memory allocations. Fix compilation with TRACE=1. (Issue #505)
This commit is contained in:
parent
ca8485b0d0
commit
3fec2fa5ee
@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 9
|
||||
NCCL_PATCH := 8
|
||||
NCCL_PATCH := 9
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@ -62,6 +62,8 @@ void ncclDebugInit() {
|
||||
mask = NCCL_TUNING;
|
||||
} else if (strcasecmp(subsys, "ENV") == 0) {
|
||||
mask = NCCL_ENV;
|
||||
} else if (strcasecmp(subsys, "ALLOC") == 0) {
|
||||
mask = NCCL_ALLOC;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
|
@ -133,7 +133,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
|
||||
// Because in cudaGraph mode the launch param needs to be determined
|
||||
// at capture time instead of launch time.
|
||||
if (!usingCudaGraph) {
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
||||
}
|
||||
eqInfo->maxChannels = params->gridDim.x;
|
||||
@ -169,8 +170,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
|
||||
// GDRCOPY support
|
||||
uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
|
||||
uint64_t nelems = channel->workCount;
|
||||
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld last %ld nelems %zi",
|
||||
channel->workFifo, channel->workFifoGdr, first, last, nelems);
|
||||
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi",
|
||||
channel->workFifo, channel->workFifoGdr, first, nelems);
|
||||
|
||||
for (int i = 0; i < nelems; i++) {
|
||||
int elem = (first+i) % NCCL_MAX_OPS;
|
||||
@ -799,6 +800,14 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
cudaStreamCaptureStatus captureStatus;
|
||||
unsigned long long cudaGraphId;
|
||||
if (comm->driverVersion < 11030) {
|
||||
CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
|
||||
if (captureStatus != cudaStreamCaptureStatusNone) {
|
||||
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
|
||||
if (captureStatus == cudaStreamCaptureStatusActive) {
|
||||
if (cudaGraphId != comm->lastCudaGraphId) {
|
||||
|
@ -29,6 +29,8 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
|
||||
|
||||
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
||||
if (baseNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
@ -63,7 +65,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
|
||||
// allow routing through a GPU only as 1 hop
|
||||
if (node != baseNode && node->type == GPU &&
|
||||
(link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
|
||||
(ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
|
||||
|
||||
if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
|
||||
// Find reverse link
|
||||
@ -529,3 +531,20 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
NCCLCHECK(ncclCalloc(ranks, ngpus));
|
||||
int nvbGpus = 0;
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
if (gpu->gpu.rank != rank) continue;
|
||||
for (int p=0; p<ngpus; p++) {
|
||||
if (gpu->paths[GPU][p].type == PATH_NVB) {
|
||||
(*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
|
||||
}
|
||||
}
|
||||
}
|
||||
*nranks = nvbGpus;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
@ -469,7 +469,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
||||
}
|
||||
ncclDebugNoWarn = 1;
|
||||
ncclDebugNoWarn = NCCL_GRAPH;
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) getPciPath(busId, &path);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
@ -16,6 +16,7 @@ template <typename T>
|
||||
static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
|
||||
CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "Cuda Host Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -33,6 +34,7 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
}
|
||||
memset(p, 0, nelem*sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
INFO(NCCL_ALLOC, "Mem Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -45,6 +47,7 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
|
||||
CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
CUDACHECK(cudaStreamDestroy(stream));
|
||||
INFO(NCCL_ALLOC, "Cuda Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@ -65,6 +68,7 @@ static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
|
||||
if (ret != 0) return ncclSystemError;
|
||||
memset(p, 0, size);
|
||||
*ptr = p;
|
||||
INFO(NCCL_ALLOC, "Ib Alloc Size %ld pointer %p", size, *ptr);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
@ -159,6 +159,7 @@ struct ncclComm {
|
||||
struct ncclQueueInfo* enqueueInfo;
|
||||
cudaGraphNode_t lastSetupNode;
|
||||
unsigned long long lastCudaGraphId;
|
||||
int driverVersion;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -26,6 +26,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@ -19,7 +19,7 @@
|
||||
#define NCCL_NET_MAX_REQUESTS 8
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
|
@ -60,4 +60,5 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
|
||||
#endif
|
||||
|
94
src/init.cc
94
src/init.cc
@ -248,7 +248,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
||||
cudaGetDevice(&comm->cudaDev);
|
||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
|
||||
|
||||
comm->doneEvent = doneEvent;
|
||||
comm->intDoneEvent = intDoneEvent;
|
||||
@ -277,6 +277,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->lastSetupNode = NULL;
|
||||
comm->lastCudaGraphId = -1;
|
||||
|
||||
CUDACHECK(cudaDriverGetVersion(&comm->driverVersion));
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
|
||||
@ -295,11 +297,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
|
||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
// Duplicate the channels on the device
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
|
||||
int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, nChannels));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels));
|
||||
|
||||
// Copy userRanks and peers
|
||||
for (int r=0; r<comm->p2pnChannels; r++) {
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||
}
|
||||
|
||||
@ -459,6 +462,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
|
||||
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
|
||||
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
|
||||
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
|
||||
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
|
||||
// We use 2 AllGathers
|
||||
@ -579,7 +584,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
|
||||
}
|
||||
|
||||
// Determine CollNet support
|
||||
// Determine local CollNet support before all-gather
|
||||
if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
|
||||
if (intraRanks > 8) {
|
||||
if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
|
||||
@ -687,6 +692,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
|
||||
}
|
||||
|
||||
// Determine CollNet support after all-gather now that we know nNodes
|
||||
int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
|
||||
if (comm->nNodes < collNetNodeThreshold) {
|
||||
if (comm->collNetSupport == 1)
|
||||
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
|
||||
comm->collNetSupport = 0;
|
||||
}
|
||||
|
||||
int *rings;
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph));
|
||||
@ -727,6 +740,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
|
||||
free(rings);
|
||||
INFO(NCCL_INIT, "Connected all rings");
|
||||
|
||||
// Connect Trees
|
||||
@ -759,27 +773,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
|
||||
collNetSetupFail = 1;
|
||||
}
|
||||
// Verify CollNet setup across ranks after trying the first channel
|
||||
if (c == 0) {
|
||||
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
|
||||
}
|
||||
}
|
||||
// Verify CollNet setup across ranks after trying all channels
|
||||
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
|
||||
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
||||
|
||||
// Connect intra-node CollNet
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
|
||||
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
|
||||
|
||||
collnet_cleanup:
|
||||
free(heads);
|
||||
// Verify CollNet setup across ranks
|
||||
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
|
||||
if (comm->collNetSupport) {
|
||||
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0));
|
||||
}
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0));
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1));
|
||||
}
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1));
|
||||
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
|
||||
if (ret != ncclSuccess) {
|
||||
NCCLCHECK(ncclTransportCollNetFree(comm));
|
||||
comm->collNetSupport = 0;
|
||||
ret = ncclSuccess;
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
|
||||
free(rings);
|
||||
|
||||
// Compute time models for algorithm and protocol combinations
|
||||
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
||||
@ -787,6 +811,32 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
// Compute nChannels per peer for p2p
|
||||
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
||||
|
||||
if (ncclParamNvbPreconnect()) {
|
||||
// Connect p2p when using NVB path
|
||||
int nvbNpeers;
|
||||
int* nvbPeers;
|
||||
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
|
||||
for (int r=0; r<nvbNpeers; r++) {
|
||||
int peer = nvbPeers[r];
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
}
|
||||
}
|
||||
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
}
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
|
||||
free(nvbPeers);
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
|
||||
|
||||
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
|
||||
@ -916,7 +966,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
|
||||
|
||||
// Try and prevent a double free of the comm struct (user error)
|
||||
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||
|
@ -237,23 +237,26 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
|
||||
free(allGatherFailures);
|
||||
if (collNetSetupFail) {
|
||||
if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
|
||||
// Free collNet resources
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
struct ncclPeer* peer = channel->peers+nranks;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* send = peer->send + b;
|
||||
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
||||
send->transportResources = NULL; // avoid double free
|
||||
}
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* recv = peer->recv + b;
|
||||
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
|
||||
recv->transportResources = NULL; // avoid double free
|
||||
}
|
||||
}
|
||||
// Set support to 0
|
||||
comm->collNetSupport = 0;
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
|
||||
// Free collNet resources
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
struct ncclPeer* peer = channel->peers+comm->nRanks;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* send = peer->send + b;
|
||||
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
||||
send->transportResources = NULL; // avoid double free
|
||||
}
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* recv = peer->recv + b;
|
||||
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
|
||||
recv->transportResources = NULL; // avoid double free
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
@ -371,7 +371,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||
sizesFifo[buffSlot] = -1;
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
@ -388,7 +388,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
|
||||
if (resources->shared == 0) {
|
||||
@ -447,7 +447,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
|
||||
sub->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
|
Loading…
x
Reference in New Issue
Block a user