2.20.5-1
Fix UDS connection failure when using ncclCommSplit. Issue #1185
parent b6475625fb
commit 48bb7fec79
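The failure reported in Issue #1185 involves ncclCommSplit together with the proxy's cuMem UDS (Unix domain socket) path. As a point of reference, a hypothetical single-process sketch of the kind of split that exercises this path is shown below (assumes at least two visible GPUs, error checking omitted; this is not the reproducer from the issue):

// Hypothetical scenario sketch, not the reproducer from Issue #1185: split a
// single-process, multi-GPU communicator into even/odd sub-communicators.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <nccl.h>

int main(void) {
  int nDev = 0;
  cudaGetDeviceCount(&nDev);
  if (nDev < 2) { printf("need at least 2 GPUs\n"); return 0; }

  int* devs = (int*)malloc(nDev * sizeof(int));
  ncclComm_t* comms = (ncclComm_t*)malloc(nDev * sizeof(ncclComm_t));
  ncclComm_t* subComms = (ncclComm_t*)malloc(nDev * sizeof(ncclComm_t));
  for (int i = 0; i < nDev; i++) devs[i] = i;

  // Parent communicator spanning all local GPUs.
  ncclCommInitAll(comms, nDev, devs);

  // Split into two sub-communicators (color = rank parity). Before 2.20.5 the
  // split path could hit the UDS connection failure fixed by this commit.
  ncclGroupStart();
  for (int i = 0; i < nDev; i++)
    ncclCommSplit(comms[i], i % 2 /*color*/, i /*key*/, &subComms[i], NULL);
  ncclGroupEnd();

  for (int i = 0; i < nDev; i++) { ncclCommDestroy(subComms[i]); ncclCommDestroy(comms[i]); }
  free(devs); free(comms); free(subComms);
  return 0;
}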
@@ -1,6 +1,6 @@
  ##### version
  NCCL_MAJOR := 2
  NCCL_MINOR := 20
- NCCL_PATCH := 3
+ NCCL_PATCH := 5
  NCCL_SUFFIX :=
  PKG_REVISION := 1
@@ -305,7 +305,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
  NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
  // cuMem UDS support
- state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
+ // Make sure we create a unique UDS socket name
+ uint64_t randId;
+ NCCLCHECK(getRandomData(&randId, sizeof(randId)));
+ state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
@@ -371,7 +374,10 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
  NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
  // cuMem UDS support
  NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
- state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
+ // Make sure we create a unique UDS socket name
+ uint64_t randId;
+ NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail);
+ state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
  NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
  NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
}
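In both bootstrap paths the per-rank UDS identifier used to be getPidHash()+comm->commHash. For a communicator created by ncclCommSplit, commHash was only computed after bootstrapSplit returned (see the ncclCommInitRankFunc hunk below), so two communicators in the same process could pick the same UDS socket name and the proxy UDS connection failed. The fix mixes fresh randomness into the identifier via getRandomData(). A rough sketch of that idea using standard interfaces, with /dev/urandom standing in for NCCL's internal getRandomData():

// Illustration only: derive a per-communicator 64-bit UDS identifier from the
// process-id hash plus fresh randomness, mirroring getPidHash()+randId above.
// getPidHash()/getRandomData() are NCCL internals; /dev/urandom stands in here.
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int readUrandom(void* buf, size_t len) {
  FILE* f = fopen("/dev/urandom", "rb");
  if (f == NULL) return -1;
  size_t n = fread(buf, 1, len, f);
  fclose(f);
  return (n == len) ? 0 : -1;
}

static uint64_t makeUdsId(uint64_t pidHash) {
  uint64_t randId = 0;
  if (readUrandom(&randId, sizeof(randId)) != 0) {
    // Fallback so two communicators in one process still differ in practice.
    randId = ((uint64_t)getpid() << 32) ^ (uint64_t)time(NULL);
  }
  return pidHash + randId;  // effectively unique per communicator, unlike pidHash+commHash
}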
@@ -1077,7 +1077,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
  int localRanks = comm->topo->nodes[GPU].count;
  int netNum = 0;
  int net[MAXCHANNELS];

  for (int c = 0; c < graph->nChannels; c++) {
    if (graph->intra[c * localRanks] == comm->rank) {
      net[netNum++] = graph->inter[c * 2];
@@ -826,6 +826,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  ncclResult_t ret = ncclSuccess;
  int rank = comm->rank;
  int nranks = comm->nRanks;
+ int nNodes = 1;
  cpu_set_t affinitySave;
  struct ncclTopoGraph ringGraph;
  struct ncclTopoGraph treeGraph;
@@ -865,6 +866,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);

  for (int i = 0; i < nranks; i++) {
+   if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
      ret = ncclInvalidUsage;
@@ -879,7 +881,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  #include "cudawrap.h"

  // MNNVL support
  {
  if (nNodes > 1) {
    int cliqueSize = 0;
    comm->MNNVL = 0;
    // Determine the size of the MNNVL domain/clique
@@ -1485,15 +1487,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
    if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
    snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
    NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
+   comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
    NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
  } else {
    NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
+   comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
    NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
  }

  comm->cudaArch = cudaArch;
- comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
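Computing commHash before bootstrapSplit()/bootstrapInit() (the two "// Needed for UDS support" additions) replaces the assignment that used to run only after bootstrap finished, which was too late for anything UDS-related done during bootstrap. Even with that ordering fixed, commHash alone cannot distinguish two split communicators created from the same parent with the same color, because the snprintf above builds byte-identical commIds for them; that is why the bootstrap hunks switch the UDS name to a random component. A tiny illustrative check with made-up values:

// Values are made up; demonstrates that two splits of the same parent with the
// same color produce byte-identical commIds, so any hash of the id collides.
#include <stdio.h>
#include <string.h>

int main(void) {
  char idA[128], idB[128];
  unsigned long parentHash = 0x1234abcd5678ef00UL;  // hypothetical parent commHash
  int color = 1;

  snprintf(idA, sizeof(idA), "%016lx-%d", parentHash, color);  // first split
  snprintf(idB, sizeof(idB), "%016lx-%d", parentHash, color);  // second split
  printf("ids identical: %s\n", strcmp(idA, idB) == 0 ? "yes" : "no");  // prints "yes"
  return 0;
}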
@@ -1660,6 +1660,9 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
  comm->proxyState->listenSock = sock;
  comm->proxyState->peerAddresses = peerAddresses;
+ comm->proxyState->peerAddressesUDS = peerAddressesUDS;

+ // UDS support
+ NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
  // Seed the random number generator for UDS filename generation
  struct timeval time;
  gettimeofday(&time,NULL);
@@ -1693,8 +1696,7 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
  ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);

  // UDS support
- INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d pidHash %lx", comm, comm->rank, comm->peerInfo[comm->rank].pidHash);
- NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, comm->peerInfo[comm->rank].pidHash, comm->abortFlag));
+ INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
  pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
  ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
}
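With this change the UDS listening socket is created once in ncclProxyInit() from the all-gathered peerProxyAddressesUDS entry, rather than in ncclProxyCreate() from the per-process pidHash. Conceptually, ncclIpcSocketInit() binds a Unix domain socket whose name embeds the rank and that 64-bit identifier, so each (process, communicator, rank) gets its own endpoint. A hedged sketch of such a bind, with a made-up name format and the Linux abstract namespace as assumptions rather than NCCL's actual ipcsocket implementation:

// Sketch of binding a Linux abstract-namespace Unix domain socket whose name
// embeds a rank and a 64-bit hash. The "ncclsock-%d-%lx" format and the use of
// the abstract namespace are illustrative assumptions, not NCCL's ipcsocket code.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int bindUdsSocket(int rank, uint64_t hash) {
  int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
  if (fd < 0) return -1;

  struct sockaddr_un addr;
  memset(&addr, 0, sizeof(addr));
  addr.sun_family = AF_UNIX;
  // Leading '\0' selects the abstract namespace: no filesystem entry, and the
  // name disappears automatically when the socket is closed.
  snprintf(addr.sun_path + 1, sizeof(addr.sun_path) - 1, "ncclsock-%d-%lx",
           rank, (unsigned long)hash);
  socklen_t len = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + 1 +
                              strlen(addr.sun_path + 1));

  if (bind(fd, (struct sockaddr*)&addr, len) != 0) { close(fd); return -1; }
  return fd;
}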
@@ -853,7 +853,7 @@ ib_connect_check:
  // Print just the QPs for this dev
  if (comm->base.qps[q].devIndex == i)
    INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
-     comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
+     comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
      commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(),
      devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey);
  }