Fix UDS connection failure when using ncclCommSplit. Issue #1185
This commit is contained in:
Sylvain Jeaugey 2024-02-26 02:52:26 -08:00
parent b6475625fb
commit 48bb7fec79
6 changed files with 19 additions and 10 deletions

View File

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 20
NCCL_PATCH := 3
NCCL_PATCH := 5
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@ -305,7 +305,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
// cuMem UDS support
state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
// Make sure we create a unique UDS socket name
uint64_t randId;
NCCLCHECK(getRandomData(&randId, sizeof(randId)));
state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
@ -371,7 +374,10 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
// cuMem UDS support
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
// Make sure we create a unique UDS socket name
uint64_t randId;
NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail);
state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
}

View File

@ -1077,7 +1077,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
int localRanks = comm->topo->nodes[GPU].count;
int netNum = 0;
int net[MAXCHANNELS];
for (int c = 0; c < graph->nChannels; c++) {
if (graph->intra[c * localRanks] == comm->rank) {
net[netNum++] = graph->inter[c * 2];

View File

@ -826,6 +826,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
int nNodes = 1;
cpu_set_t affinitySave;
struct ncclTopoGraph ringGraph;
struct ncclTopoGraph treeGraph;
@ -865,6 +866,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
for (int i = 0; i < nranks; i++) {
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
ret = ncclInvalidUsage;
@ -879,7 +881,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
#include "cudawrap.h"
// MNNVL support
{
if (nNodes > 1) {
int cliqueSize = 0;
comm->MNNVL = 0;
// Determine the size of the MNNVL domain/clique
@ -1485,15 +1487,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
} else {
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
}
comm->cudaArch = cudaArch;
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));

View File

@ -1660,6 +1660,9 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
comm->proxyState->listenSock = sock;
comm->proxyState->peerAddresses = peerAddresses;
comm->proxyState->peerAddressesUDS = peerAddressesUDS;
// UDS support
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
// Seed the random number generator for UDS filename generation
struct timeval time;
gettimeofday(&time,NULL);
@ -1693,8 +1696,7 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
// UDS support
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d pidHash %lx", comm, comm->rank, comm->peerInfo[comm->rank].pidHash);
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, comm->peerInfo[comm->rank].pidHash, comm->abortFlag));
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
}

View File

@ -853,7 +853,7 @@ ib_connect_check:
// Print just the QPs for this dev
if (comm->base.qps[q].devIndex == i)
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(),
devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey);
}