From 559b70f86c190a0d8f67f0d7a0f2c9810dd1e8c7 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 23 Aug 2023 06:30:05 -0700 Subject: [PATCH] 2.18.5-1 Fix NVLS search (issue #931). Increase max IB NICs to 32. Fix inconsistent device ordering (issue #820). Try to use different devices for different GPUs in systems with more than one NIC per GFU. --- makefiles/version.mk | 2 +- src/graph/paths.cc | 4 +- src/graph/search.cc | 32 +++++++--- src/graph/topo.cc | 133 ++++++++++------------------------------ src/graph/topo.h | 10 +++ src/graph/xml.cc | 13 +++- src/transport/net_ib.cc | 2 +- 7 files changed, 80 insertions(+), 116 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index ba16223..df96037 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 18 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 29cea27..450ba65 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -730,9 +730,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. for (int c=0; cp2pnChannels; c++) { - int mirror = 0; - for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; - comm->p2pChannels[c] = mirror; + comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels); } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index bb83d5d..dd8896b 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -376,6 +376,28 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in int* localNets; NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count)); + // First add the preferred NICs + for (int g=0; gnodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + localNetCount = 0; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + for (int c = 0;; c++) { + int netId; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); + if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; + localNetCount++; + } + // Append NICs to list + for (int i=0; inodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; @@ -385,14 +407,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int n=0; nnodes[NET].count; n++) { if (paths[n].type == t) localNets[localNetCount++] = n; } - if (localNetCount == 0) continue; - // Shuffle by gpu NVML device number so that GPUs on the same PCI switch - // with multiple NICs don't use the same one as first choice. - for (int r=0; rnodes[GPU].nodes[g].gpu.dev % localNetCount; r++) { - int net0 = localNets[0]; - for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS) { if (graph->nChannels < netcount) { int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu)); + NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu)); if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu)); } } else { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 529eff4..cdcc066 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -679,126 +679,57 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy return ncclSuccess; } -static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) { +ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; int count = 0; - int* nets; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - for (int n=0; nnodes[NET].count; n++) { - struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g; - if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { - maxBw = path->bw; - minType = path->type; - if (type) *type = minType; + NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); + struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; + for (int i=0; inodes[resultType].count; i++) { + if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) { + maxBw = paths[i].bw; + minType = paths[i].type; + if (pathType) *pathType = minType; count = 0; } - if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; + if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; } - - *localNetMask = 0ULL; - for (int n=0; n= 64) return ncclInternalError; - *localNetMask |= 1ULL<nodes[GPU].count; - NCCLCHECK(ncclCalloc(&localNetMasks, ngpus)); - - // Fill localNetMasks for all GPUs. - for (int g=0; gnodes[GPU].nodes[gpu].gpu.dev; + if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); + net += channelId%(DIVUP(localNetCount,localGpuCount)); + *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; + free(localNets); + free(localGpus); + return ncclSuccess; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) { - int ngpus = system->nodes[GPU].count; - int* gpus; - NCCLCHECK(ncclCalloc(&gpus, ngpus)); - - // Find localNetMask which includes net with the most local GPUs. - int netLocalGpus = 0, minType = PATH_DIS; - uint64_t localNetMask = 0ULL; - for (int g=0; gnodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int id; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id)); + if (net == id) { + *gpuIndex = g; return ncclSuccess; } - gIndex++; - if (gIndex == netLocalGpus) { - gIndex = 0; - cId++; - } } - n = (n+1) % 64; } + *gpuIndex = -1; + return ncclSuccess; } /****************************/ diff --git a/src/graph/topo.h b/src/graph/topo.h index 281d16a..8951505 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -208,4 +208,14 @@ static float ncclTopoNVLinkBw(int cudaCompCap) { cudaCompCap >= 60 ? SM60_NVLINK_BW : SM80_NVLINK_BW; } + +// Mirror bits +static bool isPow2(int val) { + return (val & (val-1)) == 0; +} +static int mirrorBits(int val, int pow2) { + int mirror = 0; + for (int b=1, mb=(pow2>>1); b>=1) if (val & b) mirror |= mb; + return mirror; +} #endif diff --git a/src/graph/xml.cc b/src/graph/xml.cc index d441de0..ac862a4 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -576,7 +576,18 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* } } pciNode->parent = parent; - parent->subs[parent->nSubs++] = pciNode; + // Keep PCI sub devices ordered by PCI Bus ID (Issue #820) + int subIndex = parent->nSubs; + const char* newBusId; + NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId)); + for (int s=0; snSubs; s++) { + const char* busId; + NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId)); + if (strcmp(newBusId, busId) < 0) { subIndex = s; break; } + } + for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1]; + parent->subs[subIndex] = pciNode; + parent->nSubs++; } if (strcmp(parent->name, "pci") == 0) { NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 861fa57..421f0a1 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -66,7 +66,7 @@ struct userIbDev { uint16_t port_en; }; -#define MAX_IB_DEVS 16 +#define MAX_IB_DEVS 32 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; struct userIbDev userIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;