Fix NVLS search (issue #931).
Increase max IB NICs to 32.
Fix inconsistent device ordering (issue #820).
Try to use different devices for different GPUs in systems with
more than one NIC per GPU.
This commit is contained in:
Sylvain Jeaugey 2023-08-23 06:30:05 -07:00
parent 8ed014bae9
commit 559b70f86c
7 changed files with 80 additions and 116 deletions

View File

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 18
NCCL_PATCH := 3
NCCL_PATCH := 5
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@ -730,9 +730,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
// fill the whole space of nChannels. To do so we mirror the bits in the
// nChannels space.
for (int c=0; c<comm->p2pnChannels; c++) {
int mirror = 0;
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror;
comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
}
return ncclSuccess;
}

View File

@ -376,6 +376,28 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
int* localNets;
NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
// First add the preferred NICs
for (int g=0; g<system->nodes[GPU].count; g++) {
if (gpu != -1 && gpu != g) continue;
localNetCount = 0;
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
for (int c = 0;; c++) {
int netId;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
localNetCount++;
}
// Append NICs to list
for (int i=0; i<localNetCount; i++) {
int n = localNets[i];
int found = 0;
while (nets[found] != n && found<netCount) found++;
if (found == netCount) nets[netCount++] = n;
}
}
// Then add others satisfying typeInter
for (int t=0; t <= typeInter; t++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (gpu != -1 && gpu != g) continue;
@ -385,14 +407,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
for (int n=0; n<system->nodes[NET].count; n++) {
if (paths[n].type == t) localNets[localNetCount++] = n;
}
if (localNetCount == 0) continue;
// Shuffle by gpu NVML device number so that GPUs on the same PCI switch
// with multiple NICs don't use the same one as first choice.
for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
int net0 = localNets[0];
for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
localNets[localNetCount-1] = net0;
}
// Append NICs to list
for (int i=0; i<localNetCount; i++) {
int n = localNets[i];
@ -532,7 +546,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
if (graph->nChannels < netcount) {
int gpu;
NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
}
} else {

View File

@ -679,126 +679,57 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
return ncclSuccess;
}
static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
int minType = PATH_DIS;
float maxBw = 0;
int count = 0;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
maxBw = path->bw;
minType = path->type;
if (type) *type = minType;
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
for (int i=0; i<system->nodes[resultType].count; i++) {
if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
maxBw = paths[i].bw;
minType = paths[i].type;
if (pathType) *pathType = minType;
count = 0;
}
if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
}
*localNetMask = 0ULL;
for (int n=0; n<count; n++) {
if (nets[n] >= 64) return ncclInternalError;
*localNetMask |= 1ULL<<nets[n];
}
free(nets);
*localCount = count;
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
uint64_t* localNetMasks;
int ngpus = system->nodes[GPU].count;
NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
// Fill localNetMasks for all GPUs.
for (int g=0; g<ngpus; g++) {
NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
}
// Find GPUs which have the same mask as rank, i.e. share the same local Nets.
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
int netLocalGpus = 0, netLocalGpu = 0;
for (int g=0; g<ngpus; g++) {
if (localNetMasks[g] == localNetMasks[gpu]) {
if (g == gpu) netLocalGpu = netLocalGpus;
netLocalGpus++;
}
}
uint64_t localNetMask = localNetMasks[gpu];
free(localNetMasks);
if (localNetMask == 0) return ncclInternalError;
// Round robin on GPUs and channels
int gIndex = 0, cId = 0, n = 0;
while (1) {
if (1ULL << n & localNetMask) {
if (gIndex == netLocalGpu && cId == channelId) {
*id = n;
return ncclSuccess;
}
gIndex++;
if (gIndex == netLocalGpus) {
gIndex = 0;
cId++;
}
}
n = (n+1) % 64;
}
int* localNets;
int localNetCount;
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
int* localGpus;
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
net += channelId%(DIVUP(localNetCount,localGpuCount));
*id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
free(localNets);
free(localGpus);
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
int ngpus = system->nodes[GPU].count;
int* gpus;
NCCLCHECK(ncclCalloc(&gpus, ngpus));
// Find localNetMask which includes net with the most local GPUs.
int netLocalGpus = 0, minType = PATH_DIS;
uint64_t localNetMask = 0ULL;
for (int g=0; g<ngpus; g++) {
int type = PATH_DIS;
uint64_t mask;
NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
if ((1ULL<<net) & mask) {
if (type < minType) {
localNetMask = mask;
netLocalGpus = 0;
minType = type;
}
if (type == minType) {
if (localNetMask && mask != localNetMask) {
WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
free(gpus);
return ncclInternalError;
}
gpus[netLocalGpus] = g;
netLocalGpus++;
}
}
}
if (localNetMask == 0ULL) {
*gpuIndex = -1;
free(gpus);
return ncclSuccess;
}
// Round robin on GPUs and channels
int gIndex = 0, cId = 0, n = 0;
while (1) {
if (1ULL << n & localNetMask) {
if (n == net) {
*gpuIndex = gpus[gIndex];
free(gpus);
for (int c=0; c<MAXCHANNELS; c++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
int id;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
if (net == id) {
*gpuIndex = g;
return ncclSuccess;
}
gIndex++;
if (gIndex == netLocalGpus) {
gIndex = 0;
cId++;
}
}
n = (n+1) % 64;
}
*gpuIndex = -1;
return ncclSuccess;
}
/****************************/

View File

@ -208,4 +208,14 @@ static float ncclTopoNVLinkBw(int cudaCompCap) {
cudaCompCap >= 60 ? SM60_NVLINK_BW :
SM80_NVLINK_BW;
}
// Mirror bits
// Return true when val is an exact power of two (1, 2, 4, 8, ...).
// Zero and negative values are rejected: the bare (val & (val-1)) == 0 test
// would report 0 as a power of two, and evaluating val-1 for the most
// negative int is signed overflow. Checking val > 0 first avoids both.
static bool isPow2(int val) {
  return val > 0 && (val & (val-1)) == 0;
}
// Reverse the order of the low bits of val within the range [0, pow2).
// The bit tested in val walks upward from 1 while the bit set in the result
// walks downward from pow2/2, so the lowest bit of val becomes the highest
// bit of the result. Bits of val at or above pow2 are never examined.
static int mirrorBits(int val, int pow2) {
  int reversed = 0;
  int srcBit = 1;
  int dstBit = pow2 >> 1;
  while (srcBit < pow2) {
    if ((val & srcBit) != 0) {
      reversed |= dstBit;
    }
    srcBit <<= 1;
    dstBit >>= 1;
  }
  return reversed;
}
#endif

View File

@ -576,7 +576,18 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
}
}
pciNode->parent = parent;
parent->subs[parent->nSubs++] = pciNode;
// Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
int subIndex = parent->nSubs;
const char* newBusId;
NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
for (int s=0; s<parent->nSubs; s++) {
const char* busId;
NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId));
if (strcmp(newBusId, busId) < 0) { subIndex = s; break; }
}
for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
parent->subs[subIndex] = pciNode;
parent->nSubs++;
}
if (strcmp(parent->name, "pci") == 0) {
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));

View File

@ -66,7 +66,7 @@ struct userIbDev {
uint16_t port_en;
};
#define MAX_IB_DEVS 16
#define MAX_IB_DEVS 32
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
struct userIbDev userIbDevs[MAX_IB_DEVS];
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;