2.18.5-1

Fix NVLS search (issue #931). Increase max IB NICs to 32. Fix inconsistent device ordering (issue #820). Try to use different devices for different GPUs in systems with more than one NIC per GPU.
2023-08-23 06:30:05 -07:00 · 2023-08-23 06:30:05 -07:00 · 559b70f86c
commit 559b70f86c
parent 8ed014bae9
7 changed files with 80 additions and 116 deletions
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 18
-NCCL_PATCH   := 3
+NCCL_PATCH   := 5
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@ -730,9 +730,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  // fill the whole space of nChannels. To do so we mirror the bits in the
  // nChannels space.
  for (int c=0; c<comm->p2pnChannels; c++) {
-    int mirror = 0;
-    for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
-    comm->p2pChannels[c] = mirror;
+    comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
  }
  return ncclSuccess;
 }
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@ -376,6 +376,28 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
  int* localNets;
  NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));

+  // First add the preferred NICs
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (gpu != -1 && gpu != g) continue;
+    localNetCount = 0;
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    for (int c = 0;; c++) {
+      int netId;
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
+      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
+      if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
+      localNetCount++;
+    }
+    // Append NICs to list
+    for (int i=0; i<localNetCount; i++) {
+      int n = localNets[i];
+      int found = 0;
+      while (nets[found] != n && found<netCount) found++;
+      if (found == netCount) nets[netCount++] = n;
+    }
+  }
+
+  // Then add others satisfying typeInter
  for (int t=0; t <= typeInter; t++) {
    for (int g=0; g<system->nodes[GPU].count; g++) {
      if (gpu != -1 && gpu != g) continue;
@ -385,14 +407,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
      for (int n=0; n<system->nodes[NET].count; n++) {
        if (paths[n].type == t) localNets[localNetCount++] = n;
      }
-      if (localNetCount == 0) continue;
-      // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
-      // with multiple NICs don't use the same one as first choice.
-      for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
-        int net0 = localNets[0];
-        for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
-        localNets[localNetCount-1] = net0;
-      }
      // Append NICs to list
      for (int i=0; i<localNetCount; i++) {
        int n = localNets[i];
@ -532,7 +546,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
      if (graph->nChannels < netcount) {
        int gpu;
-        NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
+        NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
      }
    } else {
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@ -679,126 +679,57 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  return ncclSuccess;
 }

-static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
+ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
  int minType = PATH_DIS;
  float maxBw = 0;
  int count = 0;
-  int* nets;
-  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-  for (int n=0; n<system->nodes[NET].count; n++) {
-    struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
-    if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
-      maxBw = path->bw;
-      minType = path->type;
-      if (type) *type = minType;
+  NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
+  struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
+  for (int i=0; i<system->nodes[resultType].count; i++) {
+    if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
+      maxBw = paths[i].bw;
+      minType = paths[i].type;
+      if (pathType) *pathType = minType;
      count = 0;
    }
-    if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
+    if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
  }
-
-  *localNetMask = 0ULL;
-  for (int n=0; n<count; n++) {
-    if (nets[n] >= 64) return ncclInternalError;
-    *localNetMask |= 1ULL<<nets[n];
-  }
-  free(nets);
+  *localCount = count;
  return ncclSuccess;
 }

 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
-  uint64_t* localNetMasks;
-  int ngpus = system->nodes[GPU].count;
-  NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
-
-  // Fill localNetMasks for all GPUs.
-  for (int g=0; g<ngpus; g++) {
-    NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
-  }
-
-  // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
  int gpu;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
-  int netLocalGpus = 0, netLocalGpu = 0;
-  for (int g=0; g<ngpus; g++) {
-    if (localNetMasks[g] == localNetMasks[gpu]) {
-      if (g == gpu) netLocalGpu = netLocalGpus;
-      netLocalGpus++;
-    }
-  }
-  uint64_t localNetMask = localNetMasks[gpu];
-  free(localNetMasks);
-  if (localNetMask == 0) return ncclInternalError;
-
-  // Round robin on GPUs and channels
-  int gIndex = 0, cId = 0, n = 0;
-  while (1) {
-    if (1ULL << n & localNetMask) {
-      if (gIndex == netLocalGpu && cId == channelId) {
-        *id = n;
-        return ncclSuccess;
-      }
-      gIndex++;
-      if (gIndex == netLocalGpus) {
-        gIndex = 0;
-        cId++;
-      }
-    }
-    n = (n+1) % 64;
-  }
+  int* localNets;
+  int localNetCount;
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
+  int* localGpus;
+  int localGpuCount;
+  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
+  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
+  if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
+  net += channelId%(DIVUP(localNetCount,localGpuCount));
+  *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
+  free(localNets);
+  free(localGpus);
+  return ncclSuccess;
 }

 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
-  int ngpus = system->nodes[GPU].count;
-  int* gpus;
-  NCCLCHECK(ncclCalloc(&gpus, ngpus));
-
-  // Find localNetMask which includes net with the most local GPUs.
-  int netLocalGpus = 0, minType = PATH_DIS;
-  uint64_t localNetMask = 0ULL;
-  for (int g=0; g<ngpus; g++) {
-    int type = PATH_DIS;
-    uint64_t mask;
-    NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
-    if ((1ULL<<net) & mask) {
-      if (type < minType) {
-        localNetMask = mask;
-        netLocalGpus = 0;
-        minType = type;
-      }
-      if (type == minType) {
-        if (localNetMask && mask != localNetMask) {
-          WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
-          free(gpus);
-          return ncclInternalError;
-        }
-        gpus[netLocalGpus] = g;
-        netLocalGpus++;
-      }
-    }
-  }
-  if (localNetMask == 0ULL) {
-    *gpuIndex = -1;
-    free(gpus);
-    return ncclSuccess;
-  }
-
-  // Round robin on GPUs and channels
-  int gIndex = 0, cId = 0, n = 0;
-  while (1) {
-    if (1ULL << n & localNetMask) {
-      if (n == net) {
-        *gpuIndex = gpus[gIndex];
-        free(gpus);
+  for (int c=0; c<MAXCHANNELS; c++) {
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+      int id;
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
+      if (net == id) {
+        *gpuIndex = g;
        return ncclSuccess;
      }
-      gIndex++;
-      if (gIndex == netLocalGpus) {
-        gIndex = 0;
-        cId++;
-      }
    }
-    n = (n+1) % 64;
  }
+  *gpuIndex = -1;
+  return ncclSuccess;
 }

 /****************************/
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@ -208,4 +208,14 @@ static float ncclTopoNVLinkBw(int cudaCompCap) {
    cudaCompCap >= 60 ? SM60_NVLINK_BW :
    SM80_NVLINK_BW;
 }
+
+// Mirror bits: helpers to reverse the low bits of an index within a range of size pow2.
+static bool isPow2(int val) {  // true when val has at most one bit set, so 0 is also accepted
+  return (val & (val-1)) == 0;
+}
+static int mirrorBits(int val, int pow2) {  // pow2 is not validated here; the name suggests a power of two is expected
+  int mirror = 0;
+  for (int b=1, mb=(pow2>>1); b<pow2; b<<=1, mb>>=1) if (val & b) mirror |= mb;  // b walks up from bit 0 while mb walks down from pow2>>1
+  return mirror;  // bits of val at or above pow2 are never tested, so they are dropped
+}
 #endif
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@ -576,7 +576,18 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
      }
    }
    pciNode->parent = parent;
-    parent->subs[parent->nSubs++] = pciNode;
+    // Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
+    int subIndex = parent->nSubs;
+    const char* newBusId;
+    NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
+    for (int s=0; s<parent->nSubs; s++) {
+      const char* busId;
+      NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId));
+      if (strcmp(newBusId, busId) < 0) { subIndex = s; break; }
+    }
+    for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
+    parent->subs[subIndex] = pciNode;
+    parent->nSubs++;
  }
  if (strcmp(parent->name, "pci") == 0) {
    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@ -66,7 +66,7 @@ struct userIbDev {
  uint16_t port_en;
 };

-#define MAX_IB_DEVS 16
+#define MAX_IB_DEVS 32
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
 struct userIbDev userIbDevs[MAX_IB_DEVS];
 pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;