2.18.5-1

Fix NVLS search (issue #931). Increase max IB NICs to 32. Fix inconsistent device ordering (issue #820). Try to use different devices for different GPUs in systems with more than one NIC per GPU.
2023-08-23 06:30:05 -07:00 · 2023-08-23 06:30:05 -07:00 · 559b70f86c
commit 559b70f86c
parent 8ed014bae9
7 changed files with 80 additions and 116 deletions
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 18
-NCCL_PATCH   := 3
+NCCL_PATCH   := 5
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@ -730,9 +730,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  // fill the whole space of nChannels. To do so we mirror the bits in the
  // nChannels space.
  for (int c=0; c<comm->p2pnChannels; c++) {
-    int mirror = 0;
+    comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
    for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
    comm->p2pChannels[c] = mirror;
  }
  return ncclSuccess;
 }
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@ -376,6 +376,28 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
  int* localNets;
  NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
  // First add the preferred NICs
  for (int g=0; g<system->nodes[GPU].count; g++) {
    if (gpu != -1 && gpu != g) continue;
    localNetCount = 0;
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    for (int c = 0;; c++) {
      int netId;
      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
      if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
      localNetCount++;
    }
    // Append NICs to list
    for (int i=0; i<localNetCount; i++) {
      int n = localNets[i];
      int found = 0;
      while (nets[found] != n && found<netCount) found++;
      if (found == netCount) nets[netCount++] = n;
    }
  }
  // Then add others satisfying typeInter
  for (int t=0; t <= typeInter; t++) {
    for (int g=0; g<system->nodes[GPU].count; g++) {
      if (gpu != -1 && gpu != g) continue;
@ -385,14 +407,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
      for (int n=0; n<system->nodes[NET].count; n++) {
        if (paths[n].type == t) localNets[localNetCount++] = n;
      }
      if (localNetCount == 0) continue;
      // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
      // with multiple NICs don't use the same one as first choice.
      for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
        int net0 = localNets[0];
        for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
        localNets[localNetCount-1] = net0;
      }
      // Append NICs to list
      for (int i=0; i<localNetCount; i++) {
        int n = localNets[i];
@ -532,7 +546,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
      if (graph->nChannels < netcount) {
        int gpu;
-        NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
+        NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
      }
    } else {
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@ -679,128 +679,59 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  return ncclSuccess;
 }
-static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
+ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
  int minType = PATH_DIS;
  float maxBw = 0;
  int count = 0;
-  int* nets;
+  NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
-  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+  struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
-  for (int n=0; n<system->nodes[NET].count; n++) {
+  for (int i=0; i<system->nodes[resultType].count; i++) {
-    struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
+    if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
-    if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
+      maxBw = paths[i].bw;
-      maxBw = path->bw;
+      minType = paths[i].type;
-      minType = path->type;
+      if (pathType) *pathType = minType;
      if (type) *type = minType;
      count = 0;
    }
-    if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
+    if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
  }
-
+  *localCount = count;
  *localNetMask = 0ULL;
  for (int n=0; n<count; n++) {
    if (nets[n] >= 64) return ncclInternalError;
    *localNetMask |= 1ULL<<nets[n];
  }
  free(nets);
  return ncclSuccess;
 }
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
  uint64_t* localNetMasks;
  int ngpus = system->nodes[GPU].count;
  NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
  // Fill localNetMasks for all GPUs.
  for (int g=0; g<ngpus; g++) {
    NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
  }
  // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
  int gpu;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
-  int netLocalGpus = 0, netLocalGpu = 0;
+  int* localNets;
-  for (int g=0; g<ngpus; g++) {
+  int localNetCount;
-    if (localNetMasks[g] == localNetMasks[gpu]) {
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
-      if (g == gpu) netLocalGpu = netLocalGpus;
+  int* localGpus;
-      netLocalGpus++;
+  int localGpuCount;
-    }
+  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
-  }
+  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
-  uint64_t localNetMask = localNetMasks[gpu];
+  if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
-  free(localNetMasks);
+  net += channelId%(DIVUP(localNetCount,localGpuCount));
-  if (localNetMask == 0) return ncclInternalError;
+  *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
-
+  free(localNets);
-  // Round robin on GPUs and channels
+  free(localGpus);
  int gIndex = 0, cId = 0, n = 0;
  while (1) {
    if (1ULL << n & localNetMask) {
      if (gIndex == netLocalGpu && cId == channelId) {
        *id = n;
  return ncclSuccess;
 }
      gIndex++;
      if (gIndex == netLocalGpus) {
        gIndex = 0;
        cId++;
      }
    }
    n = (n+1) % 64;
  }
 }
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
-  int ngpus = system->nodes[GPU].count;
+  for (int c=0; c<MAXCHANNELS; c++) {
-  int* gpus;
+    for (int g=0; g<system->nodes[GPU].count; g++) {
-  NCCLCHECK(ncclCalloc(&gpus, ngpus));
+      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-
+      int id;
-  // Find localNetMask which includes net with the most local GPUs.
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
-  int netLocalGpus = 0, minType = PATH_DIS;
+      if (net == id) {
-  uint64_t localNetMask = 0ULL;
+        *gpuIndex = g;
-  for (int g=0; g<ngpus; g++) {
+        return ncclSuccess;
    int type = PATH_DIS;
    uint64_t mask;
    NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
    if ((1ULL<<net) & mask) {
      if (type < minType) {
        localNetMask = mask;
        netLocalGpus = 0;
        minType = type;
      }
      if (type == minType) {
        if (localNetMask && mask != localNetMask) {
          WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
          free(gpus);
          return ncclInternalError;
        }
        gpus[netLocalGpus] = g;
        netLocalGpus++;
      }
    }
  }
  if (localNetMask == 0ULL) {
  *gpuIndex = -1;
    free(gpus);
  return ncclSuccess;
 }
  // Round robin on GPUs and channels
  int gIndex = 0, cId = 0, n = 0;
  while (1) {
    if (1ULL << n & localNetMask) {
      if (n == net) {
        *gpuIndex = gpus[gIndex];
        free(gpus);
        return ncclSuccess;
      }
      gIndex++;
      if (gIndex == netLocalGpus) {
        gIndex = 0;
        cId++;
      }
    }
    n = (n+1) % 64;
  }
 }
 /****************************/
 /* External query functions */
 /****************************/
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@ -208,4 +208,14 @@ static float ncclTopoNVLinkBw(int cudaCompCap) {
    cudaCompCap >= 60 ? SM60_NVLINK_BW :
    SM80_NVLINK_BW;
 }
 // Mirror bits
 // Returns true when val is a positive power of two (1, 2, 4, ...).
 // Zero and negative values are rejected: the bare (val & (val-1)) == 0 test
 // would accept 0, and evaluating val-1 for INT_MIN is signed overflow, so
 // the val > 0 guard is checked first.
 static bool isPow2(int val) {
  return val > 0 && (val & (val-1)) == 0;
 }
 // Mirrors the bits of val inside the range [0, pow2): the bit with weight 1
 // is moved to weight pow2/2, weight 2 to pow2/4, and so on. Bits of val at
 // or above pow2 are ignored. With pow2 <= 1 the loop never runs and the
 // result is 0.
 static int mirrorBits(int val, int pow2) {
  int reversed = 0;
  int srcBit = 1;           // bit currently inspected in val
  int dstBit = pow2 >> 1;   // where that bit lands in the result
  while (srcBit < pow2) {
    if ((val & srcBit) != 0) reversed |= dstBit;
    srcBit <<= 1;
    dstBit >>= 1;
  }
  return reversed;
 }
 #endif
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@ -576,7 +576,18 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
      }
    }
    pciNode->parent = parent;
-    parent->subs[parent->nSubs++] = pciNode;
+    // Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
    int subIndex = parent->nSubs;
    const char* newBusId;
    NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
    for (int s=0; s<parent->nSubs; s++) {
      const char* busId;
      NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId));
      if (strcmp(newBusId, busId) < 0) { subIndex = s; break; }
    }
    for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
    parent->subs[subIndex] = pciNode;
    parent->nSubs++;
  }
  if (strcmp(parent->name, "pci") == 0) {
    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@ -66,7 +66,7 @@ struct userIbDev {
  uint16_t port_en;
 };
-#define MAX_IB_DEVS 16
+#define MAX_IB_DEVS 32
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
 struct userIbDev userIbDevs[MAX_IB_DEVS];
 pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;