From 559b70f86c190a0d8f67f0d7a0f2c9810dd1e8c7 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Wed, 23 Aug 2023 06:30:05 -0700
Subject: [PATCH] 2.18.5-1

Fix NVLS search (issue #931).
Increase max IB NICs to 32.
Fix inconsistent device ordering (issue #820).
Try to use different devices for different GPUs in systems with
more than one NIC per GFU.
---
 makefiles/version.mk    |   2 +-
 src/graph/paths.cc      |   4 +-
 src/graph/search.cc     |  32 +++++++---
 src/graph/topo.cc       | 133 ++++++++++------------------------------
 src/graph/topo.h        |  10 +++
 src/graph/xml.cc        |  13 +++-
 src/transport/net_ib.cc |   2 +-
 7 files changed, 80 insertions(+), 116 deletions(-)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index ba16223..df96037 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 18
-NCCL_PATCH   := 3
+NCCL_PATCH   := 5
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 29cea27..450ba65 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -730,9 +730,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
   // fill the whole space of nChannels. To do so we mirror the bits in the
   // nChannels space.
   for (int c=0; c<comm->p2pnChannels; c++) {
-    int mirror = 0;
-    for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
-    comm->p2pChannels[c] = mirror;
+    comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
   }
   return ncclSuccess;
 }
diff --git a/src/graph/search.cc b/src/graph/search.cc
index bb83d5d..dd8896b 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -376,6 +376,28 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
   int* localNets;
   NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
 
+  // First add the preferred NICs
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (gpu != -1 && gpu != g) continue;
+    localNetCount = 0;
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    for (int c = 0;; c++) {
+      int netId;
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
+      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
+      if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
+      localNetCount++;
+    }
+    // Append NICs to list
+    for (int i=0; i<localNetCount; i++) {
+      int n = localNets[i];
+      int found = 0;
+      while (nets[found] != n && found<netCount) found++;
+      if (found == netCount) nets[netCount++] = n;
+    }
+  }
+
+  // Then add others satisfying typeInter
   for (int t=0; t <= typeInter; t++) {
     for (int g=0; g<system->nodes[GPU].count; g++) {
       if (gpu != -1 && gpu != g) continue;
@@ -385,14 +407,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
       for (int n=0; n<system->nodes[NET].count; n++) {
         if (paths[n].type == t) localNets[localNetCount++] = n;
       }
-      if (localNetCount == 0) continue;
-      // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
-      // with multiple NICs don't use the same one as first choice.
-      for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
-        int net0 = localNets[0];
-        for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
-        localNets[localNetCount-1] = net0;
-      }
       // Append NICs to list
       for (int i=0; i<localNetCount; i++) {
         int n = localNets[i];
@@ -532,7 +546,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
     if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
       if (graph->nChannels < netcount) {
         int gpu;
-        NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
+        NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
         if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
       }
     } else {
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 529eff4..cdcc066 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -679,126 +679,57 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   return ncclSuccess;
 }
 
-static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
+ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
   int minType = PATH_DIS;
   float maxBw = 0;
   int count = 0;
-  int* nets;
-  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-  for (int n=0; n<system->nodes[NET].count; n++) {
-    struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
-    if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
-      maxBw = path->bw;
-      minType = path->type;
-      if (type) *type = minType;
+  NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
+  struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
+  for (int i=0; i<system->nodes[resultType].count; i++) {
+    if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
+      maxBw = paths[i].bw;
+      minType = paths[i].type;
+      if (pathType) *pathType = minType;
       count = 0;
     }
-    if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
+    if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
   }
-
-  *localNetMask = 0ULL;
-  for (int n=0; n<count; n++) {
-    if (nets[n] >= 64) return ncclInternalError;
-    *localNetMask |= 1ULL<<nets[n];
-  }
-  free(nets);
+  *localCount = count;
   return ncclSuccess;
 }
 
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
-  uint64_t* localNetMasks;
-  int ngpus = system->nodes[GPU].count;
-  NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
-
-  // Fill localNetMasks for all GPUs.
-  for (int g=0; g<ngpus; g++) {
-    NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
-  }
-
-  // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
   int gpu;
   NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
-  int netLocalGpus = 0, netLocalGpu = 0;
-  for (int g=0; g<ngpus; g++) {
-    if (localNetMasks[g] == localNetMasks[gpu]) {
-      if (g == gpu) netLocalGpu = netLocalGpus;
-      netLocalGpus++;
-    }
-  }
-  uint64_t localNetMask = localNetMasks[gpu];
-  free(localNetMasks);
-  if (localNetMask == 0) return ncclInternalError;
-
-  // Round robin on GPUs and channels
-  int gIndex = 0, cId = 0, n = 0;
-  while (1) {
-    if (1ULL << n & localNetMask) {
-      if (gIndex == netLocalGpu && cId == channelId) {
-        *id = n;
-        return ncclSuccess;
-      }
-      gIndex++;
-      if (gIndex == netLocalGpus) {
-        gIndex = 0;
-        cId++;
-      }
-    }
-    n = (n+1) % 64;
-  }
+  int* localNets;
+  int localNetCount;
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
+  int* localGpus;
+  int localGpuCount;
+  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
+  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
+  if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
+  net += channelId%(DIVUP(localNetCount,localGpuCount));
+  *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
+  free(localNets);
+  free(localGpus);
+  return ncclSuccess;
 }
 
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
-  int ngpus = system->nodes[GPU].count;
-  int* gpus;
-  NCCLCHECK(ncclCalloc(&gpus, ngpus));
-
-  // Find localNetMask which includes net with the most local GPUs.
-  int netLocalGpus = 0, minType = PATH_DIS;
-  uint64_t localNetMask = 0ULL;
-  for (int g=0; g<ngpus; g++) {
-    int type = PATH_DIS;
-    uint64_t mask;
-    NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
-    if ((1ULL<<net) & mask) {
-      if (type < minType) {
-        localNetMask = mask;
-        netLocalGpus = 0;
-        minType = type;
-      }
-      if (type == minType) {
-        if (localNetMask && mask != localNetMask) {
-          WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
-          free(gpus);
-          return ncclInternalError;
-        }
-        gpus[netLocalGpus] = g;
-        netLocalGpus++;
-      }
-    }
-  }
-  if (localNetMask == 0ULL) {
-    *gpuIndex = -1;
-    free(gpus);
-    return ncclSuccess;
-  }
-
-  // Round robin on GPUs and channels
-  int gIndex = 0, cId = 0, n = 0;
-  while (1) {
-    if (1ULL << n & localNetMask) {
-      if (n == net) {
-        *gpuIndex = gpus[gIndex];
-        free(gpus);
+  for (int c=0; c<MAXCHANNELS; c++) {
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+      int id;
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
+      if (net == id) {
+        *gpuIndex = g;
         return ncclSuccess;
       }
-      gIndex++;
-      if (gIndex == netLocalGpus) {
-        gIndex = 0;
-        cId++;
-      }
     }
-    n = (n+1) % 64;
   }
+  *gpuIndex = -1;
+  return ncclSuccess;
 }
 
 /****************************/
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 281d16a..8951505 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -208,4 +208,14 @@ static float ncclTopoNVLinkBw(int cudaCompCap) {
     cudaCompCap >= 60 ? SM60_NVLINK_BW :
     SM80_NVLINK_BW;
 }
+
+// Mirror bits
+static bool isPow2(int val) {
+  return (val & (val-1)) == 0;
+}
+static int mirrorBits(int val, int pow2) {
+  int mirror = 0;
+  for (int b=1, mb=(pow2>>1); b<pow2; b<<=1, mb>>=1) if (val & b) mirror |= mb;
+  return mirror;
+}
 #endif
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index d441de0..ac862a4 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -576,7 +576,18 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
       }
     }
     pciNode->parent = parent;
-    parent->subs[parent->nSubs++] = pciNode;
+    // Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
+    int subIndex = parent->nSubs;
+    const char* newBusId;
+    NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
+    for (int s=0; s<parent->nSubs; s++) {
+      const char* busId;
+      NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId));
+      if (strcmp(newBusId, busId) < 0) { subIndex = s; break; }
+    }
+    for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
+    parent->subs[subIndex] = pciNode;
+    parent->nSubs++;
   }
   if (strcmp(parent->name, "pci") == 0) {
     NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 861fa57..421f0a1 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -66,7 +66,7 @@ struct userIbDev {
   uint16_t port_en;
 };
 
-#define MAX_IB_DEVS 16
+#define MAX_IB_DEVS 32
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
 struct userIbDev userIbDevs[MAX_IB_DEVS];
 pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;