2.9.8-1

Fix memory leaks. Fix crash in bootstrap error case. Fix CollNet clean-up issue. Make PCI switch vendor/device optional for XML injection. Add support for nvidia-peermem module.
2021-04-26 14:24:50 -07:00 · 2021-04-26 14:24:50 -07:00 · ca8485b0d0
commit ca8485b0d0
parent a46ea10583
9 changed files with 35 additions and 20 deletions
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 9
-NCCL_PATCH   := 6
+NCCL_PATCH   := 8
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -515,10 +515,11 @@ ncclResult_t bootstrapClose(void* commState) {

 ncclResult_t bootstrapAbort(void* commState) {
  struct extState* state = (struct extState*)commState;
-  close(state->extListenFd);
-  close(state->extRingSendFd);
-  close(state->extRingRecvFd);
-  state->allocState->stop = 2;
+  if (commState == NULL) return ncclSuccess;
+  if (state->extListenFd) close(state->extListenFd);
+  if (state->extRingSendFd) close(state->extRingSendFd);
+  if (state->extRingRecvFd) close(state->extRingRecvFd);
+  if (state->allocState) state->allocState->stop = 2;
  free(state->peerCommAddresses);
  free(state->peerAllocAddresses);
  free(state);
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -212,6 +212,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
    sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
    INFO(NCCL_GRAPH, "%s", line);
  }
+  free(heads);
  return ncclSuccess;
 }

--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -764,7 +764,7 @@ search:
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
-    printf("[%d %d]", graph->inter[0], graph->inter[1]);
+    printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
    printf("\n");
  }
 #endif
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -469,26 +469,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
  }
+  ncclDebugNoWarn = 1;
  NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
  if (index == -1) {
-    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
-    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
+    if (path == NULL) getPciPath(busId, &path);
+    if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor");
  }
  NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
  if (index == -1) {
-    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
-    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
+    if (path == NULL) getPciPath(busId, &path);
+    if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device");
  }
  NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
  if (index == -1) {
-    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
-    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
+    if (path == NULL) getPciPath(busId, &path);
+    if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor");
  }
  NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
  if (index == -1) {
-    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
-    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
+    if (path == NULL) getPciPath(busId, &path);
+    if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device");
  }
+  ncclDebugNoWarn = 0;
  NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
  if (index == -1) {
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
--- a/src/group.cc
+++ b/src/group.cc
@@ -164,6 +164,7 @@ ncclResult_t ncclGroupEnd() {
  for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
  ncclResult_t ret = ncclGroupError;
  int usingCudaGraphAll = -1;
+  cudaGraph_t* graphs = NULL;
  if (ret != ncclSuccess) goto group_cleanup;

  /* Launch async ncclCommInitRank */
@@ -307,7 +308,6 @@ sched_delta:
   */

  // Check whether we are in cuda graph mode
-  cudaGraph_t* graphs;
  NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
@@ -407,5 +407,6 @@ end:
  ncclGroupError = ncclSuccess;
  ncclGroupIndex = 0;
  CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
+  if (graphs) free(graphs);
  return ret;
 }
--- a/src/init.cc
+++ b/src/init.cc
@@ -760,6 +760,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
          collNetSetupFail = 1;
      }
    }
+    free(heads);
    // Verify CollNet setup across ranks
    NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
    if (comm->collNetSupport) {
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -241,10 +241,16 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
    for (int r=0; r<comm->nChannels; r++) {
      struct ncclChannel* channel = comm->channels+r;
      struct ncclPeer* peer = channel->peers+nranks;
-      if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources));
-      if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources));
-      peer->send->transportResources = NULL; // avoid double free
-      peer->recv->transportResources = NULL; // avoid double free
+      for (int b=0; b<NCCL_MAX_CONNS; b++) {
+        struct ncclConnector* send = peer->send + b;
+        if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+        send->transportResources = NULL; // avoid double free
+      }
+      for (int b=0; b<NCCL_MAX_CONNS; b++) {
+        struct ncclConnector* recv = peer->recv + b;
+        if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+        recv->transportResources = NULL; // avoid double free
+      }
    }
    // Set support to 0
    comm->collNetSupport = 0;
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -220,7 +220,10 @@ ncclResult_t ncclIbDevices(int* ndev) {
 ncclResult_t ncclIbGdrSupport(int ibDev) {
  static int moduleLoaded = -1;
  if (moduleLoaded == -1) {
-    moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
+    // Check for the nv_peer_mem module being loaded
+    moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
+                    // Also support the new nvidia-peermem module
+                    (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == -1)) ? 0 : 1;
  }
  if (moduleLoaded == 0) return ncclSystemError;
  return ncclSuccess;