2.8.4-1

Fix hang in corner cases of alltoallv using point to point send/recv. Harmonize error messages. Fix missing NVTX section in the license. Update README.
2021-02-09 15:34:08 -08:00 · 2021-02-09 15:34:08 -08:00 · 911d61f214
commit 911d61f214
parent 3996562690
19 changed files with 53 additions and 57 deletions
--- a/LICENSE.txt
+++ b/LICENSE.txt
@ -1,5 +1,5 @@

- Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
@ -29,3 +29,11 @@
 The U.S. Department of Energy funded the development of this software
 under subcontract 7078610 with Lawrence Berkeley National Laboratory.

+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
--- a/README.md
+++ b/README.md
@ -1,29 +1,13 @@
 # NCCL

-Optimized primitives for collective multi-GPU communication.
+Optimized primitives for inter-GPU communication.

 ## Introduction

-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.

 For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).

-## What's inside
-
-At present, the library implements the following collectives operations:
-
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
-
-These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
-
-## Requirements
-
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-
 ## Build

 Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
@ -89,4 +73,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>

 ## Copyright

-All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 8
-NCCL_PATCH   := 3
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@ -72,7 +72,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
  int recvSize;
  NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
  if (recvSize > size) {
-    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
    return ncclInternalError;
  }
  NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
@ -244,7 +244,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
 void* ncclRemoteMemAllocationService(void* args) {
  struct remAllocState* state = (struct remAllocState *) args;
  if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
-    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
+    WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
  }

  // Prepare poll descriptor
@ -490,7 +490,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
 ncclResult_t bootstrapClose(void* commState) {
  struct extState* state = (struct extState*)commState;
  if (state->unexpectedConnections != NULL) {
-    WARN("Unexpected connections are not empty.\n");
+    WARN("Unexpected connections are not empty");
    return ncclInternalError;
  }
  close(state->extListenFd);
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@ -350,7 +350,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
    case ncclPatternRingTwice:
      info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
    default:
-      WARN("Unknown pattern %d\n", info->pattern);
+      WARN("Unknown pattern %d", info->pattern);
      return ncclInternalError;
  }
  return ncclSuccess;
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@ -115,7 +115,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
  int x = 0;
  while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
  if (x == NCCL_MAX_TREE_ARITY) {
-    WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
+    WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
    return ncclInternalError;
  }
  tree->down[x] = indexes[d];
@ -213,7 +213,7 @@ int ncclMinNchannels() {
  if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
  if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
  if (minNchannels > MAXCHANNELS) {
-    WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
+    WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
    minNchannels = MAXCHANNELS;
  }
  if (minNchannels < 0) minNchannels = 0;
@ -225,7 +225,7 @@ int ncclMaxNchannels() {
  if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
  if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
  if (maxNchannels < 1) {
-    WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
+    WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
    maxNchannels = 1;
  }
  return maxNchannels;
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@ -25,7 +25,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
      return ncclSuccess;
    }
  }
-  WARN("Could not find node of type %d id %lx\n", t, id);
+  WARN("Could not find node of type %d id %lx", t, id);
  return ncclInternalError;
 }

--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@ -55,7 +55,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
      return ncclSuccess;
    }
  }
-  WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
+  WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
  return ncclInternalError;
 }

@ -188,7 +188,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
      return ncclSuccess;
    }
  }
-  WARN("Could not find gpu rank %d\n", rank);
+  WARN("Could not find gpu rank %d", rank);
  return ncclInternalError;
 }

@ -199,7 +199,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
      return ncclSuccess;
    }
  }
-  WARN("Could not find net id %lx\n", id);
+  WARN("Could not find net id %lx", id);
  return ncclInternalError;
 }

@ -788,7 +788,7 @@ done:
  }

  if (graph->nChannels == 0 && graph->collNet == 0) {
-    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
    graph->inter[0] = graph->inter[1] = 0;
    graph->speedIntra = graph->speedInter = 0.1;
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@ -97,7 +97,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*

 ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
-    WARN("Error : tried to create too many nodes of type %d\n", type);
+    WARN("Error : tried to create too many nodes of type %d", type);
    return ncclInternalError;
  }
  struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@ -421,7 +421,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
    if (gpu == NULL) {
-      WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+      WARN("Add NVLink error : could not find GPU %lx", pBusId);
      return ncclInternalError;
    }
    int count;
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@ -62,7 +62,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
    if (c == '=') {
      ptr[o] = '\0';
      if (value == NULL) {
-        WARN("XML Parse : Unexpected value with name %s\n", ptr);
+        WARN("XML Parse : Unexpected value with name %s", ptr);
        return ncclInternalError;
      }
      return xmlGetValue(file, value, last);
@ -128,7 +128,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
    // Re-read the name, we got '/' in the first call
    NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
    if (c != '>') {
-      WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+      WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
      return ncclInternalError;
    }
    return ncclSuccess;
@ -141,7 +141,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
  while (c == ' ') {
    NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
    if (a == MAX_ATTR_COUNT) {
-      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
      // Actually we need to still consume the extra attributes so we have an extra one.
    } else a++;
  }
@ -169,7 +169,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
  if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
  while (1) {
    if (xml->maxIndex == MAX_NODES) {
-      WARN("Error : XML parser is limited to 1024 nodes\n");
+      WARN("Error : XML parser is limited to 1024 nodes");
      return ncclInternalError;
    }
    struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@ -360,7 +360,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
  char strValue[MAX_STR_LEN];
  NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
  if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
-  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
  return ncclSuccess;
 }

@ -592,7 +592,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
    int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;

    if (maxNvLinks > 0 && nvmlDev == NULL) {
-      WARN("No NVML device handle. Skipping nvlink detection.\n");
+      WARN("No NVML device handle. Skipping nvlink detection.");
      maxNvLinks = 0;
    }

--- a/src/group.cc
+++ b/src/group.cc
@ -201,7 +201,7 @@ ncclResult_t ncclGroupEnd() {
    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
      int err = pthread_join(ncclGroupThreads[i], NULL);
      if (err != 0) {
-        WARN("Error waiting for pthread_join : %s\n", strerror(errno));
+        WARN("Error waiting for pthread_join : %s", strerror(errno));
        return ncclSystemError;
      }
      NCCLCHECKGOTO(args->ret, ret, end);
@ -233,7 +233,7 @@ ncclResult_t ncclGroupEnd() {
        // schedule delta 0, +1, -1, +2, -2, ...
        // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
        for (int d=0; d<=nRanks/4; d++) {
-          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
+          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
          int index = 0;
          int delta = deltas[index];
 sched_delta:
@ -258,6 +258,10 @@ sched_delta:
              ssize_t sendbytes = totSendBytes-sendOffset;
              if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
              if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+              // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
+              // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
+              if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
+              if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
              if (sendbytes >= 0 || recvbytes >= 0) {
                NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
                      recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
--- a/src/include/debug.h
+++ b/src/include/debug.h
@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);

-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));

 // Let code temporarily downgrade WARN into INFO
 extern thread_local int ncclDebugNoWarn;
--- a/src/include/shm.h
+++ b/src/include/shm.h
@ -45,7 +45,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
  *shmPtr = ptr;
  return ncclSuccess;
 sysError:
-  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
+  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
 cudaError:
  if (fd != -1) close(fd);
  if (create) shm_unlink(shmname);
--- a/src/include/socket.h
+++ b/src/include/socket.h
@ -367,7 +367,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
  /* IPv4/IPv6 support */
  int family = remoteAddr->sa.sa_family;
  if (family != AF_INET && family != AF_INET6) {
-    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
+    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
    return ncclInternalError;
  }
  int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
--- a/src/init.cc
+++ b/src/init.cc
@ -581,7 +581,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  for (int i = 0; i < nranks; i++) {
    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
      return ncclInvalidUsage;
    }
  }
@ -878,7 +878,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);

-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);

  return ncclSuccess;
 cleanup:
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@ -67,10 +67,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
 }

 uint64_t getHash(const char* string, int n) {
-  // Based on DJB2, result = result * 33 + char
+  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) + string[c];
+    result = ((result << 5) + result) ^ string[c];
  }
  return result;
 }
--- a/src/proxy.cc
+++ b/src/proxy.cc
@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
  struct ncclProxyArgs* op = state->ops;
  while (op) {
    if (op->idle & OP_SEEN) {
-      WARN("Active list loop at element %ld\n", OP_INDEX(op));
+      WARN("Active list loop at element %ld", OP_INDEX(op));
    }
    op->idle |= OP_SEEN;
    printf("[%ld]", OP_INDEX(op));
@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
  struct ncclProxyArgs* free = state->pool;
  while (free) {
    if (free->idle & OP_SEEN) {
-      WARN("Free list loop at element %ld\n", OP_INDEX(free));
+      WARN("Free list loop at element %ld", OP_INDEX(free));
    }
    free->idle |= OP_SEEN;
    free = free->next;
@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
  while (p) {
    for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
      if ((p->elems[e].idle & OP_SEEN) == 0) {
-        WARN("Element %d of pool %d has been lost\n", e, i);
+        WARN("Element %d of pool %d has been lost", e, i);
        struct ncclProxyArgs* free = state->pool;
        printf("Free list ");
        while (free) {
@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
  struct ncclPeer* peerComm = args->channel->peers+peer;
  struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
  if (connector->transportComm == NULL) {
-    WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
+    WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
        type == proxyRecv ? "recv" : "send", peer, args->channel->id);
    return ncclInternalError;
  }
@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ
  while (nslots*state->slotSize < size) nslots *= 2;
  int s = (ptr-buff)/state->slotSize;
  if (s < 0 || s+nslots > state->nslots) {
-    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
+    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
    return ncclInternalError;
  }
  for (int i=0; i<nslots; i++) used[s+i] = 0;
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@ -84,7 +84,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
  snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
  char* p = realpath(devicePath, NULL);
  if (p == NULL) {
-    WARN("Could not find real path of %s", *devicePath);
+    WARN("Could not find real path of %s (%s)", devName, devicePath);
  } else {
    // Merge multi-port NICs into the same PCI device
    p[strlen(p)-1] = '0';
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@ -237,7 +237,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
    if (fd == -1) {
      // Could not find device vendor. This is handled silently so
      // we don't want to print an INFO error.
-      TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
+      TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno));
      goto end;
    }
    char vendor[7];