diff --git a/LICENSE.txt b/LICENSE.txt
index e318c66..bcd1867 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
 
- Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
@@ -29,3 +29,11 @@
 
 The U.S. Department of Energy funded the development of this software
 under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
diff --git a/README.md b/README.md
index 7f0a72f..a0ad9e2 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,13 @@
 # NCCL
 
-Optimized primitives for collective multi-GPU communication.
+Optimized primitives for inter-GPU communication.
 
 ## Introduction
 
-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
 
 For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
 
-## What's inside
-
-At present, the library implements the following collectives operations:
-
-- all-reduce
-- all-gather
-- reduce-scatter
-- reduce
-- broadcast
-
-These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
-
-## Requirements
-
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-
 ## Build
 
 Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
@@ -89,4 +73,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
 
 ## Copyright
 
-All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
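
Note on the "send/receive based communication pattern" wording in the README change above: NCCL 2.7 and later expose ncclSend/ncclRecv, and arbitrary patterns are built by grouping them. The sketch below is illustrative only (it is not part of this patch) and assumes a communicator and stream have already been created the usual way (ncclCommInitRank + cudaStreamCreate); error handling is omitted.

    // Minimal point-to-point sketch: each rank sends `count` floats to the next
    // rank and receives from the previous one.
    #include <nccl.h>

    ncclResult_t ringExchange(const float* sendbuff, float* recvbuff, size_t count,
                              int rank, int nranks, ncclComm_t comm, cudaStream_t stream) {
      int next = (rank + 1) % nranks;
      int prev = (rank - 1 + nranks) % nranks;
      ncclGroupStart();                // group both calls so they progress together
      ncclSend(sendbuff, count, ncclFloat, next, comm, stream);
      ncclRecv(recvbuff, count, ncclFloat, prev, comm, stream);
      return ncclGroupEnd();           // communication is enqueued on `stream`
    }
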
diff --git a/makefiles/version.mk b/makefiles/version.mk
index f2539c5..f64e8ad 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 8
-NCCL_PATCH   := 3
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index bd6ec99..6f682f6 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -72,7 +72,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
   int recvSize;
   NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
   if (recvSize > size) {
-    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
     return ncclInternalError;
   }
   NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
@@ -244,7 +244,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
 void* ncclRemoteMemAllocationService(void* args) {
   struct remAllocState* state = (struct remAllocState *) args;
   if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
-    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
+    WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
   }
 
   // Prepare poll descriptor
@@ -490,7 +490,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
 ncclResult_t bootstrapClose(void* commState) {
   struct extState* state = (struct extState*)commState;
   if (state->unexpectedConnections != NULL) {
-    WARN("Unexpected connections are not empty.\n");
+    WARN("Unexpected connections are not empty");
     return ncclInternalError;
   }
   close(state->extListenFd);
diff --git a/src/enqueue.cc b/src/enqueue.cc
index a427bd9..4137f61 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -350,7 +350,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
     case ncclPatternRingTwice:
       info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
     default:
-      WARN("Unknown pattern %d\n", info->pattern);
+      WARN("Unknown pattern %d", info->pattern);
       return ncclInternalError;
   }
   return ncclSuccess;
 }
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index a0f1265..a64f9be 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -115,7 +115,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
   int x = 0;
   while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
   if (x == NCCL_MAX_TREE_ARITY) {
-    WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
+    WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
     return ncclInternalError;
   }
   tree->down[x] = indexes[d];
@@ -213,7 +213,7 @@ int ncclMinNchannels() {
   if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
   if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
   if (minNchannels > MAXCHANNELS) {
-    WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
+    WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
     minNchannels = MAXCHANNELS;
   }
   if (minNchannels < 0) minNchannels = 0;
@@ -225,7 +225,7 @@ int ncclMaxNchannels() {
   if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
   if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
   if (maxNchannels < 1) {
-    WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
+    WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
     maxNchannels = 1;
   }
   return maxNchannels;
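
Aside on the bootstrapNetRecv hunk above: bootstrap messages are length-prefixed, so the receiver reads the sender's size first and refuses to overrun its buffer, which is what the (now newline-free) truncation warning guards. Below is a rough standalone sketch of that pattern over a plain POSIX socket; recvAll and recvPrefixed are hypothetical names for illustration, not NCCL functions.

    #include <algorithm>
    #include <cstdio>
    #include <sys/socket.h>

    // Hypothetical helper: loop until exactly `len` bytes have been received.
    static int recvAll(int fd, void* buf, int len) {
      char* p = static_cast<char*>(buf);
      while (len > 0) {
        ssize_t n = recv(fd, p, len, 0);
        if (n <= 0) return -1;            // error or peer closed the connection
        p += n;
        len -= static_cast<int>(n);
      }
      return 0;
    }

    // Length-prefixed receive with a truncation guard, mirroring bootstrapNetRecv.
    static int recvPrefixed(int fd, void* data, int size) {
      int senderSize;
      if (recvAll(fd, &senderSize, sizeof(int)) != 0) return -1;
      if (senderSize > size) {
        fprintf(stderr, "Message truncated : received %d bytes instead of %d\n", senderSize, size);
        return -1;
      }
      return recvAll(fd, data, std::min(senderSize, size));
    }
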
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index eb556c4..f4e331b 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -25,7 +25,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
       return ncclSuccess;
     }
   }
-  WARN("Could not find node of type %d id %lx\n", t, id);
+  WARN("Could not find node of type %d id %lx", t, id);
   return ncclInternalError;
 }
 
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 57c66e7..6e9a208 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -55,7 +55,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
       return ncclSuccess;
     }
   }
-  WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
+  WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
   return ncclInternalError;
 }
 
@@ -188,7 +188,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
       return ncclSuccess;
     }
   }
-  WARN("Could not find gpu rank %d\n", rank);
+  WARN("Could not find gpu rank %d", rank);
   return ncclInternalError;
 }
 
@@ -199,7 +199,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
       return ncclSuccess;
     }
   }
-  WARN("Could not find net id %lx\n", id);
+  WARN("Could not find net id %lx", id);
   return ncclInternalError;
 }
 
@@ -788,7 +788,7 @@ done:
   }
 
   if (graph->nChannels == 0 && graph->collNet == 0) {
-    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
     for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
     graph->inter[0] = graph->inter[1] = 0;
     graph->speedIntra = graph->speedInter = 0.1;
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index d53b532..b5c5cc8 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -97,7 +97,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
 
 ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
   if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
-    WARN("Error : tried to create too many nodes of type %d\n", type);
+    WARN("Error : tried to create too many nodes of type %d", type);
     return ncclInternalError;
   }
   struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@@ -421,7 +421,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
       NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
       NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
       if (gpu == NULL) {
-        WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+        WARN("Add NVLink error : could not find GPU %lx", pBusId);
         return ncclInternalError;
       }
       int count;
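
The %d -> %ld change in findRevLink above (and the %x -> %lx changes in init.cc further down) go hand in hand with the printf format attribute added to ncclDebugLog in debug.h below: node ids and bus ids are int64_t, and once the compiler can type-check WARN's format string, passing them to %d or %x becomes a -Wformat warning. A standalone illustration, assuming an LP64 platform where int64_t is long:

    #include <cinttypes>
    #include <cstdio>

    int main() {
      int64_t id = 0x100000000LL;            // does not fit in 32 bits
      // printf("id %d\n", id);              // -Wformat: '%d' expects int, argument is int64_t
      printf("id %ld\n", (long)id);          // what the patch uses on LP64 platforms
      printf("id %" PRId64 "\n", id);        // fully portable alternative
      return 0;
    }
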
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index b2232c2..a12865e 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -62,7 +62,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
     if (c == '=') {
       ptr[o] = '\0';
       if (value == NULL) {
-        WARN("XML Parse : Unexpected value with name %s\n", ptr);
+        WARN("XML Parse : Unexpected value with name %s", ptr);
         return ncclInternalError;
       }
       return xmlGetValue(file, value, last);
@@ -128,7 +128,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
     // Re-read the name, we got '/' in the first call
     NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
     if (c != '>') {
-      WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+      WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
       return ncclInternalError;
     }
     return ncclSuccess;
@@ -141,7 +141,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
   while (c == ' ') {
     NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
     if (a == MAX_ATTR_COUNT) {
-      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
       // Actually we need to still consume the extra attributes so we have an extra one.
     } else a++;
   }
@@ -169,7 +169,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
   if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
   while (1) {
     if (xml->maxIndex == MAX_NODES) {
-      WARN("Error : XML parser is limited to 1024 nodes\n");
+      WARN("Error : XML parser is limited to 1024 nodes");
       return ncclInternalError;
     }
     struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@@ -360,7 +360,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
   char strValue[MAX_STR_LEN];
   NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
   if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
-  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
   return ncclSuccess;
 }
 
@@ -592,7 +592,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
     int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
 
     if (maxNvLinks > 0 && nvmlDev == NULL) {
-      WARN("No NVML device handle. Skipping nvlink detection.\n");
+      WARN("No NVML device handle. Skipping nvlink detection.");
       maxNvLinks = 0;
     }
diff --git a/src/group.cc b/src/group.cc
index 78a74b6..43a9328 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -201,7 +201,7 @@ ncclResult_t ncclGroupEnd() {
     if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
       int err = pthread_join(ncclGroupThreads[i], NULL);
       if (err != 0) {
-        WARN("Error waiting for pthread_join : %s\n", strerror(errno));
+        WARN("Error waiting for pthread_join : %s", strerror(errno));
         return ncclSystemError;
       }
       NCCLCHECKGOTO(args->ret, ret, end);
@@ -233,7 +233,7 @@ ncclResult_t ncclGroupEnd() {
   // schedule delta 0, +1, -1, +2, -2, ...
   // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
   for (int d=0; d<=nRanks/4; d++) {
-    int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
+    int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
     int index = 0;
     int delta = deltas[index];
 sched_delta:
@@ -258,6 +258,10 @@ sched_delta:
       ssize_t sendbytes = totSendBytes-sendOffset;
       if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
       if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+      // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
+      // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
+      if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
+      if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
       if (sendbytes >= 0 || recvbytes >= 0) {
         NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
               recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
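
Two notes on the group.cc hunks above. First, the added %nRanks on the fourth delta matters for single-rank communicators: with nRanks == 1 the old expression nRanks-(nRanks/2-d) evaluates to 1, which is outside the valid 0..nRanks-1 range and aliases delta 0 without comparing equal to it, so the "don't do 0 twice" logic cannot catch it; the patched form wraps it back to 0. Second, the new -1 sentinel keeps zero-length chunks of a non-empty send/recv from being misread as pure syncs. A small arithmetic check for the first point (standalone C++, not NCCL code):

    #include <cstdio>

    // Reproduces just the delta arithmetic from ncclGroupEnd() to show why the
    // %nRanks was added to the fourth entry.
    int main() {
      int sizes[] = {1, 2, 4, 8};
      for (int nRanks : sizes) {
        for (int d = 0; d <= nRanks / 4; d++) {
          int before = nRanks - (nRanks / 2 - d);             // old fourth delta
          int after  = (nRanks - (nRanks / 2 - d)) % nRanks;  // patched fourth delta
          printf("nRanks=%d d=%d fourth delta: before=%d after=%d\n",
                 nRanks, d, before, after);
        }
      }
      // Only nRanks=1 differs: before=1 (out of the valid 0..nRanks-1 range),
      // after=0 (the correct self send/recv delta).
      return 0;
    }
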
diff --git a/src/include/debug.h b/src/include/debug.h
index d88458c..e7a152c 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 
-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
 
 // Let code temporarily downgrade WARN into INFO
 extern thread_local int ncclDebugNoWarn;
diff --git a/src/include/shm.h b/src/include/shm.h
index 9cd9d05..7334f16 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -45,7 +45,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
   *shmPtr = ptr;
   return ncclSuccess;
 sysError:
-  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
+  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
 cudaError:
   if (fd != -1) close(fd);
   if (create) shm_unlink(shmname);
diff --git a/src/include/socket.h b/src/include/socket.h
index e903b04..8b59f72 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -367,7 +367,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
   /* IPv4/IPv6 support */
   int family = remoteAddr->sa.sa_family;
   if (family != AF_INET && family != AF_INET6) {
-    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
+    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
     return ncclInternalError;
   }
   int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
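
The debug.h hunk above is the change that motivates most of this patch: annotating ncclDebugLog with __attribute__((format(printf, 5, 6))) (the format string is the 5th parameter, variadic arguments start at the 6th) lets -Wformat type-check every WARN/INFO/TRACE call site, which is why the %d/%x specifiers used for 64-bit values are corrected throughout. The trailing \n removals are the companion cleanup: the logger already terminates each record, so embedded newlines only produced blank lines in the output. A self-contained illustration using a simplified, hypothetical logger (not the NCCL one):

    #include <cstdarg>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical logger with the same kind of annotation as ncclDebugLog:
    // here the format string is parameter 1 and the variadic list starts at 2.
    void mylog(const char* fmt, ...) __attribute__((format(printf, 1, 2)));

    void mylog(const char* fmt, ...) {
      va_list ap;
      va_start(ap, fmt);
      vfprintf(stderr, fmt, ap);
      va_end(ap);
      fputc('\n', stderr);              // the logger owns the newline
    }

    int main() {
      int64_t busId = 0x3b000;
      mylog("busId %lx", busId);        // fine on LP64, as in the init.cc hunks below
      // mylog("busId %x\n", busId);    // with the attribute, -Wformat flags the %x here
      return 0;
    }
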
diff --git a/src/init.cc b/src/init.cc
index 81831cf..08bddfd 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -581,7 +581,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   for (int i = 0; i < nranks; i++) {
     memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
     if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
       return ncclInvalidUsage;
     }
   }
@@ -878,7 +878,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
   NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
 
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
   return ncclSuccess;
 cleanup:
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index b231eb1..79e6170 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -67,10 +67,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
 }
 
 uint64_t getHash(const char* string, int n) {
-  // Based on DJB2, result = result * 33 + char
+  // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
   for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) + string[c];
+    result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }
diff --git a/src/proxy.cc b/src/proxy.cc
index d3824f2..503781e 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   struct ncclProxyArgs* op = state->ops;
   while (op) {
     if (op->idle & OP_SEEN) {
-      WARN("Active list loop at element %ld\n", OP_INDEX(op));
+      WARN("Active list loop at element %ld", OP_INDEX(op));
     }
     op->idle |= OP_SEEN;
     printf("[%ld]", OP_INDEX(op));
@@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   struct ncclProxyArgs* free = state->pool;
   while (free) {
     if (free->idle & OP_SEEN) {
-      WARN("Free list loop at element %ld\n", OP_INDEX(free));
+      WARN("Free list loop at element %ld", OP_INDEX(free));
     }
     free->idle |= OP_SEEN;
     free = free->next;
@@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   while (p) {
     for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
       if ((p->elems[e].idle & OP_SEEN) == 0) {
-        WARN("Element %d of pool %d has been lost\n", e, i);
+        WARN("Element %d of pool %d has been lost", e, i);
         struct ncclProxyArgs* free = state->pool;
         printf("Free list ");
         while (free) {
@@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
   struct ncclPeer* peerComm = args->channel->peers+peer;
   struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
   if (connector->transportComm == NULL) {
-    WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
+    WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
         type == proxyRecv ? "recv" : "send", peer, args->channel->id);
     return ncclInternalError;
   }
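
On the getHash change in src/misc/utils.cc above: the hash moves from classic DJB2 (multiply by 33, then add the byte) to the DJB2a variant (multiply by 33, then XOR the byte), and the comment is updated to match. A standalone comparison of the two variants (illustrative only, not the NCCL function); the remaining proxy.cc hunk continues below.

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Classic DJB2: h = h * 33 + c
    static uint64_t djb2(const char* s, int n) {
      uint64_t h = 5381;
      for (int i = 0; i < n; i++) h = ((h << 5) + h) + s[i];
      return h;
    }

    // DJB2a: h = h * 33 ^ c -- the variant getHash now uses
    static uint64_t djb2a(const char* s, int n) {
      uint64_t h = 5381;
      for (int i = 0; i < n; i++) h = ((h << 5) + h) ^ s[i];
      return h;
    }

    int main() {
      const char* host = "node-0";
      int n = (int)strlen(host);
      printf("djb2  = %" PRIx64 "\n", djb2(host, n));
      printf("djb2a = %" PRIx64 "\n", djb2a(host, n));
      return 0;
    }
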
"recv" : "send", peer, args->channel->id); return ncclInternalError; } @@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ while (nslots*state->slotSize < size) nslots *= 2; int s = (ptr-buff)/state->slotSize; if (s < 0 || s+nslots > state->nslots) { - WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots); + WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots); return ncclInternalError; } for (int i=0; i