Fix hang in corner cases of alltoallv using point-to-point send/recv.
Harmonize error messages.
Fix missing NVTX section in the license.
Update README.
This commit is contained in:
Sylvain Jeaugey 2021-02-09 15:34:08 -08:00
parent 3996562690
commit 911d61f214
19 changed files with 53 additions and 57 deletions

View File

@ -1,5 +1,5 @@
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions modification, are permitted provided that the following conditions
@ -29,3 +29,11 @@
The U.S. Department of Energy funded the development of this software The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory. under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.

View File

@ -1,29 +1,13 @@
# NCCL # NCCL
Optimized primitives for collective multi-GPU communication. Optimized primitives for inter-GPU communication.
## Introduction ## Introduction
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html). For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
## What's inside
At present, the library implements the following collective operations:
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
## Requirements
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
## Build ## Build
Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds. Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
@ -89,4 +73,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
## Copyright ## Copyright
All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.

View File

@ -1,6 +1,6 @@
##### version ##### version
NCCL_MAJOR := 2 NCCL_MAJOR := 2
NCCL_MINOR := 8 NCCL_MINOR := 8
NCCL_PATCH := 3 NCCL_PATCH := 4
NCCL_SUFFIX := NCCL_SUFFIX :=
PKG_REVISION := 1 PKG_REVISION := 1

View File

@ -72,7 +72,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
int recvSize; int recvSize;
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int))); NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
if (recvSize > size) { if (recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size); WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
return ncclInternalError; return ncclInternalError;
} }
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size))); NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
@ -244,7 +244,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
void* ncclRemoteMemAllocationService(void* args) { void* ncclRemoteMemAllocationService(void* args) {
struct remAllocState* state = (struct remAllocState *) args; struct remAllocState* state = (struct remAllocState *) args;
if (cudaSetDevice(state->cudaDev) != cudaSuccess) { if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev); WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
} }
// Prepare poll descriptor // Prepare poll descriptor
@ -490,7 +490,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
ncclResult_t bootstrapClose(void* commState) { ncclResult_t bootstrapClose(void* commState) {
struct extState* state = (struct extState*)commState; struct extState* state = (struct extState*)commState;
if (state->unexpectedConnections != NULL) { if (state->unexpectedConnections != NULL) {
WARN("Unexpected connections are not empty.\n"); WARN("Unexpected connections are not empty");
return ncclInternalError; return ncclInternalError;
} }
close(state->extListenFd); close(state->extListenFd);

View File

@ -350,7 +350,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternRingTwice: case ncclPatternRingTwice:
info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
default: default:
WARN("Unknown pattern %d\n", info->pattern); WARN("Unknown pattern %d", info->pattern);
return ncclInternalError; return ncclInternalError;
} }
return ncclSuccess; return ncclSuccess;

View File

@ -115,7 +115,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
int x = 0; int x = 0;
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
if (x == NCCL_MAX_TREE_ARITY) { if (x == NCCL_MAX_TREE_ARITY) {
WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]); WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
return ncclInternalError; return ncclInternalError;
} }
tree->down[x] = indexes[d]; tree->down[x] = indexes[d];
@ -213,7 +213,7 @@ int ncclMinNchannels() {
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
if (minNchannels > MAXCHANNELS) { if (minNchannels > MAXCHANNELS) {
WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS); WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
minNchannels = MAXCHANNELS; minNchannels = MAXCHANNELS;
} }
if (minNchannels < 0) minNchannels = 0; if (minNchannels < 0) minNchannels = 0;
@ -225,7 +225,7 @@ int ncclMaxNchannels() {
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
if (maxNchannels < 1) { if (maxNchannels < 1) {
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels); WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
maxNchannels = 1; maxNchannels = 1;
} }
return maxNchannels; return maxNchannels;

View File

@ -25,7 +25,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("Could not find node of type %d id %lx\n", t, id); WARN("Could not find node of type %d id %lx", t, id);
return ncclInternalError; return ncclInternalError;
} }

View File

@ -55,7 +55,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id); WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
return ncclInternalError; return ncclInternalError;
} }
@ -188,7 +188,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("Could not find gpu rank %d\n", rank); WARN("Could not find gpu rank %d", rank);
return ncclInternalError; return ncclInternalError;
} }
@ -199,7 +199,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("Could not find net id %lx\n", id); WARN("Could not find net id %lx", id);
return ncclInternalError; return ncclInternalError;
} }
@ -788,7 +788,7 @@ done:
} }
if (graph->nChannels == 0 && graph->collNet == 0) { if (graph->nChannels == 0 && graph->collNet == 0) {
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern); WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank; for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0; graph->inter[0] = graph->inter[1] = 0;
graph->speedIntra = graph->speedInter = 0.1; graph->speedIntra = graph->speedInter = 0.1;

View File

@ -97,7 +97,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
WARN("Error : tried to create too many nodes of type %d\n", type); WARN("Error : tried to create too many nodes of type %d", type);
return ncclInternalError; return ncclInternalError;
} }
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@ -421,7 +421,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) { if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx\n", pBusId); WARN("Add NVLink error : could not find GPU %lx", pBusId);
return ncclInternalError; return ncclInternalError;
} }
int count; int count;

View File

@ -62,7 +62,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
if (c == '=') { if (c == '=') {
ptr[o] = '\0'; ptr[o] = '\0';
if (value == NULL) { if (value == NULL) {
WARN("XML Parse : Unexpected value with name %s\n", ptr); WARN("XML Parse : Unexpected value with name %s", ptr);
return ncclInternalError; return ncclInternalError;
} }
return xmlGetValue(file, value, last); return xmlGetValue(file, value, last);
@ -128,7 +128,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
// Re-read the name, we got '/' in the first call // Re-read the name, we got '/' in the first call
NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
if (c != '>') { if (c != '>') {
WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name); WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
return ncclInternalError; return ncclInternalError;
} }
return ncclSuccess; return ncclSuccess;
@ -141,7 +141,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
while (c == ' ') { while (c == ' ') {
NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c)); NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
if (a == MAX_ATTR_COUNT) { if (a == MAX_ATTR_COUNT) {
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT); INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
// Actually we need to still consume the extra attributes so we have an extra one. // Actually we need to still consume the extra attributes so we have an extra one.
} else a++; } else a++;
} }
@ -169,7 +169,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess; if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
while (1) { while (1) {
if (xml->maxIndex == MAX_NODES) { if (xml->maxIndex == MAX_NODES) {
WARN("Error : XML parser is limited to 1024 nodes\n"); WARN("Error : XML parser is limited to 1024 nodes");
return ncclInternalError; return ncclInternalError;
} }
struct ncclXmlNode* node = xml->nodes+xml->maxIndex; struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@ -360,7 +360,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
char strValue[MAX_STR_LEN]; char strValue[MAX_STR_LEN];
NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue)); NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); } if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue); TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
return ncclSuccess; return ncclSuccess;
} }
@ -592,7 +592,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12; int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
if (maxNvLinks > 0 && nvmlDev == NULL) { if (maxNvLinks > 0 && nvmlDev == NULL) {
WARN("No NVML device handle. Skipping nvlink detection.\n"); WARN("No NVML device handle. Skipping nvlink detection.");
maxNvLinks = 0; maxNvLinks = 0;
} }

View File

@ -201,7 +201,7 @@ ncclResult_t ncclGroupEnd() {
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) { if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
int err = pthread_join(ncclGroupThreads[i], NULL); int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) { if (err != 0) {
WARN("Error waiting for pthread_join : %s\n", strerror(errno)); WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError; return ncclSystemError;
} }
NCCLCHECKGOTO(args->ret, ret, end); NCCLCHECKGOTO(args->ret, ret, end);
@ -233,7 +233,7 @@ ncclResult_t ncclGroupEnd() {
// schedule delta 0, +1, -1, +2, -2, ... // schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d<=nRanks/4; d++) { for (int d=0; d<=nRanks/4; d++) {
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) }; int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
int index = 0; int index = 0;
int delta = deltas[index]; int delta = deltas[index];
sched_delta: sched_delta:
@ -258,6 +258,10 @@ sched_delta:
ssize_t sendbytes = totSendBytes-sendOffset; ssize_t sendbytes = totSendBytes-sendOffset;
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
// (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
if (sendbytes >= 0 || recvbytes >= 0) { if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId, NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL, recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,

View File

@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile; extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
// Let code temporarily downgrade WARN into INFO // Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn; extern thread_local int ncclDebugNoWarn;

View File

@ -45,7 +45,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
*shmPtr = ptr; *shmPtr = ptr;
return ncclSuccess; return ncclSuccess;
sysError: sysError:
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize); WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
cudaError: cudaError:
if (fd != -1) close(fd); if (fd != -1) close(fd);
if (create) shm_unlink(shmname); if (create) shm_unlink(shmname);

View File

@ -367,7 +367,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
/* IPv4/IPv6 support */ /* IPv4/IPv6 support */
int family = remoteAddr->sa.sa_family; int family = remoteAddr->sa.sa_family;
if (family != AF_INET && family != AF_INET6) { if (family != AF_INET && family != AF_INET6) {
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6); WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
return ncclInternalError; return ncclInternalError;
} }
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);

View File

@ -581,7 +581,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
for (int i = 0; i < nranks; i++) { for (int i = 0; i < nranks; i++) {
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId); WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
return ncclInvalidUsage; return ncclInvalidUsage;
} }
} }
@ -878,7 +878,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
return ncclSuccess; return ncclSuccess;
cleanup: cleanup:

View File

@ -67,10 +67,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
} }
uint64_t getHash(const char* string, int n) { uint64_t getHash(const char* string, int n) {
// Based on DJB2, result = result * 33 + char // Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381; uint64_t result = 5381;
for (int c = 0; c < n; c++) { for (int c = 0; c < n; c++) {
result = ((result << 5) + result) + string[c]; result = ((result << 5) + result) ^ string[c];
} }
return result; return result;
} }

View File

@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
struct ncclProxyArgs* op = state->ops; struct ncclProxyArgs* op = state->ops;
while (op) { while (op) {
if (op->idle & OP_SEEN) { if (op->idle & OP_SEEN) {
WARN("Active list loop at element %ld\n", OP_INDEX(op)); WARN("Active list loop at element %ld", OP_INDEX(op));
} }
op->idle |= OP_SEEN; op->idle |= OP_SEEN;
printf("[%ld]", OP_INDEX(op)); printf("[%ld]", OP_INDEX(op));
@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
struct ncclProxyArgs* free = state->pool; struct ncclProxyArgs* free = state->pool;
while (free) { while (free) {
if (free->idle & OP_SEEN) { if (free->idle & OP_SEEN) {
WARN("Free list loop at element %ld\n", OP_INDEX(free)); WARN("Free list loop at element %ld", OP_INDEX(free));
} }
free->idle |= OP_SEEN; free->idle |= OP_SEEN;
free = free->next; free = free->next;
@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
while (p) { while (p) {
for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) { for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
if ((p->elems[e].idle & OP_SEEN) == 0) { if ((p->elems[e].idle & OP_SEEN) == 0) {
WARN("Element %d of pool %d has been lost\n", e, i); WARN("Element %d of pool %d has been lost", e, i);
struct ncclProxyArgs* free = state->pool; struct ncclProxyArgs* free = state->pool;
printf("Free list "); printf("Free list ");
while (free) { while (free) {
@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
struct ncclPeer* peerComm = args->channel->peers+peer; struct ncclPeer* peerComm = args->channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send; struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
if (connector->transportComm == NULL) { if (connector->transportComm == NULL) {
WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank, WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
type == proxyRecv ? "recv" : "send", peer, args->channel->id); type == proxyRecv ? "recv" : "send", peer, args->channel->id);
return ncclInternalError; return ncclInternalError;
} }
@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ
while (nslots*state->slotSize < size) nslots *= 2; while (nslots*state->slotSize < size) nslots *= 2;
int s = (ptr-buff)/state->slotSize; int s = (ptr-buff)/state->slotSize;
if (s < 0 || s+nslots > state->nslots) { if (s < 0 || s+nslots > state->nslots) {
WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots); WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
return ncclInternalError; return ncclInternalError;
} }
for (int i=0; i<nslots; i++) used[s+i] = 0; for (int i=0; i<nslots; i++) used[s+i] = 0;

View File

@ -84,7 +84,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
char* p = realpath(devicePath, NULL); char* p = realpath(devicePath, NULL);
if (p == NULL) { if (p == NULL) {
WARN("Could not find real path of %s", *devicePath); WARN("Could not find real path of %s (%s)", devName, devicePath);
} else { } else {
// Merge multi-port NICs into the same PCI device // Merge multi-port NICs into the same PCI device
p[strlen(p)-1] = '0'; p[strlen(p)-1] = '0';

View File

@ -237,7 +237,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
if (fd == -1) { if (fd == -1) {
// Could not find device vendor. This is handled silently so // Could not find device vendor. This is handled silently so
// we don't want to print an INFO error. // we don't want to print an INFO error.
TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno)); TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno));
goto end; goto end;
} }
char vendor[7]; char vendor[7];