Merge branch 'shm' of git://github.com/lowintelligence/nccl into lowintelligence-shm
This commit is contained in:
commit
01d1836668
@ -11,6 +11,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
// Hash the first n bytes of string with a DJB2-style fold
// (result = result * 33 + byte), starting from the seed 9527.
// The length is explicit, so no NUL terminator is required.
uint64_t getnHash(const char* string, int n);
|
||||
uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
|
||||
|
13
src/init.cc
13
src/init.cc
@ -308,12 +308,12 @@ static void showVersion() {
|
||||
}
|
||||
}
|
||||
|
||||
static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
|
||||
static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
|
||||
info->rank = rank;
|
||||
CUDACHECK(cudaGetDevice(&info->cudaDev));
|
||||
NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
|
||||
info->hostHash=getHostHash();
|
||||
info->pidHash=getPidHash();
|
||||
info->hostHash=getHostHash()+commHash;
|
||||
info->pidHash=getPidHash()+commHash;
|
||||
|
||||
// Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
|
||||
// cudaDev is a CUDA runtime dev number which could be different from the
|
||||
@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
|
||||
uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
|
||||
TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
|
||||
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
|
||||
|
||||
// AllGather1 - begin
|
||||
@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
|
||||
allGather1Data[rank].comm = comm;
|
||||
NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
|
||||
NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
|
||||
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
|
||||
|
||||
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
|
||||
@ -998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
|
||||
NCCLCHECK(ncclCalloc(&allInfo, nranks));
|
||||
for (int rank=0; rank<nranks; rank++) {
|
||||
CUDACHECK(cudaSetDevice(devs[rank]));
|
||||
NCCLCHECK(fillInfo(allInfo+rank, rank));
|
||||
NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
|
||||
}
|
||||
|
||||
int* connectTransport;
|
||||
|
@ -96,6 +96,15 @@ uint64_t getHash(const char* string) {
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Hash exactly n bytes of a buffer.
 *
 * Based on DJB2: result = result * 33 + byte, but seeded with 9527 instead
 * of the classic 5381. The length is explicit, so the buffer may contain
 * embedded NUL bytes and does not need to be NUL-terminated.
 *
 * string : buffer to hash; at least n bytes must be readable.
 * n      : number of bytes to fold in; n <= 0 returns the bare seed.
 *
 * Returns the 64-bit hash (arithmetic wraps modulo 2^64).
 */
uint64_t getnHash(const char* string, int n) {
  uint64_t result = 9527;
  for (int c = 0; c < n; c++) {
    // Read the byte as unsigned: plain char has implementation-defined
    // signedness, and a sign-extended byte >= 0x80 would make the hash
    // depend on the target ABI.
    result = ((result << 5) + result) + (uint64_t)(unsigned char)string[c];
  }
  return result;
}
|
||||
|
||||
/* Generate a hash of the unique identifying string for this host
|
||||
* that will be unique for both bare-metal and container instances
|
||||
* Equivalent of a hash of;
|
||||
|
Loading…
x
Reference in New Issue
Block a user