2.18.5-1
Fix NVLS search (issue #931). Increase max IB NICs to 32. Fix inconsistent device ordering (issue #820). Try to use different devices for different GPUs in systems with more than one NIC per GPU.
This commit is contained in:
parent
8ed014bae9
commit
559b70f86c
@ -1,6 +1,6 @@
|
|||||||
##### version
|
##### version
|
||||||
NCCL_MAJOR := 2
|
NCCL_MAJOR := 2
|
||||||
NCCL_MINOR := 18
|
NCCL_MINOR := 18
|
||||||
NCCL_PATCH := 3
|
NCCL_PATCH := 5
|
||||||
NCCL_SUFFIX :=
|
NCCL_SUFFIX :=
|
||||||
PKG_REVISION := 1
|
PKG_REVISION := 1
|
||||||
|
@ -730,9 +730,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
|||||||
// fill the whole space of nChannels. To do so we mirror the bits in the
|
// fill the whole space of nChannels. To do so we mirror the bits in the
|
||||||
// nChannels space.
|
// nChannels space.
|
||||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||||
int mirror = 0;
|
comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
|
||||||
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
|
|
||||||
comm->p2pChannels[c] = mirror;
|
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
@ -376,6 +376,28 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
|||||||
int* localNets;
|
int* localNets;
|
||||||
NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
|
NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
|
||||||
|
|
||||||
|
// First add the preferred NICs
|
||||||
|
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||||
|
if (gpu != -1 && gpu != g) continue;
|
||||||
|
localNetCount = 0;
|
||||||
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||||
|
for (int c = 0;; c++) {
|
||||||
|
int netId;
|
||||||
|
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
|
||||||
|
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
|
||||||
|
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
|
||||||
|
localNetCount++;
|
||||||
|
}
|
||||||
|
// Append NICs to list
|
||||||
|
for (int i=0; i<localNetCount; i++) {
|
||||||
|
int n = localNets[i];
|
||||||
|
int found = 0;
|
||||||
|
while (nets[found] != n && found<netCount) found++;
|
||||||
|
if (found == netCount) nets[netCount++] = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then add others satisfying typeInter
|
||||||
for (int t=0; t <= typeInter; t++) {
|
for (int t=0; t <= typeInter; t++) {
|
||||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||||
if (gpu != -1 && gpu != g) continue;
|
if (gpu != -1 && gpu != g) continue;
|
||||||
@ -385,14 +407,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
|||||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||||
if (paths[n].type == t) localNets[localNetCount++] = n;
|
if (paths[n].type == t) localNets[localNetCount++] = n;
|
||||||
}
|
}
|
||||||
if (localNetCount == 0) continue;
|
|
||||||
// Shuffle by gpu NVML device number so that GPUs on the same PCI switch
|
|
||||||
// with multiple NICs don't use the same one as first choice.
|
|
||||||
for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
|
|
||||||
int net0 = localNets[0];
|
|
||||||
for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
|
|
||||||
localNets[localNetCount-1] = net0;
|
|
||||||
}
|
|
||||||
// Append NICs to list
|
// Append NICs to list
|
||||||
for (int i=0; i<localNetCount; i++) {
|
for (int i=0; i<localNetCount; i++) {
|
||||||
int n = localNets[i];
|
int n = localNets[i];
|
||||||
@ -532,7 +546,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
|||||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
||||||
if (graph->nChannels < netcount) {
|
if (graph->nChannels < netcount) {
|
||||||
int gpu;
|
int gpu;
|
||||||
NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
|
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
|
||||||
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
|
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -679,128 +679,59 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
|
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
|
||||||
int minType = PATH_DIS;
|
int minType = PATH_DIS;
|
||||||
float maxBw = 0;
|
float maxBw = 0;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
int* nets;
|
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
|
||||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
|
||||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
for (int i=0; i<system->nodes[resultType].count; i++) {
|
||||||
struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
|
if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
|
||||||
if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
|
maxBw = paths[i].bw;
|
||||||
maxBw = path->bw;
|
minType = paths[i].type;
|
||||||
minType = path->type;
|
if (pathType) *pathType = minType;
|
||||||
if (type) *type = minType;
|
|
||||||
count = 0;
|
count = 0;
|
||||||
}
|
}
|
||||||
if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
|
if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
|
||||||
}
|
}
|
||||||
|
*localCount = count;
|
||||||
*localNetMask = 0ULL;
|
|
||||||
for (int n=0; n<count; n++) {
|
|
||||||
if (nets[n] >= 64) return ncclInternalError;
|
|
||||||
*localNetMask |= 1ULL<<nets[n];
|
|
||||||
}
|
|
||||||
free(nets);
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
|
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
|
||||||
uint64_t* localNetMasks;
|
|
||||||
int ngpus = system->nodes[GPU].count;
|
|
||||||
NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
|
|
||||||
|
|
||||||
// Fill localNetMasks for all GPUs.
|
|
||||||
for (int g=0; g<ngpus; g++) {
|
|
||||||
NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find GPUs which have the same mask as rank, i.e. share the same local Nets.
|
|
||||||
int gpu;
|
int gpu;
|
||||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||||
int netLocalGpus = 0, netLocalGpu = 0;
|
int* localNets;
|
||||||
for (int g=0; g<ngpus; g++) {
|
int localNetCount;
|
||||||
if (localNetMasks[g] == localNetMasks[gpu]) {
|
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
|
||||||
if (g == gpu) netLocalGpu = netLocalGpus;
|
int* localGpus;
|
||||||
netLocalGpus++;
|
int localGpuCount;
|
||||||
}
|
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
|
||||||
}
|
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
|
||||||
uint64_t localNetMask = localNetMasks[gpu];
|
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
|
||||||
free(localNetMasks);
|
net += channelId%(DIVUP(localNetCount,localGpuCount));
|
||||||
if (localNetMask == 0) return ncclInternalError;
|
*id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
|
||||||
|
free(localNets);
|
||||||
// Round robin on GPUs and channels
|
free(localGpus);
|
||||||
int gIndex = 0, cId = 0, n = 0;
|
|
||||||
while (1) {
|
|
||||||
if (1ULL << n & localNetMask) {
|
|
||||||
if (gIndex == netLocalGpu && cId == channelId) {
|
|
||||||
*id = n;
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
gIndex++;
|
|
||||||
if (gIndex == netLocalGpus) {
|
|
||||||
gIndex = 0;
|
|
||||||
cId++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
n = (n+1) % 64;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
|
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
|
||||||
int ngpus = system->nodes[GPU].count;
|
for (int c=0; c<MAXCHANNELS; c++) {
|
||||||
int* gpus;
|
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||||
NCCLCHECK(ncclCalloc(&gpus, ngpus));
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||||
|
int id;
|
||||||
// Find localNetMask which includes net with the most local GPUs.
|
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
|
||||||
int netLocalGpus = 0, minType = PATH_DIS;
|
if (net == id) {
|
||||||
uint64_t localNetMask = 0ULL;
|
*gpuIndex = g;
|
||||||
for (int g=0; g<ngpus; g++) {
|
return ncclSuccess;
|
||||||
int type = PATH_DIS;
|
|
||||||
uint64_t mask;
|
|
||||||
NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
|
|
||||||
if ((1ULL<<net) & mask) {
|
|
||||||
if (type < minType) {
|
|
||||||
localNetMask = mask;
|
|
||||||
netLocalGpus = 0;
|
|
||||||
minType = type;
|
|
||||||
}
|
|
||||||
if (type == minType) {
|
|
||||||
if (localNetMask && mask != localNetMask) {
|
|
||||||
WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
|
|
||||||
free(gpus);
|
|
||||||
return ncclInternalError;
|
|
||||||
}
|
|
||||||
gpus[netLocalGpus] = g;
|
|
||||||
netLocalGpus++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (localNetMask == 0ULL) {
|
|
||||||
*gpuIndex = -1;
|
*gpuIndex = -1;
|
||||||
free(gpus);
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Round robin on GPUs and channels
|
|
||||||
int gIndex = 0, cId = 0, n = 0;
|
|
||||||
while (1) {
|
|
||||||
if (1ULL << n & localNetMask) {
|
|
||||||
if (n == net) {
|
|
||||||
*gpuIndex = gpus[gIndex];
|
|
||||||
free(gpus);
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
gIndex++;
|
|
||||||
if (gIndex == netLocalGpus) {
|
|
||||||
gIndex = 0;
|
|
||||||
cId++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
n = (n+1) % 64;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/****************************/
|
/****************************/
|
||||||
/* External query functions */
|
/* External query functions */
|
||||||
/****************************/
|
/****************************/
|
||||||
|
@ -208,4 +208,14 @@ static float ncclTopoNVLinkBw(int cudaCompCap) {
|
|||||||
cudaCompCap >= 60 ? SM60_NVLINK_BW :
|
cudaCompCap >= 60 ? SM60_NVLINK_BW :
|
||||||
SM80_NVLINK_BW;
|
SM80_NVLINK_BW;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Mirror bits
|
||||||
|
static bool isPow2(int val) {
|
||||||
|
return (val & (val-1)) == 0;
|
||||||
|
}
|
||||||
|
static int mirrorBits(int val, int pow2) {
|
||||||
|
int mirror = 0;
|
||||||
|
for (int b=1, mb=(pow2>>1); b<pow2; b<<=1, mb>>=1) if (val & b) mirror |= mb;
|
||||||
|
return mirror;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -576,7 +576,18 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
pciNode->parent = parent;
|
pciNode->parent = parent;
|
||||||
parent->subs[parent->nSubs++] = pciNode;
|
// Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
|
||||||
|
int subIndex = parent->nSubs;
|
||||||
|
const char* newBusId;
|
||||||
|
NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
|
||||||
|
for (int s=0; s<parent->nSubs; s++) {
|
||||||
|
const char* busId;
|
||||||
|
NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId));
|
||||||
|
if (strcmp(newBusId, busId) < 0) { subIndex = s; break; }
|
||||||
|
}
|
||||||
|
for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
|
||||||
|
parent->subs[subIndex] = pciNode;
|
||||||
|
parent->nSubs++;
|
||||||
}
|
}
|
||||||
if (strcmp(parent->name, "pci") == 0) {
|
if (strcmp(parent->name, "pci") == 0) {
|
||||||
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
|
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
|
||||||
|
@ -66,7 +66,7 @@ struct userIbDev {
|
|||||||
uint16_t port_en;
|
uint16_t port_en;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAX_IB_DEVS 16
|
#define MAX_IB_DEVS 32
|
||||||
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
|
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
|
||||||
struct userIbDev userIbDevs[MAX_IB_DEVS];
|
struct userIbDev userIbDevs[MAX_IB_DEVS];
|
||||||
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
|
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user