Fix bug #307: wrong NIC selection on the reduction tree.
The reduction tree (tree up) was inverting the NICs to use, causing performance issues when different NICs are used on a given channel.
This commit is contained in:
parent
533e3702cf
commit
b5b6c6acdd
@ -804,7 +804,10 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
|
||||
*dev = graph->inter[(channelId%graph->nChannels)*2+dir];
|
||||
// Pick the network device that `rank` should use on channel `channelId`
// and store it in *dev.
//
// system    : topology; only the GPU node count is read here.
// graph     : topology graph; reads nChannels, intra[] and inter[].
// rank      : rank compared against the first intra[] entry of the channel.
// channelId : channel number; wrapped modulo graph->nChannels.
// dev       : output, receives the selected entry of graph->inter[].
//
// Always returns ncclSuccess.
//
// NOTE(review): presumably intra[] stores ngpus ranks per channel and inter[]
// stores two network devices per channel (one per end of the intra-node
// chain) -- confirm against the code that fills the graph.
// NOTE(review): assumes graph->nChannels > 0; otherwise the modulo below is
// a division by zero -- confirm callers guarantee this.
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* dev) {
|
||||
// Wrap the requested channel into the range of channels the graph holds.
int channel = channelId%graph->nChannels;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
// Slot 0 when `rank` is the first intra[] entry of this channel, slot 1 for
// every other rank. The choice depends on the rank, not on a send/recv
// direction.
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
|
||||
*dev = graph->inter[channel*2+index];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
|
||||
|
@ -84,7 +84,7 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
|
||||
NCCLCHECK(ncclCalloc(&sendResources, 1));
|
||||
send->transportResources = sendResources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &sendResources->netDev));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &sendResources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
@ -110,7 +110,7 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
|
||||
NCCLCHECK(ncclCalloc(&recvResources, 1));
|
||||
recv->transportResources = recvResources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &recvResources->netDev));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &recvResources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
|
@ -60,7 +60,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
@ -83,7 +83,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
|
Loading…
x
Reference in New Issue
Block a user