Fix inter-node NVLS graph search
We were passing a net ID instead of a GPU index to ncclTopoSearchTryGpu, which could cause crashes when the two values were unrelated (and they usually are). See issue #931.
This commit is contained in:
parent
6e24ef4e1f
commit
8ed014bae9
@ -530,7 +530,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
|
||||
// NVLS needs to balance on all NICs
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
|
||||
if (graph->nChannels < netcount) {
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
|
||||
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
|
||||
}
|
||||
} else {
|
||||
if (graph->nChannels > 0) {
|
||||
// Try to replay the last channel
|
||||
|
Loading…
x
Reference in New Issue
Block a user