Fix inter-node NVLS graph search

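We were passing a net ID to ncclTopoSearchTryGpu where a GPU index is
expected. The two are usually unrelated, so this could cause crashes.
Look up the GPU with ncclTopoGetLocalGpu first, and only try it when
one is found and the channel count is still below the net count.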

Issue #931
This commit is contained in:
Sylvain Jeaugey 2023-08-02 16:03:45 +02:00
parent 6e24ef4e1f
commit 8ed014bae9

View File

@@ -530,7 +530,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
// NVLS needs to balance on all NICs
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
-NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
+if (graph->nChannels < netcount) {
+int gpu;
+NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
+if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
+}
} else {
if (graph->nChannels > 0) {
// Try to replay the last channel