From 8ed014bae9f30d4470cdfa655f32d35ee5b3e7ca Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Wed, 2 Aug 2023 16:03:45 +0200
Subject: [PATCH] Fix inter-node NVLS graph search

We were passing a net ID instead of a gpu index, which could cause
crashes if those were unrelated (and they usually are).

Issue #931
---
 src/graph/search.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/graph/search.cc b/src/graph/search.cc
index 3b13c48..bb83d5d 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -530,7 +530,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
 
     // NVLS needs to balance on all NICs
     if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
-      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
+      if (graph->nChannels < netcount) {
+        int gpu;
+        NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu));
+        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
+      }
     } else {
       if (graph->nChannels > 0) {
         // Try to replay the last channel