From f36540f55a15683a121b6c330657af442b85c796 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Fri, 17 Apr 2020 10:03:14 -0700 Subject: [PATCH] Fix crash when only a subset of GPUs are visible within a container. Fixes #326. --- src/graph/topo.cc | 1 + src/graph/xml.cc | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 3767ba9..ac6b111 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -520,6 +520,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); struct ncclXmlNode* node; NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); + if (node == NULL) continue; NCCLCHECK(xmlSetAttrInt(node, "rank", r)); NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 550cfcd..f138d0b 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -569,7 +569,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); } NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev)); - if (dev == -1) return ncclSuccess; + if (dev == -1) { *gpuNodeRet = NULL; return ncclSuccess; } NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index)); if (index == -1) {