Fix crash when only a subset of GPUs are visible within a container.

Fixes #326.
This commit is contained in:
Sylvain Jeaugey 2020-04-17 10:03:14 -07:00
parent 23a9fbb788
commit f36540f55a
2 changed files with 2 additions and 1 deletion

View File

@@ -520,6 +520,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}

View File

@@ -569,7 +569,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
}
NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
if (dev == -1) return ncclSuccess;
if (dev == -1) { *gpuNodeRet = NULL; return ncclSuccess; }
NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
if (index == -1) {