Fix crash when NVSwitch is not visible inside a VM.
This commit is contained in:
Sylvain Jeaugey 2020-06-23 14:08:49 -07:00
parent 01afd20a77
commit 1952325569
2 changed files with 9 additions and 4 deletions

View File

@ -1,6 +1,6 @@
##### version ##### version
NCCL_MAJOR := 2 NCCL_MAJOR := 2
NCCL_MINOR := 7 NCCL_MINOR := 7
NCCL_PATCH := 5 NCCL_PATCH := 6
NCCL_SUFFIX := NCCL_SUFFIX :=
PKG_REVISION := 1 PKG_REVISION := 1

View File

@ -640,9 +640,14 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
if (index == -1) { if (index == -1) {
const char* busId; const char* busId;
NCCLCHECK(xmlGetAttr(sub, "target", &busId)); NCCLCHECK(xmlGetAttr(sub, "target", &busId));
char* path; if (strcmp(busId, "fffffff:ffff:ff") == 0) {
NCCLCHECK(getPciPath(busId, &path)); // Remote NVLink device is not visible inside this VM. Assume NVSwitch.
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000"));
} else {
char* path;
NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
}
} }
} }
*gpuNodeRet = gpuNode; *gpuNodeRet = gpuNode;