Fix crash when NVSwitch is not visible inside a VM.
This commit is contained in:
Sylvain Jeaugey 2020-06-23 14:08:49 -07:00
parent 01afd20a77
commit 1952325569
2 changed files with 9 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
##### version ##### version
NCCL_MAJOR := 2 NCCL_MAJOR := 2
NCCL_MINOR := 7 NCCL_MINOR := 7
NCCL_PATCH := 5 NCCL_PATCH := 6
NCCL_SUFFIX := NCCL_SUFFIX :=
PKG_REVISION := 1 PKG_REVISION := 1

View File

@@ -640,11 +640,16 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
if (index == -1) { if (index == -1) {
const char* busId; const char* busId;
NCCLCHECK(xmlGetAttr(sub, "target", &busId)); NCCLCHECK(xmlGetAttr(sub, "target", &busId));
if (strcmp(busId, "fffffff:ffff:ff") == 0) {
// Remote NVLink device is not visible inside this VM. Assume NVSwitch.
NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000"));
} else {
char* path; char* path;
NCCLCHECK(getPciPath(busId, &path)); NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
} }
} }
}
*gpuNodeRet = gpuNode; *gpuNodeRet = gpuNode;
return ncclSuccess; return ncclSuccess;
} }