Fix memory leaks.
Fix crash in bootstrap error case.
Fix CollNet clean-up issue.
Make PCI switch vendor/device optional for XML injection.
Add support for nvidia-peermem module.
This commit is contained in:
Sylvain Jeaugey 2021-04-26 14:24:50 -07:00
parent a46ea10583
commit ca8485b0d0
9 changed files with 35 additions and 20 deletions

View File

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 9
NCCL_PATCH := 6
NCCL_PATCH := 8
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@ -515,10 +515,11 @@ ncclResult_t bootstrapClose(void* commState) {
ncclResult_t bootstrapAbort(void* commState) {
struct extState* state = (struct extState*)commState;
close(state->extListenFd);
close(state->extRingSendFd);
close(state->extRingRecvFd);
state->allocState->stop = 2;
if (commState == NULL) return ncclSuccess;
if (state->extListenFd) close(state->extListenFd);
if (state->extRingSendFd) close(state->extRingSendFd);
if (state->extRingRecvFd) close(state->extRingRecvFd);
if (state->allocState) state->allocState->stop = 2;
free(state->peerCommAddresses);
free(state->peerAllocAddresses);
free(state);

View File

@ -212,6 +212,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
INFO(NCCL_GRAPH, "%s", line);
}
free(heads);
return ncclSuccess;
}

View File

@ -764,7 +764,7 @@ search:
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[0], graph->inter[1]);
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
printf("\n");
}
#endif

View File

@ -469,26 +469,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
}
ncclDebugNoWarn = 1;
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
if (path == NULL) getPciPath(busId, &path);
if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor");
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
if (path == NULL) getPciPath(busId, &path);
if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device");
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
if (path == NULL) getPciPath(busId, &path);
if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor");
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
if (path == NULL) getPciPath(busId, &path);
if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device");
}
ncclDebugNoWarn = 0;
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));

View File

@ -164,6 +164,7 @@ ncclResult_t ncclGroupEnd() {
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
ncclResult_t ret = ncclGroupError;
int usingCudaGraphAll = -1;
cudaGraph_t* graphs = NULL;
if (ret != ncclSuccess) goto group_cleanup;
/* Launch async ncclCommInitRank */
@ -307,7 +308,6 @@ sched_delta:
*/
// Check whether we are in cuda graph mode
cudaGraph_t* graphs;
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
@ -407,5 +407,6 @@ end:
ncclGroupError = ncclSuccess;
ncclGroupIndex = 0;
CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
if (graphs) free(graphs);
return ret;
}

View File

@ -760,6 +760,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
collNetSetupFail = 1;
}
}
free(heads);
// Verify CollNet setup across ranks
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
if (comm->collNetSupport) {

View File

@ -241,10 +241,16 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
for (int r=0; r<comm->nChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
struct ncclPeer* peer = channel->peers+nranks;
if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources));
if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources));
peer->send->transportResources = NULL; // avoid double free
peer->recv->transportResources = NULL; // avoid double free
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
recv->transportResources = NULL; // avoid double free
}
}
// Set support to 0
comm->collNetSupport = 0;

View File

@ -220,7 +220,10 @@ ncclResult_t ncclIbDevices(int* ndev) {
ncclResult_t ncclIbGdrSupport(int ibDev) {
static int moduleLoaded = -1;
if (moduleLoaded == -1) {
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
// Check for the nv_peer_mem module being loaded
moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
// Also support the new nvidia-peermem module
(access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == -1)) ? 0 : 1;
}
if (moduleLoaded == 0) return ncclSystemError;
return ncclSuccess;