2.9.8-1
Fix memory leaks. Fix crash in bootstrap error case. Fix CollNet clean-up issue. Make PCI switch vendor/device optional for XML injection. Add support for nvidia-peermem module.
This commit is contained in:
parent
a46ea10583
commit
ca8485b0d0
@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 9
|
||||
NCCL_PATCH := 6
|
||||
NCCL_PATCH := 8
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
@ -515,10 +515,11 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
|
||||
ncclResult_t bootstrapAbort(void* commState) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
close(state->extListenFd);
|
||||
close(state->extRingSendFd);
|
||||
close(state->extRingRecvFd);
|
||||
state->allocState->stop = 2;
|
||||
if (commState == NULL) return ncclSuccess;
|
||||
if (state->extListenFd) close(state->extListenFd);
|
||||
if (state->extRingSendFd) close(state->extRingSendFd);
|
||||
if (state->extRingRecvFd) close(state->extRingRecvFd);
|
||||
if (state->allocState) state->allocState->stop = 2;
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerAllocAddresses);
|
||||
free(state);
|
||||
|
@ -212,6 +212,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
}
|
||||
free(heads);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
@ -764,7 +764,7 @@ search:
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
printf("%d ", graph->intra[c*ngpus+g]);
|
||||
}
|
||||
printf("[%d %d]", graph->inter[0], graph->inter[1]);
|
||||
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
|
@ -469,26 +469,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
||||
}
|
||||
ncclDebugNoWarn = 1;
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
|
||||
if (path == NULL) getPciPath(busId, &path);
|
||||
if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor");
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
|
||||
if (path == NULL) getPciPath(busId, &path);
|
||||
if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device");
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
|
||||
if (path == NULL) getPciPath(busId, &path);
|
||||
if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor");
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
|
||||
if (path == NULL) getPciPath(busId, &path);
|
||||
if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device");
|
||||
}
|
||||
ncclDebugNoWarn = 0;
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
|
@ -164,6 +164,7 @@ ncclResult_t ncclGroupEnd() {
|
||||
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
|
||||
ncclResult_t ret = ncclGroupError;
|
||||
int usingCudaGraphAll = -1;
|
||||
cudaGraph_t* graphs = NULL;
|
||||
if (ret != ncclSuccess) goto group_cleanup;
|
||||
|
||||
/* Launch async ncclCommInitRank */
|
||||
@ -307,7 +308,6 @@ sched_delta:
|
||||
*/
|
||||
|
||||
// Check whether we are in cuda graph mode
|
||||
cudaGraph_t* graphs;
|
||||
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
@ -407,5 +407,6 @@ end:
|
||||
ncclGroupError = ncclSuccess;
|
||||
ncclGroupIndex = 0;
|
||||
CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
|
||||
if (graphs) free(graphs);
|
||||
return ret;
|
||||
}
|
||||
|
@ -760,6 +760,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
collNetSetupFail = 1;
|
||||
}
|
||||
}
|
||||
free(heads);
|
||||
// Verify CollNet setup across ranks
|
||||
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
|
||||
if (comm->collNetSupport) {
|
||||
|
@ -241,10 +241,16 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
struct ncclPeer* peer = channel->peers+nranks;
|
||||
if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources));
|
||||
if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources));
|
||||
peer->send->transportResources = NULL; // avoid double free
|
||||
peer->recv->transportResources = NULL; // avoid double free
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* send = peer->send + b;
|
||||
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
||||
send->transportResources = NULL; // avoid double free
|
||||
}
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* recv = peer->recv + b;
|
||||
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
|
||||
recv->transportResources = NULL; // avoid double free
|
||||
}
|
||||
}
|
||||
// Set support to 0
|
||||
comm->collNetSupport = 0;
|
||||
|
@ -220,7 +220,10 @@ ncclResult_t ncclIbDevices(int* ndev) {
|
||||
ncclResult_t ncclIbGdrSupport(int ibDev) {
|
||||
static int moduleLoaded = -1;
|
||||
if (moduleLoaded == -1) {
|
||||
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
|
||||
// Check for the nv_peer_mem module being loaded
|
||||
moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
|
||||
// Also support the new nvidia-peermem module
|
||||
(access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == -1)) ? 0 : 1;
|
||||
}
|
||||
if (moduleLoaded == 0) return ncclSystemError;
|
||||
return ncclSuccess;
|
||||
|
Loading…
x
Reference in New Issue
Block a user