2.9.8-1
Fix memory leaks. Fix a crash in the bootstrap error case. Fix a CollNet clean-up issue. Make the PCI switch vendor/device attributes optional for XML injection. Add support for the nvidia-peermem module.
This commit is contained in:
parent
a46ea10583
commit
ca8485b0d0
@ -1,6 +1,6 @@
|
|||||||
##### version
|
##### version
|
||||||
NCCL_MAJOR := 2
|
NCCL_MAJOR := 2
|
||||||
NCCL_MINOR := 9
|
NCCL_MINOR := 9
|
||||||
NCCL_PATCH := 6
|
NCCL_PATCH := 8
|
||||||
NCCL_SUFFIX :=
|
NCCL_SUFFIX :=
|
||||||
PKG_REVISION := 1
|
PKG_REVISION := 1
|
||||||
|
@ -515,10 +515,11 @@ ncclResult_t bootstrapClose(void* commState) {
|
|||||||
|
|
||||||
ncclResult_t bootstrapAbort(void* commState) {
|
ncclResult_t bootstrapAbort(void* commState) {
|
||||||
struct extState* state = (struct extState*)commState;
|
struct extState* state = (struct extState*)commState;
|
||||||
close(state->extListenFd);
|
if (commState == NULL) return ncclSuccess;
|
||||||
close(state->extRingSendFd);
|
if (state->extListenFd) close(state->extListenFd);
|
||||||
close(state->extRingRecvFd);
|
if (state->extRingSendFd) close(state->extRingSendFd);
|
||||||
state->allocState->stop = 2;
|
if (state->extRingRecvFd) close(state->extRingRecvFd);
|
||||||
|
if (state->allocState) state->allocState->stop = 2;
|
||||||
free(state->peerCommAddresses);
|
free(state->peerCommAddresses);
|
||||||
free(state->peerAllocAddresses);
|
free(state->peerAllocAddresses);
|
||||||
free(state);
|
free(state);
|
||||||
|
@ -212,6 +212,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
|
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
|
||||||
INFO(NCCL_GRAPH, "%s", line);
|
INFO(NCCL_GRAPH, "%s", line);
|
||||||
}
|
}
|
||||||
|
free(heads);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -764,7 +764,7 @@ search:
|
|||||||
for (int g=0; g<ngpus; g++) {
|
for (int g=0; g<ngpus; g++) {
|
||||||
printf("%d ", graph->intra[c*ngpus+g]);
|
printf("%d ", graph->intra[c*ngpus+g]);
|
||||||
}
|
}
|
||||||
printf("[%d %d]", graph->inter[0], graph->inter[1]);
|
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -469,26 +469,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
|||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
||||||
}
|
}
|
||||||
|
ncclDebugNoWarn = 1;
|
||||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
||||||
if (index == -1) {
|
if (index == -1) {
|
||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) getPciPath(busId, &path);
|
||||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
|
if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor");
|
||||||
}
|
}
|
||||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
|
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
|
||||||
if (index == -1) {
|
if (index == -1) {
|
||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) getPciPath(busId, &path);
|
||||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
|
if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device");
|
||||||
}
|
}
|
||||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
|
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
|
||||||
if (index == -1) {
|
if (index == -1) {
|
||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) getPciPath(busId, &path);
|
||||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
|
if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor");
|
||||||
}
|
}
|
||||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
|
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
|
||||||
if (index == -1) {
|
if (index == -1) {
|
||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) getPciPath(busId, &path);
|
||||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
|
if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device");
|
||||||
}
|
}
|
||||||
|
ncclDebugNoWarn = 0;
|
||||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
|
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
|
||||||
if (index == -1) {
|
if (index == -1) {
|
||||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||||
|
@ -164,6 +164,7 @@ ncclResult_t ncclGroupEnd() {
|
|||||||
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
|
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
|
||||||
ncclResult_t ret = ncclGroupError;
|
ncclResult_t ret = ncclGroupError;
|
||||||
int usingCudaGraphAll = -1;
|
int usingCudaGraphAll = -1;
|
||||||
|
cudaGraph_t* graphs = NULL;
|
||||||
if (ret != ncclSuccess) goto group_cleanup;
|
if (ret != ncclSuccess) goto group_cleanup;
|
||||||
|
|
||||||
/* Launch async ncclCommInitRank */
|
/* Launch async ncclCommInitRank */
|
||||||
@ -307,7 +308,6 @@ sched_delta:
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Check whether we are in cuda graph mode
|
// Check whether we are in cuda graph mode
|
||||||
cudaGraph_t* graphs;
|
|
||||||
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
|
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
|
||||||
for (int i=0; i<ncclGroupIndex; i++) {
|
for (int i=0; i<ncclGroupIndex; i++) {
|
||||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||||
@ -407,5 +407,6 @@ end:
|
|||||||
ncclGroupError = ncclSuccess;
|
ncclGroupError = ncclSuccess;
|
||||||
ncclGroupIndex = 0;
|
ncclGroupIndex = 0;
|
||||||
CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
|
CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
|
||||||
|
if (graphs) free(graphs);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -760,6 +760,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
collNetSetupFail = 1;
|
collNetSetupFail = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
free(heads);
|
||||||
// Verify CollNet setup across ranks
|
// Verify CollNet setup across ranks
|
||||||
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
|
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
|
||||||
if (comm->collNetSupport) {
|
if (comm->collNetSupport) {
|
||||||
|
@ -241,10 +241,16 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
|
|||||||
for (int r=0; r<comm->nChannels; r++) {
|
for (int r=0; r<comm->nChannels; r++) {
|
||||||
struct ncclChannel* channel = comm->channels+r;
|
struct ncclChannel* channel = comm->channels+r;
|
||||||
struct ncclPeer* peer = channel->peers+nranks;
|
struct ncclPeer* peer = channel->peers+nranks;
|
||||||
if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources));
|
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||||
if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources));
|
struct ncclConnector* send = peer->send + b;
|
||||||
peer->send->transportResources = NULL; // avoid double free
|
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
||||||
peer->recv->transportResources = NULL; // avoid double free
|
send->transportResources = NULL; // avoid double free
|
||||||
|
}
|
||||||
|
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||||
|
struct ncclConnector* recv = peer->recv + b;
|
||||||
|
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
|
||||||
|
recv->transportResources = NULL; // avoid double free
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Set support to 0
|
// Set support to 0
|
||||||
comm->collNetSupport = 0;
|
comm->collNetSupport = 0;
|
||||||
|
@ -220,7 +220,10 @@ ncclResult_t ncclIbDevices(int* ndev) {
|
|||||||
ncclResult_t ncclIbGdrSupport(int ibDev) {
|
ncclResult_t ncclIbGdrSupport(int ibDev) {
|
||||||
static int moduleLoaded = -1;
|
static int moduleLoaded = -1;
|
||||||
if (moduleLoaded == -1) {
|
if (moduleLoaded == -1) {
|
||||||
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
|
// Check for the nv_peer_mem module being loaded
|
||||||
|
moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
|
||||||
|
// Also support the new nvidia-peermem module
|
||||||
|
(access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == -1)) ? 0 : 1;
|
||||||
}
|
}
|
||||||
if (moduleLoaded == 0) return ncclSystemError;
|
if (moduleLoaded == 0) return ncclSystemError;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user