diff --git a/makefiles/version.mk b/makefiles/version.mk index 6a40a92..78c601f 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 9 -NCCL_PATCH := 6 +NCCL_PATCH := 8 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/bootstrap.cc b/src/bootstrap.cc index d452f91..ff58c42 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -515,10 +515,11 @@ ncclResult_t bootstrapClose(void* commState) { ncclResult_t bootstrapAbort(void* commState) { struct extState* state = (struct extState*)commState; - close(state->extListenFd); - close(state->extRingSendFd); - close(state->extRingRecvFd); - state->allocState->stop = 2; + if (commState == NULL) return ncclSuccess; + if (state->extListenFd) close(state->extListenFd); + if (state->extRingSendFd) close(state->extRingSendFd); + if (state->extRingRecvFd) close(state->extRingRecvFd); + if (state->allocState) state->allocState->stop = 2; free(state->peerCommAddresses); free(state->peerAllocAddresses); free(state); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index b06ea5d..a26611e 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -212,6 +212,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift); INFO(NCCL_GRAPH, "%s", line); } + free(heads); return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index cc9358b..7ced017 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -764,7 +764,7 @@ search: for (int g=0; gintra[c*ngpus+g]); } - printf("[%d %d]", graph->inter[0], graph->inter[1]); + printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]); printf("\n"); } #endif diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 91e8f94..f94d9e8 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -469,26 +469,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } + ncclDebugNoWarn = 1; NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); if (index == -1) { - if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); - NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor")); + if (path == NULL) getPciPath(busId, &path); + if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); if (index == -1) { - if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); - NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device")); + if (path == NULL) getPciPath(busId, &path); + if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); if (index == -1) { - if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); - NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor")); + if (path == NULL) getPciPath(busId, &path); + if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); if (index == -1) { - if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); - NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device")); + if (path == NULL) getPciPath(busId, &path); + if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"); } + ncclDebugNoWarn = 0; NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); diff --git a/src/group.cc b/src/group.cc index b695f3a..382b61e 100644 --- a/src/group.cc +++ b/src/group.cc @@ -164,6 +164,7 @@ ncclResult_t ncclGroupEnd() { for (int i=0; icollNetSupport) { diff --git a/src/transport.cc b/src/transport.cc index c7c841b..ad910aa 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -241,10 +241,16 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa for (int r=0; rnChannels; r++) { struct ncclChannel* channel = comm->channels+r; struct ncclPeer* peer = channel->peers+nranks; - if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources)); - if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources)); - peer->send->transportResources = NULL; // avoid double free - peer->recv->transportResources = NULL; // avoid double free + for (int b=0; bsend + b; + if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources)); + send->transportResources = NULL; // avoid double free + } + for (int b=0; brecv + b; + if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources)); + recv->transportResources = NULL; // avoid double free + } } // Set support to 0 comm->collNetSupport = 0; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index b399318..d867e3e 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -220,7 +220,10 @@ ncclResult_t ncclIbDevices(int* ndev) { ncclResult_t ncclIbGdrSupport(int ibDev) { static int moduleLoaded = -1; if (moduleLoaded == -1) { - moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1; + // Check for the nv_peer_mem module being loaded + moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) && + // Also support the new nvidia-peermem module + (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == -1)) ? 0 : 1; } if (moduleLoaded == 0) return ncclSystemError; return ncclSuccess;