Add env NCCL_NET_DISABLE_INTRA

Setting the environment variable NCCL_NET_DISABLE_INTRA to 1 disables the NET transport for intra-node communication (peers on the same host).
This gives users the option to error out, instead of silently falling back to NET, when the superior intra-node transports (P2P and SHM) are unavailable.
This commit is contained in:
Ke Wen 2021-11-23 14:43:14 -08:00 committed by Sylvain Jeaugey
parent 8cf7325d69
commit c88c9f873f
2 changed files with 12 additions and 2 deletions

View File

@ -36,8 +36,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("No transport found !"); WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
return ncclInternalError; return ncclSystemError;
} }
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {

View File

@ -56,8 +56,18 @@ struct netRecvResources {
uint64_t llLastCleaning; uint64_t llLastCleaning;
}; };
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2);
/* Determine if two peers can communicate with NET */ /* Determine if two peers can communicate with NET */
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// Same host?
if (info1->hostHash == info2->hostHash) {
// User disabled NET for intra-node?
if (ncclParamNetDisableIntra() == 1) {
*ret = 0;
return ncclSuccess;
}
}
*ret = 1; *ret = 1;
return ncclSuccess; return ncclSuccess;
} }