Add env NCCL_NET_DISABLE_INTRA
Disable the NET transport for intra-node communication by setting the env to 1. This provides an option to error out instead of falling back to NET when the superior intra-node transports (P2P and SHM) are unavailable.
This commit is contained in:
parent
8cf7325d69
commit
c88c9f873f
@ -36,8 +36,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
WARN("No transport found !");
|
WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||||
return ncclInternalError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||||
|
@ -56,8 +56,18 @@ struct netRecvResources {
|
|||||||
uint64_t llLastCleaning;
|
uint64_t llLastCleaning;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2);
|
||||||
|
|
||||||
/* Determine if two peers can communicate with NET */
|
/* Determine if two peers can communicate with NET */
|
||||||
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||||
|
// Same host?
|
||||||
|
if (info1->hostHash == info2->hostHash) {
|
||||||
|
// User disabled NET for intra-node?
|
||||||
|
if (ncclParamNetDisableIntra() == 1) {
|
||||||
|
*ret = 0;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
}
|
||||||
*ret = 1;
|
*ret = 1;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user