From c88c9f873fa680b547c4ae177c8f915d5c2dd157 Mon Sep 17 00:00:00 2001
From: Ke Wen
Date: Tue, 23 Nov 2021 14:43:14 -0800
Subject: [PATCH] Add env NCCL_NET_DISABLE_INTRA

Disable NET transport for intra-node communication by setting the env to 1.
It provides an option to error out instead of falling back to NET when
superior intra-node transports (P2P and SHM) are unavailable.
---
 src/transport.cc     |  4 ++--
 src/transport/net.cc | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/transport.cc b/src/transport.cc
index d7eadcd..2cb5538 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -36,8 +36,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
       return ncclSuccess;
     }
   }
-  WARN("No transport found !");
-  return ncclInternalError;
+  WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+  return ncclSystemError;
 }
 
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 2b548ce..5abc32d 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -56,8 +56,18 @@ struct netRecvResources {
   uint64_t llLastCleaning;
 };
 
+NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2);
+
 /* Determine if two peers can communicate with NET */
 ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  // Same host?
+  if (info1->hostHash == info2->hostHash) {
+    // User disabled NET for intra-node?
+    if (ncclParamNetDisableIntra() == 1) {
+      *ret = 0;
+      return ncclSuccess;
+    }
+  }
   *ret = 1;
   return ncclSuccess;
 }