Fix memory leaks. Fix crash in bootstrap error case. Fix Collnet clean-up issue. Make PCI switch vendor/device optional for XML injection. Add support for nvidia-peermem module.
260 lines
11 KiB
C++
260 lines
11 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "comm.h"
|
|
#include "info.h"
|
|
#include "bootstrap.h"
|
|
|
|
extern struct ncclTransport p2pTransport;
|
|
extern struct ncclTransport shmTransport;
|
|
extern struct ncclTransport netTransport;
|
|
|
|
struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
|
p2pTransport,
|
|
shmTransport,
|
|
netTransport,
|
|
};
|
|
|
|
template <int type>
|
|
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) {
|
|
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
|
|
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
|
|
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
|
|
comm->channels[channelId].peers[peer].recv + connIndex;
|
|
for (int t=0; t<NTRANSPORTS; t++) {
|
|
struct ncclTransport *transport = ncclTransports+t;
|
|
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
|
int ret = 0;
|
|
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
|
if (ret) {
|
|
connector->transportComm = transportComm;
|
|
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
|
|
return ncclSuccess;
|
|
}
|
|
}
|
|
WARN("No transport found !");
|
|
return ncclInternalError;
|
|
}
|
|
|
|
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
|
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
|
uint32_t mask = 1 << channel->id;
|
|
for (int i=0; i<nrecv; i++) {
|
|
int peer = peerRecv[i];
|
|
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
|
|
comm->connectRecv[peer] |= mask;
|
|
}
|
|
for (int i=0; i<nsend; i++) {
|
|
int peer = peerSend[i];
|
|
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
|
|
comm->connectSend[peer] |= mask;
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
void dumpData(struct ncclConnect* data, int ndata) {
|
|
for (int n=0; n<ndata; n++) {
|
|
printf("[%d] ", n);
|
|
uint8_t* d = (uint8_t*)data;
|
|
for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]);
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) {
|
|
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
|
cudaStream_t transportSetupStream;
|
|
CUDACHECK(cudaStreamCreateWithFlags(&transportSetupStream, cudaStreamNonBlocking));
|
|
|
|
struct ncclConnect data[2*MAXCHANNELS];
|
|
for (int i=1; i<comm->nRanks; i++) {
|
|
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
|
|
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
|
int sendPeer = (comm->rank + i) % comm->nRanks;
|
|
uint32_t recvMask = comm->connectRecv[recvPeer];
|
|
uint32_t sendMask = comm->connectSend[sendPeer];
|
|
|
|
struct ncclConnect* recvData = data;
|
|
int sendChannels = 0, recvChannels = 0;
|
|
for (int c=0; c<MAXCHANNELS; c++) {
|
|
if (recvMask & (1<<c)) {
|
|
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex));
|
|
}
|
|
}
|
|
struct ncclConnect* sendData = recvData+recvChannels;
|
|
for (int c=0; c<MAXCHANNELS; c++) {
|
|
if (sendMask & (1<<c)) {
|
|
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex));
|
|
}
|
|
}
|
|
|
|
if (sendPeer == recvPeer) {
|
|
if (recvChannels+sendChannels) {
|
|
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
|
|
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
|
|
sendData = data;
|
|
recvData = data+sendChannels;
|
|
}
|
|
} else {
|
|
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
|
|
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
|
|
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
|
|
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
|
|
}
|
|
|
|
for (int c=0; c<MAXCHANNELS; c++) {
|
|
if (sendMask & (1<<c)) {
|
|
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
|
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
|
conn->connected = 1;
|
|
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
|
|
}
|
|
}
|
|
for (int c=0; c<MAXCHANNELS; c++) {
|
|
if (recvMask & (1<<c)) {
|
|
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
|
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
|
conn->connected = 1;
|
|
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
|
|
}
|
|
}
|
|
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
|
|
}
|
|
CUDACHECK(cudaStreamSynchronize(transportSetupStream));
|
|
CUDACHECK(cudaStreamDestroy(transportSetupStream));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
extern struct ncclTransport collNetTransport;
|
|
|
|
// All ranks must participate in collNetSetup call
|
|
// return: 0 - unsupported, 1 - supported
|
|
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
|
|
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) {
|
|
int rank = comm->rank;
|
|
int nranks = comm->nRanks;
|
|
int nMasters = comm->nNodes;
|
|
int rankInCollNet = -1;
|
|
int supported = 0;
|
|
int isMaster = (rank == masterRank) ? 1 : 0;
|
|
struct {
|
|
int collNetRank;
|
|
ncclConnect connect;
|
|
} sendrecvExchange;
|
|
|
|
// check if we can connect to collnet, whose root is the nranks-th rank
|
|
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
|
|
peerInfo->rank = nranks;
|
|
int ret = 1;
|
|
if (isMaster) {
|
|
NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo));
|
|
}
|
|
|
|
// send master receives connect info from peer recv master
|
|
if (isMaster && type == collNetSend) {
|
|
NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)));
|
|
rankInCollNet = sendrecvExchange.collNetRank;
|
|
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
|
}
|
|
|
|
// select
|
|
struct ncclPeer* root = channel->peers+nranks;
|
|
// connector index: 0 for recv, 1 for send
|
|
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
|
|
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
|
|
conn->transportComm = transportComm;
|
|
// setup
|
|
struct ncclConnect myConnect;
|
|
if (isMaster && ret > 0) {
|
|
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
|
|
}
|
|
// prepare connect handles
|
|
ncclResult_t res;
|
|
struct {
|
|
int isMaster;
|
|
ncclConnect connect;
|
|
} *allConnects = NULL;
|
|
ncclConnect *masterConnects = NULL;
|
|
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
|
|
if (type == collNetRecv) { // recv side: AllGather
|
|
// all ranks must participate
|
|
NCCLCHECK(ncclCalloc(&allConnects, nranks));
|
|
allConnects[rank].isMaster = isMaster;
|
|
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
|
|
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
|
|
// consolidate
|
|
int c = 0;
|
|
for (int r = 0; r < nranks; r++) {
|
|
if (allConnects[r].isMaster) {
|
|
memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
|
|
if (r == rank) rankInCollNet = c;
|
|
c++;
|
|
}
|
|
}
|
|
} else { // send side : copy in connect info received from peer recv master
|
|
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
|
}
|
|
// connect
|
|
if (isMaster && ret > 0) {
|
|
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
|
struct ncclPeer* devRoot = channel->devPeers+nranks;
|
|
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
|
CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup);
|
|
}
|
|
// recv side sends connect info to send side
|
|
if (isMaster && type == collNetRecv) {
|
|
sendrecvExchange.collNetRank = rankInCollNet;
|
|
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
|
|
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
|
|
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
|
}
|
|
if (ret > 0) {
|
|
supported = 1;
|
|
}
|
|
cleanup:
|
|
if (allConnects != NULL) free(allConnects);
|
|
if (masterConnects != NULL) free(masterConnects);
|
|
return supported;
|
|
}
|
|
|
|
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
|
|
int rank = comm->rank;
|
|
int nranks = comm->nRanks;
|
|
// AllGather collNet setup results
|
|
int* allGatherFailures;
|
|
NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
|
|
allGatherFailures[rank] = collNetSetupFail;
|
|
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int)));
|
|
for (int i=0; i<nranks; i++) {
|
|
if (allGatherFailures[i] != 0) {
|
|
collNetSetupFail = 1;
|
|
break;
|
|
}
|
|
}
|
|
free(allGatherFailures);
|
|
if (collNetSetupFail) {
|
|
if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
|
|
// Free collNet resources
|
|
for (int r=0; r<comm->nChannels; r++) {
|
|
struct ncclChannel* channel = comm->channels+r;
|
|
struct ncclPeer* peer = channel->peers+nranks;
|
|
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
|
struct ncclConnector* send = peer->send + b;
|
|
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
|
send->transportResources = NULL; // avoid double free
|
|
}
|
|
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
|
struct ncclConnector* recv = peer->recv + b;
|
|
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
|
|
recv->transportResources = NULL; // avoid double free
|
|
}
|
|
}
|
|
// Set support to 0
|
|
comm->collNetSupport = 0;
|
|
}
|
|
return ncclSuccess;
|
|
}
|