2.12.12-1

Improve allreduce performance when we have more than one network interface per GPU and we need to use PXN to close rings.
Add support for PCI Gen5 on 5.4 kernels.
Fix crash when setting NCCL_SET_THREAD_NAME.
Fix random crash in init due to uninitialized struct.
Fix hang on cubemesh topologies.
Add P2P_DIRECT_DISABLE parameter to disable direct access to pointers within a process.

This commit is contained in:
parent 9bfc1c6e35
commit 7aa1c46fd5
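For context, a minimal usage sketch of the new parameter follows. It assumes the usual NCCL_PARAM convention of exposing P2P_DIRECT_DISABLE as the environment variable NCCL_P2P_DIRECT_DISABLE, read when the communicator is created; the helper name is invented for illustration.

    #include <stdlib.h>
    #include <nccl.h>

    // Hypothetical helper: disable same-process direct pointer access in the
    // P2P transport. The variable must be set before communicator init, since
    // the parameter is read at init time.
    static ncclComm_t initCommWithoutDirectP2p(int nRanks, int rank, ncclUniqueId id) {
      setenv("NCCL_P2P_DIRECT_DISABLE", "1", 1);
      ncclComm_t comm = NULL;
      if (ncclCommInitRank(&comm, nRanks, id, rank) != ncclSuccess) return NULL;
      return comm;
    }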
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
 NCCL_MINOR := 12
-NCCL_PATCH := 10
+NCCL_PATCH := 12
 NCCL_SUFFIX :=
 PKG_REVISION := 1
@@ -165,8 +165,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
   memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
   pthread_t thread;
   pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
-  pthread_detach(thread); // will not be pthread_join()'d
   ncclSetThreadName(thread, "NCCL BootstrapR");
+  pthread_detach(thread); // will not be pthread_join()'d
   return ncclSuccess;
 }
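This hunk (and the matching ones in the IB transport and proxy code below) moves ncclSetThreadName before pthread_detach. A standalone sketch of the ordering, in plain pthreads rather than NCCL's wrapper, is shown below; the rationale is a plausible reading of the fix: once a thread is detached and has exited, its handle may no longer be safe to use, so naming it afterwards can crash when NCCL_SET_THREAD_NAME is enabled.

    #define _GNU_SOURCE
    #include <pthread.h>

    static void* worker(void* arg) { (void)arg; return NULL; }

    // Sketch of the corrected ordering: create, name the thread while the
    // handle is still guaranteed valid, then detach. Naming after
    // pthread_detach() risks touching a thread object that may already have
    // been reclaimed if the thread exited quickly.
    static int spawnNamedWorker(void) {
      pthread_t t;
      if (pthread_create(&t, NULL, worker, NULL) != 0) return -1;
      pthread_setname_np(t, "demo worker");  // <= 15 chars + NUL on Linux
      pthread_detach(t);                     // will not be pthread_join()'d
      return 0;
    }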
@@ -9,6 +9,7 @@
 #include "coll_net.h"
 #include "gdrwrap.h"
 #include "bootstrap.h"
+#include "channel.h"

 #include <cstring> // std::memcpy

@@ -861,20 +862,14 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
   struct ncclComm* comm = info->comm;
   int peer = info->root;
   ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
-  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
-  int peerNode = comm->rankToNode[peer];
-  int peerIndex = comm->rankToLocalRank[peer];
-  int nsteps = comm->maxLocalRanks;
-  int rankIndex = comm->rankToLocalRank[comm->rank];
+  int channelBaseId;
+  NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId));
   if (info->coll == ncclFuncSend) {
     if (peer != comm->rank) {
-      int step = (nsteps + peerIndex - rankIndex)%nsteps;
-      int delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
-      if (comm->nNodes == 1) delta = (comm->nRanks + peer - comm->rank) % comm->nRanks;
       // Mark channels that need pre-connect
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
-        int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
+        int channelId;
+        NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
         if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
           comm->connectSend[peer] |= (1<<channelId);
           comm->connect = 1;
@@ -885,13 +880,10 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
     comm->p2pSendCount++;
   } else {
     if (peer != comm->rank) {
-      int step = (nsteps + rankIndex - peerIndex)%nsteps;
-      int delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
-      if (comm->nNodes == 1) delta = (comm->nRanks - peer + comm->rank) % comm->nRanks;
       // Mark channels that need pre-connect
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
-        int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
+        int channelId;
+        NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
         if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
           comm->connectRecv[peer] |= (1<<channelId);
           comm->connect = 1;
@@ -228,6 +228,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
     }
   }
   // Old style numbering
+  // levelsOldToNew to is an array with each index corresponding to the
+  // "old level" int, and each value mapping to the correct value defined in topo.h
+  // maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
   if (l == -1 && str[0] >= '0' && str[0] <= '9') {
     int oldLevel = strtol(str, NULL, 0);
     const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
@@ -521,24 +524,27 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
       // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
+        int pxnGpu = -1;
+
         for (int p=0; p<system->nodes[GPU].count; p++) {
           if (p == g) continue;
-          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;

-          // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
-          int netDev;
-          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
-          // Make sure we can allocate memory on that GPU.
-          if (netDev != netNode->id) continue;

           // PXN = PCI + NVLink.
-          if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
+          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
+          if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
+
+          pxnGpu = p;
+
+          int netDev;
+          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
+          // To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
+          if (netDev == netNode->id) break;
+        }
+        if (pxnGpu != -1) {
           // We can use that GPU as relay to communicate with that NIC.
           // Only enabling it in the GPU->NIC direction for now to favor
           // receiving locally and sending remotely (consistent with net.cc)
-          NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
-          break;
+          NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
         }
       }
       // Update path when we dont want to / can't use GPU Direct RDMA.
@@ -371,7 +371,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s

 struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
 struct kvDict kvDictPciGen[] = {
-  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, /* Kernel 5.6 and earlier */
+  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
   { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
   { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
 ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
@@ -49,13 +49,28 @@ extern const char* topoNodeTypeStr[];
 #define LINK_NET 8
 extern const char* topoLinkTypeStr[];

+// Local (myself)
 #define PATH_LOC 0
+
+// Connection traversing NVLink
 #define PATH_NVL 1
+
+// Connection through NVLink using an intermediate GPU
 #define PATH_NVB 2
+
+// Connection traversing at most a single PCIe bridge
 #define PATH_PIX 3
+
+// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
 #define PATH_PXB 4
+
+// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
 #define PATH_PXN 5
+
+// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
 #define PATH_PHB 6
+
+// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
 #define PATH_SYS 7
 #define PATH_DIS 7
 extern const char* topoPathTypeStr[];
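The new comments document the ordering of the PATH_* levels from best to worst. As a side note, a tiny sketch of how that ordering is typically exploited is shown below; the helper name is invented for illustration and assumes the PATH_* defines above are in scope, but the comparison mirrors the gpu->paths[NET][n].type > PATH_PXB gate in the paths.cc hunk above.

    // Because PATH_LOC (0) is the best path type and PATH_DIS (7) the worst,
    // "is this path good enough?" checks reduce to integer comparisons.
    static inline int needsPxnRelay(int gpuToNicPathType) {
      // Worse than crossing a couple of PCIe bridges: consider a PXN relay GPU.
      return gpuToNicPathType > PATH_PXB;
    }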
src/group.cc (16 lines changed)
@@ -8,6 +8,7 @@
 #include "debug.h"
 #include "enqueue.h"
 #include "transport.h"
+#include "channel.h"

 #define MAX_ASYNC_OPS 128
 thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
@@ -101,18 +102,22 @@ ncclResult_t ncclGroupStart() {
   return ncclSuccess;
 }

-static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) {
+static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) {
   struct ncclInfo info = { ncclFuncSend, "Send",
     NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
     1, 1 };
+  int channelId;
+  NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
   info.channelId = channelId;
   NCCLCHECK(ncclSetupP2pKernel(&info));
   return ncclSuccess;
 }
-static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) {
+static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) {
   struct ncclInfo info = { ncclFuncRecv, "Recv",
     NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
     1, 1 };
+  int channelId;
+  NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
   info.channelId = channelId;
   NCCLCHECK(ncclSetupP2pKernel(&info));
   return ncclSuccess;
@@ -208,7 +213,6 @@ ncclResult_t ncclGroupEnd() {
       int node = comm->node;
       int nNodes = comm->nNodes;
       int localRank = comm->localRank;
-      int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;

       // Compute how much to split operations
       // Natural step size matching buffer steps.
@@ -266,8 +270,6 @@ sched_delta:
       do {
         // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
        // to use multiple channels to guarantee progress on all ranks from the same node.
-        int shuffle = comm->nNodes > 1 ? delta+(s/p2pGroupSize) : s;
-        int channelId = (shuffle+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
         ssize_t recvbytes = totRecvBytes-recvOffset;
         ssize_t sendbytes = totSendBytes-sendOffset;
         if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
@@ -277,10 +279,10 @@ sched_delta:
         if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
         if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
         if (recv) {
-          NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup);
+          NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup);
         }
         if (send) {
-          NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup);
+          NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup);
         }
         recvOffset += recvChunkSize;
         sendOffset += sendChunkSize;
@@ -10,5 +10,36 @@

 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+
+static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
+  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
+  int peerNode = comm->rankToNode[peer];
+  int peerIndex = comm->rankToLocalRank[peer];
+  int nsteps = comm->maxLocalRanks;
+  int rankIndex = comm->rankToLocalRank[comm->rank];
+  int step, delta;
+  if (coll == ncclFuncSend) {
+    step = (nsteps + peerIndex - rankIndex)%nsteps;
+    delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
+  } else if (coll == ncclFuncRecv) {
+    step = (nsteps + rankIndex - peerIndex)%nsteps;
+    delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
+  } else {
+    return ncclInternalError;
+  }
+  *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
+  *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
+  int base;
+  NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
+  NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
+  return ncclSuccess;
+}
+
 #endif
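To make the base/offset decomposition concrete, here is a small worked example under assumed values (a single node, so rankToLocalRank is the identity, maxLocalRanks = 8, p2pnChannels = 4, p2pChannels = {0,1,2,3}); the numbers are illustrative, not taken from the commit:

    rank 0 sending to peer 3:  step = (8 + 3 - 0) % 8 = 3
    nNodes == 1, so channelBase = step = 3
    channelId(c) = (3 + p2pChannels[c]) % 4  ->  3, 0, 1, 2 for c = 0..3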
src/init.cc (19 lines changed)
@@ -823,20 +823,19 @@ collnet_cleanup:
   NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
   for (int r=0; r<nvbNpeers; r++) {
     int peer = nvbPeers[r];
-    int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+    int channelId;
     for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-      int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-      if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
-        comm->connectRecv[peer] |= (1<<channelId);
-      }
-    }
-    delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
-    for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-      int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-      if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
+      NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
+      if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
         comm->connectSend[peer] |= (1<<channelId);
       }
     }
+    for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+      NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
+      if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
+        comm->connectRecv[peer] |= (1<<channelId);
+      }
+    }
   }
   NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
   free(nvbPeers);
@@ -960,12 +960,10 @@ void* ncclProxyService(void* _args) {

   struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
   struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
+  memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
   for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
     peers[s].sock.fd = pollfds[s].fd = -1;
-    peers[s].sock.abortFlag = NULL;
-    peers[s].sock.asyncFlag = 0;
     pollfds[s].events = POLLHUP|POLLIN;
-    peers[s].asyncOps.type = 0;
   }
   pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
   pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
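A generic sketch of the pattern this hunk adopts is shown below (names are illustrative, not NCCL's): zero the whole array up front so any field that is no longer assigned one by one, such as the old abortFlag/asyncFlag/asyncOps.type writes, starts in a known state rather than as stack garbage.

    #include <string.h>

    struct peerState { int fd; void* abortFlag; int asyncFlag; int opType; };

    // Zero everything first, then set only the non-zero defaults.
    static void initPeers(struct peerState* peers, int n) {
      memset(peers, 0, sizeof(struct peerState) * (size_t)n);
      for (int s = 0; s < n; s++) peers[s].fd = -1;
    }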
@@ -1066,13 +1064,13 @@ void* ncclProxyService(void* _args) {
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
   comm->proxyState.listenSock = sock;
   comm->proxyState.peerAddresses = peerAddresses;
-  ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
   return ncclSuccess;
 }

 ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
   // comm->proxyState.thread is pthread_join()'d by commFree() in init.cc
   pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
+  ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
   return ncclSuccess;
 }

@@ -223,8 +223,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
       ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;

       pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
-      pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
       ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
+      pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
       ncclNIbDevs++;
       nPorts++;
     }
@@ -127,6 +127,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop

 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
+NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);

 static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
   int p2p;
@@ -185,7 +186,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
     if (myInfo->pidHash == peerInfo->pidHash) {
-      send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
         channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     } else {
|
|||||||
if (intermediateRank == -1) {
|
if (intermediateRank == -1) {
|
||||||
info->rank = myInfo->rank;
|
info->rank = myInfo->rank;
|
||||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||||
recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||||
} else {
|
} else {
|
||||||
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||||
}
|
}
|
||||||