nccl/src/graph/paths.cc
/*************************************************************************
 * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths

struct ncclTopoNodeList {
  struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES];
  int count;
};

static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
  for (int i=0; i<system->nodes[t].count; i++) {
    if (system->nodes[t].nodes[i].id == id) {
      *path = node->paths[t]+i;
      return ncclSuccess;
    }
  }
  WARN("Could not find node of type %d id %lx", t, id);
  return ncclInternalError;
}
NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);

static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
  if (baseNode->paths[baseNode->type] == NULL) {
    NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
  }

  // breadth-first search to set all paths to that node in the system
  struct ncclTopoNodeList nodeList;
  struct ncclTopoNodeList nextNodeList;
  nodeList.count = 1; nodeList.list[0] = baseNode;
  nextNodeList.count = 0;
  struct ncclTopoLinkList* basePath;
  NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
  basePath->count = 0;
  basePath->width = LOC_WIDTH;
  basePath->type = PATH_LOC;

  while (nodeList.count) {
    nextNodeList.count = 0;
    for (int n=0; n<nodeList.count; n++) {
      struct ncclTopoNode* node = nodeList.list[n];
      struct ncclTopoLinkList* path;
      NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
      for (int l=0; l<node->nlinks; l++) {
        struct ncclTopoLink* link = node->links+l;
        struct ncclTopoNode* remNode = link->remNode;
        if (remNode->paths[baseNode->type] == NULL) {
          NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
        }
        struct ncclTopoLinkList* remPath;
        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
        float width = std::min(path->width, link->width);

        // allow routing through a GPU only as 1 hop
        if (node != baseNode && node->type == GPU &&
            (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;

        if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
          // Find reverse link
          for (int l=0; l<remNode->nlinks; l++) {
            if (remNode->links[l].remNode == node) {
              remPath->list[0] = remNode->links+l;
              break;
            }
          }
          if (remPath->list[0] == NULL) {
            WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
                 remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
            return ncclInternalError;
          }
          // Copy the rest of the path
          for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
          remPath->count = path->count + 1;
          remPath->width = width;

          // Start with path type = link type. PATH and LINK types are supposed to match.
          // Don't consider LINK_NET as we only care about the NIC->GPU path.
          int type = link->type == LINK_NET ? LINK_LOC : link->type;
          // Differentiate between one and multiple PCI switches
          if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
          // Consider a path going through the CPU as PATH_PHB
          if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
          // Mark NVLink paths that go through an intermediate GPU hop as NVB
          if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
          remPath->type = std::max(path->type, type);

          // Add to the list for the next iteration if not already in the list
          int i;
          for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
          if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
        }
      }
    }
    memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
  }
  return ncclSuccess;
}
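/*
 * Worked example (a sketch; the topology is hypothetical): with baseNode =
 * GPU/0, the first BFS pass sets a 1-hop path on GPU/0's PCI bridge by
 * storing the reverse link at list[0]; the second pass extends that to the
 * CPU, giving CPU -> bridge -> GPU/0 with width = min of the link widths
 * along the way. Paths are stored in the remNode-to-baseNode direction,
 * which is why the reverse link is prepended before the existing path.
 */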
static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
  char line[1024];
#ifdef ENABLE_TRACE
  INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
#else
  sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
  int offset = strlen(line);
#endif
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
    if (node->paths[t] == NULL) continue;
    for (int n = 0; n<system->nodes[t].count; n++) {
#ifdef ENABLE_TRACE
      line[0] = 0;
      int offset = 0;
      for (int i=0; i<node->paths[t][n].count; i++) {
        struct ncclTopoLink* link = node->paths[t][n].list[i];
        struct ncclTopoNode* remNode = link->remNode;
        sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
        offset = strlen(line);
      }
      INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width);
#else
      sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]);
      offset = strlen(line);
#endif
    }
  }
#ifndef ENABLE_TRACE
  INFO(NCCL_GRAPH, "%s", line);
#endif
}
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
  for (int i=0; i<system->nodes[GPU].count; i++) {
    printNodePaths(system, system->nodes[GPU].nodes+i);
  }
  for (int i=0; i<system->nodes[NET].count; i++) {
    printNodePaths(system, system->nodes[NET].nodes+i);
  }
  return ncclSuccess;
}
static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
  // Find the closest CPU to a GPU
  int minHops = 0;
  int localCpu = -1;
  struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
  for (int c=0; c<system->nodes[CPU].count; c++) {
    int hops = paths[c].count;
    if (minHops == 0 || hops < minHops) {
      localCpu = c;
      minHops = hops;
    }
  }
  if (localCpu == -1) {
    WARN("Error: could not find CPU close to GPU %d", gpu);
    return ncclInternalError;
  }
  *retCpu = localCpu;
  return ncclSuccess;
}
static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
  // The intermediate node is usually a CPU, but can be a GPU when relaying for PXN.
  struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix;
  struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;

  int l=0;
  // Node 1 -> intermediate node
  for (int i=0; i<srcNode->paths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
  // Intermediate node -> Node 2
  for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];

  // Update path characteristics
  srcNode->paths[t2][i2].count = l;
  srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
  if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
  srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width);
  return ncclSuccess;
}
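/*
 * Example (sketch; the numbers are hypothetical): forcing GPU p -> GPU g
 * traffic through CPU c with addInterStep(system, CPU, c, GPU, p, GPU, g)
 * replaces GPU p's path to g with the concatenation of p->c and c->g. If
 * p->c has 2 hops at 12 GB/s and c->g has 2 hops at 24 GB/s, the spliced
 * path has count 4, width min(12, 24) = 12, and the max of the two types
 * (or PATH_PXN when the relay node is a GPU).
 */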
// Remove/free paths for a given type
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
    // Remove paths _to_ the given type
    for (int n=0; n<system->nodes[t].count; n++) {
      struct ncclTopoNode* node = system->nodes[t].nodes+n;
      free(node->paths[nodeType]);
      node->paths[nodeType] = NULL;
    }
    // Remove paths _from_ the given type
    for (int n=0; n<system->nodes[nodeType].count; n++) {
      struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
      free(node->paths[t]);
      node->paths[t] = NULL;
    }
  }
}
static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };

ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
  if (*level == -1) {
    int l = -1;
    if (disableEnv) {
      char* str = getenv(disableEnv);
      if (str) {
        int disable = strtol(str, NULL, 0);
        if (disable == 1) l = 0;
      }
    }
    if (l == -1) {
      char* str = getenv(levelEnv);
      if (str) {
        for (int i=0; i<=PATH_SYS; i++) {
          if (strcmp(str, topoPathTypeStr[i]) == 0) {
            l = i;
            break;
          }
        }
        // Old style numbering
        // levelsOldToNew is an array with each index corresponding to the
        // "old level" int, and each value mapping to the correct value defined in topo.h.
        // maxOldLevel is a quick check to handle out-of-bounds values (based on the length of levelsOldToNew).
        if (l == -1 && str[0] >= '0' && str[0] <= '9') {
          int oldLevel = strtol(str, NULL, 0);
          const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
          if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
          l = levelsOldToNew[oldLevel];
        }
      }
    }
    if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
    *level = l >= 0 ? l : -2;
  }
  return ncclSuccess;
}
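/*
 * Resulting mapping, assuming the usual environment usage:
 *   NCCL_P2P_LEVEL=PHB -> *level = PATH_PHB (new-style string)
 *   NCCL_P2P_LEVEL=2   -> *level = PATH_PXB (old-style numeric, via levelsOldToNew)
 *   NCCL_P2P_LEVEL=42  -> *level = PATH_SYS (clamped to maxOldLevel)
 *   NCCL_P2P_DISABLE=1 -> *level = PATH_LOC (l = 0, i.e. local only)
 *   neither set        -> *level = -2 (caller keeps its own default)
 */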
NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);

int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
  *p2p = 0;
  if (read) *read = 0;
  if (intermediateRank) *intermediateRank = -1;

  // Get GPUs from topology
  int g1, g2;
  NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
  struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
  if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
    // GPU not found, we can't use p2p.
    return ncclSuccess;
  }

  int intermediateIndex = -1;
  // Set intermediate GPU rank, if routing through an intermediate GPU.
  struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
  if (path->count == 2) {
    struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
    if (intermediateNode->type == GPU) {
      intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
      if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
    }
  }

  // In general, use P2P whenever we can.
  int p2pLevel = PATH_SYS;

  // User override
  if (ncclTopoUserP2pLevel == -1)
    NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
  if (ncclTopoUserP2pLevel != -2) {
    p2pLevel = ncclTopoUserP2pLevel;
    goto compare;
  }

  // Don't use P2P through the CPU root complex on ARM, Intel and Zhaoxin: cap the level at PXB.
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
  if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
    p2pLevel = PATH_PXB;
  }
  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    p2pLevel = PATH_PXB;
  }

compare:
  // Compute the PCI distance and compare with the p2pLevel.
  if (path->type <= p2pLevel) *p2p = 1;

  if (*p2p == 1) {
    // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
    // validate against NVML at all since they are pretending to be on other hw.
    if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) {
      int indexes[3] = {-1,-1,-1};
      int verticeN = 0;
      NCCLCHECK(ncclNvmlEnsureInitialized());

      indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev;
      if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev;
      indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev;

      for (int i=1; i < verticeN; i++) {
        nvmlGpuP2PStatus_t status;
        status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead;
        bool good = status == NVML_P2P_STATUS_OK;
        status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
        good &= status == NVML_P2P_STATUS_OK;
        if (!good) {
          if (ncclParamIgnoreDisabledP2p()) {
            *p2p = 0;
          } else if (path->type <= PATH_NVB) {
            WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
            return ncclUnhandledCudaError;
          } else if (path->type < PATH_SYS) {
            INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can suppress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
          }
        }
      }
    }
  }

  if (path->type == PATH_NVL) {
    struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
    // Enable P2P Read for Ampere/NVLink only
    if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
  }

  return ncclSuccess;
}
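/*
 * Usage sketch (hypothetical call site): a transport asks whether two GPUs,
 * identified by busId, can use CUDA P2P and whether P2P read is preferred.
 *
 *   int p2p, p2pRead, interRank;
 *   NCCLCHECK(ncclTopoCheckP2p(system, busId1, busId2, &p2p, &p2pRead, &interRank));
 *   // p2p == 1       : path type <= p2pLevel and NVML reports P2P OK
 *   // p2pRead == 1   : both GPUs are NVLink-connected with cudaCompCap 80
 *   // interRank >= 0 : the path relays through that GPU (2-hop NVB route)
 */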
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
int ncclTopoUserGdrLevel = -1;

ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
  *useGdr = 0;

  // Get GPU and NET
  int n, g;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
  struct ncclTopoNode* net = system->nodes[NET].nodes+n;
  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;

  // Check that both the NIC and GPUs support it
  if (net->net.gdrSupport == 0) return ncclSuccess;
  if (gpu->gpu.gdrSupport == 0) return ncclSuccess;

  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
    if (gdrReadParam < 0) {
      int nvlink = 0;
      // Since we don't know whether there are other communicators,
      // it's better to keep things local if we have a single GPU.
      if (system->nodes[GPU].count == 1) nvlink = 1;
      for (int i=0; i<system->nodes[GPU].count; i++) {
        if (i == g) continue;
        if (gpu->paths[GPU][i].type == PATH_NVL) {
          nvlink = 1;
          break;
        }
      }
      if (!nvlink) return ncclSuccess;
    }
  }

  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = PATH_PXB;
  NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
  if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
  int distance = gpu->paths[NET][n].type;
  if (distance == PATH_PXN) {
    // In case of PXN, use the intermediate GPU distance instead
    int proxyRank, g;
    NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
    NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
    struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
    distance = proxyGpu->paths[NET][n].type;
  }
  if (distance > netGdrLevel) {
    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
    return ncclSuccess;
  }

  *useGdr = 1;
  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
  return ncclSuccess;
}
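/*
 * Example decision (hypothetical distances), using the default netGdrLevel
 * of PATH_PXB: a GPU and NIC under the same PCI switch (PATH_PIX/PATH_PXB)
 * enable GDR; a pair that has to cross the CPU root complex (PATH_PHB) does
 * not, unless NCCL_NET_GDR_LEVEL raises the threshold. For PXN paths the
 * distance of the intermediate GPU is used, since that GPU is the one
 * actually talking to the NIC.
 */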
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
  // Get GPU and NET
  int n, g;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
  struct ncclTopoLinkList* path = gpu->paths[NET]+n;
  if (path->type == PATH_PXN) {
    // Walk past any NVSwitch hops; the first non-NVS node must be the intermediate GPU.
    struct ncclTopoNode* node;
    int type = NVS;
    for (int i=0; i<path->count && type == NVS; i++) {
      node = path->list[i]->remNode;
      type = node->type;
    }
    if (type != GPU) {
      WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
      return ncclInternalError;
    }
    *intermediateRank = node->gpu.rank;
  } else {
    *intermediateRank = rank;
  }
  return ncclSuccess;
}
NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0);

// Net v4 plugins don't have non-blocking connect/accept. We therefore can't
// use remote proxies without risking deadlocks.
int ncclPxnDisable() {
  static int pxnDisable = -1;
  if (pxnDisable == -1) {
    if (ncclNetVersion() == 4) {
      INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
      pxnDisable = 1;
    } else {
      pxnDisable = ncclParamPxnDisable();
    }
  }
  return pxnDisable;
}
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) {
  struct ncclTopoSystem* system = comm->topo;
  *nranks = 0;
  *intermediateRanks = NULL;
  if (system->nodes[NET].count == 0) return ncclSuccess;

  int nr = 0;
  int* ranks = NULL;
  for (int rank=0; rank<comm->nRanks; rank++) {
    int netDev, proxyRank;
    NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
    if (proxyRank == comm->rank) continue;
    int useGdr;
    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
    if (useGdr == 0) continue;
    int found = 0;
    for (int r=0; r<nr; r++) {
      if (ranks[r] == proxyRank) found = 1;
    }
    if (!found) {
      NCCLCHECK(ncclRealloc(&ranks, nr, nr+1));
      ranks[nr++] = proxyRank;
    }
  }
  *nranks = nr;
  *intermediateRanks = ranks;
  return ncclSuccess;
}
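/*
 * Usage sketch (hypothetical caller; ownership of the returned array passing
 * to the caller is an assumption here):
 *
 *   int nranks, *pxnRanks;
 *   NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnRanks, &nranks));
 *   for (int i=0; i<nranks; i++) {
 *     // pxnRanks[i] is a peer GPU this rank relays through to reach a NIC
 *   }
 *   free(pxnRanks);
 */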
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
  // Precompute paths between GPUs/NICs.

  // Remove everything in case we're re-computing
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);

  // Set direct paths from/to CPUs. We need them in many cases.
  for (int c=0; c<system->nodes[CPU].count; c++) {
    NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
  }

  // Set direct paths from/to GPUs.
  for (int g=0; g<system->nodes[GPU].count; g++) {
    // Compute paths to GPU g
    NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));

    // Update path when we don't want to / can't use GPU Direct P2P
    for (int p=0; p<system->nodes[GPU].count; p++) {
      int p2p;
      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
      if (p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
        NCCLCHECK(getLocalCpu(system, g, &cpu));
        NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
      }
    }

    if (peerInfos == NULL) continue;
    // Remove GPUs we can't talk to because of containers.
    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank;
    for (int p=0; p<system->nodes[GPU].count; p++) {
      if (p == g) continue;
      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank;
      int shm;
      NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
      int p2p;
      NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
      if (shm == 0 && p2p == 0) {
        // Mark this peer as inaccessible. We'll trim it later.
        system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
      }
    }
  }

  // Set direct paths from/to NICs.
  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
    NCCLCHECK(ncclTopoSetPaths(netNode, system));

    for (int g=0; g<system->nodes[GPU].count; g++) {
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
        int pxnGpu = -1;

        for (int p=0; p<system->nodes[GPU].count; p++) {
          if (p == g) continue;

          // PXN = PCI + NVLink.
          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
          if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;

          pxnGpu = p;

          int netDev;
          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
          // To ensure proper balancing, preferably use a local GPU which advertised that NIC as its preferred one.
          if (netDev == netNode->id) break;
        }
        if (pxnGpu != -1) {
          // We can use that GPU as relay to communicate with that NIC.
          // Only enabling it in the GPU->NIC direction for now to favor
          // receiving locally and sending remotely (consistent with net.cc)
          NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
        }
      }
      // Update path when we don't want to / can't use GPU Direct RDMA.
      int gdr;
      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
      if (gdr == 0) {
        // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
        int localCpu;
        NCCLCHECK(getLocalCpu(system, g, &localCpu));
        NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
        NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
      }
    }
  }
  return ncclSuccess;
}
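/*
 * Net effect (sketch with a hypothetical node): after this pass every
 * GPU<->GPU and GPU<->NET pair has a usable path. A GPU pair that failed
 * ncclTopoCheckP2p is rerouted through a nearby CPU; a GPU that sits at
 * PATH_PHB from a NIC, but has an NVLink peer at PATH_PXB from that NIC,
 * gets a PATH_PXN route through the peer in the GPU->NIC direction; a GPU
 * without GDR support gets CPU-relayed paths in both directions.
 */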
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
  int *domains;
  int64_t *ids;
  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
  NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
  int myDomain = 0;
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    domains[g] = g;
    ids[g] = gpu->id;
    for (int p=0; p<g; p++) {
      if (gpu->paths[GPU][p].count > 0) {
        domains[g] = std::min(domains[g], domains[p]);
      }
    }
    if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
  }

  int ngpus = system->nodes[GPU].count;
  for (int i=0; i<ngpus; i++) {
    if (domains[i] == myDomain) continue;
    struct ncclTopoNode* gpu = NULL;
    int g;
    for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
      gpu = system->nodes[GPU].nodes+g;
      if (gpu->id == ids[i]) break; else gpu=NULL;
    }
    if (gpu == NULL) {
      WARN("Could not find id %lx", ids[i]);
      free(domains);
      free(ids);
      return ncclInternalError;
    }
    NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
  }

  if (system->nodes[GPU].count == comm->nRanks) {
    for (int n=system->nodes[NET].count-1; n>=0; n--)
      NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
  }
  free(domains);
  free(ids);
  return ncclSuccess;
}
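/*
 * Worked example (hypothetical 4-GPU system): if only the pairs (0,1) and
 * (2,3) have non-empty GPU paths, domains converges to {0, 0, 2, 2}. A rank
 * on GPU 1 keeps domain 0, so GPUs 2 and 3 are trimmed from its view of the
 * topology; if the GPU count left then equals comm->nRanks, the communicator
 * is intra-node and the NET nodes are removed as well.
 */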
void ncclTopoFree(struct ncclTopoSystem* system) {
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
  free(system);
}
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2);

static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
  int peer;
  struct ncclTopoLinkList* path = NULL;
  if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) {
    // Same rank
    if (g == peer) {
      *nChannels = -1;
      return ncclSuccess;
    }
    // Local rank
    path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
    if (path->type == PATH_NVL) {
      float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
      *nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
    } else {
      *nChannels = 2;
    }
  } else {
    // Remote rank, use network
    *nChannels = ncclParamNChannelsPerNetPeer();
  }
  return ncclSuccess;
}
NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);

static int nextPow2(int v) {
  int pow2 = 1;
  while (pow2 < v) pow2 <<= 1;
  return pow2;
}
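// Quick check of the helper above: nextPow2(1) == 1, nextPow2(5) == 8,
// nextPow2(8) == 8.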
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
  comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
  int minChannels = comm->p2pnChannels;
  // We need to loop through all local GPUs to have a global picture
  for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
    for (int r=0; r<comm->nRanks; r++) {
      int nChannels;
      NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels));
      if (nChannels >= 0) minChannels = std::min(minChannels, nChannels);
    }
  }

  // Round to next pow2 nChannelsPerPeer and nChannels
  comm->p2pnChannelsPerPeer = nextPow2(minChannels);
  comm->p2pnChannels = nextPow2(comm->p2pnChannels);

  // Init channels that weren't used so far
  for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));

  // We want to spread the channels we use when there aren't many, and
  // progressively fill the whole space of nChannels. To do so we mirror the
  // bits in the nChannels space.
  for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
    int mirror = 0;
    for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
    comm->p2pChannels[c] = mirror;
  }
  INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
  return ncclSuccess;
}
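/*
 * Bit-mirror example (assuming p2pnChannels == 8, i.e. 3 bits): each channel
 * index is bit-reversed to pick the p2p channel:
 *   c = 0 (000b) -> 0 (000b)
 *   c = 1 (001b) -> 4 (100b)
 *   c = 2 (010b) -> 2 (010b)
 *   c = 3 (011b) -> 6 (110b)
 * With p2pnChannelsPerPeer == 2, a peer uses channels {0, 4} rather than the
 * adjacent {0, 1}, so the used channels are spread across the whole space
 * instead of clustered at the start.
 */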
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) {
  int ngpus = system->nodes[GPU].count;
  NCCLCHECK(ncclCalloc(ranks, ngpus));
  int nvbGpus = 0;
  for (int g=0; g<ngpus; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    if (gpu->gpu.rank != rank) continue;
    for (int p=0; p<ngpus; p++) {
      if (gpu->paths[GPU][p].type == PATH_NVB) {
        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
      }
    }
  }
  *nranks = nvbGpus;
  return ncclSuccess;
}