nccl/src/graph/topo.cc
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
#include "coll_net.h"
#include <sys/stat.h>
#include <fcntl.h>
#include "xml.h"
#include "cpuset.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
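// Illustrative walk-through for the example above: starting at offset, we back
// up to the beginning of the enclosing path component ("0000:02:00.0"), convert
// its hex digits (domain:bus:device.function) to an int64 with busIdToInt64,
// then clear the low nibble (the function/subdevice digit) so that all
// functions of a device map to the same node.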
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
char* str = path+offset;
// Remove trailing "/"
if (*str == '/') str--;
// Find next /
while (*str != '/') str--;
str++;
int64_t numid;
NCCLCHECK(busIdToInt64(str, &numid));
// Ignore the subdevice: subdevices share the same PCI link, so we want to merge them into a single node.
numid -= numid & 0xf;
*id = numid;
return ncclSuccess;
}
static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
*cpu = NULL;
if (node->type == CPU) {
*cpu = node;
return ncclSuccess;
}
for (int l=0; l<node->nlinks; l++) {
if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
if (*cpu != NULL) return ncclSuccess;
}
return ncclSuccess;
}
int interCpuWidth = 0;
int cpuPciWidth = 0;
static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) {
*width = LOC_WIDTH;
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
*width = P9_WIDTH;
return ncclSuccess;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
*width = ARM_WIDTH;
return ncclSuccess;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
}
return ncclSuccess;
}
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceUnknown,
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
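// Look up an existing node of the given type by id. Note that this returns
// ncclSuccess without touching *node when no match is found, so callers are
// expected to initialize *node to NULL and check it after the call.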
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
for (int i=0; i<system->nodes[type].count; i++) {
if (system->nodes[type].nodes[i].id == id) {
*node = system->nodes[type].nodes+i;
return ncclSuccess;
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
WARN("Error : tried to create too many nodes of type %d", type);
return ncclInternalError;
}
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
system->nodes[type].count++;
n->type = type;
n->id = id;
if (type == GPU) {
// Create link to itself (used in some corner cases)
n->nlinks=1;
n->links[0].type = LINK_LOC;
n->links[0].remNode = n;
n->links[0].width = LOC_WIDTH;
n->gpu.dev = NCCL_TOPO_UNDEF;
n->gpu.rank = NCCL_TOPO_UNDEF;
n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
} else if (type == CPU) {
n->cpu.arch = NCCL_TOPO_UNDEF;
n->cpu.vendor = NCCL_TOPO_UNDEF;
n->cpu.model = NCCL_TOPO_UNDEF;
} else if (type == NET) {
n->net.asic = 0ULL;
n->net.port = NCCL_TOPO_UNDEF;
n->net.width = 0.0;
}
*node = n;
return ncclSuccess;
}
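// Remove a node from the system. Nodes of a given type are stored in a flat
// array, so after the memmove every link pointing at a node located after the
// deleted one is patched by decrementing its remNode pointer.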
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
free(delNode->paths[t]);
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
if (node == delNode) continue;
for (int l=0; l<node->nlinks; l++) {
while (l<node->nlinks && node->links[l].remNode == delNode) {
memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
node->nlinks--;
}
if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
node->links[l].remNode--;
}
}
}
}
memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
system->nodes[type].count--;
return ncclSuccess;
}
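// Add (or widen) a link from node to remNode. Calling this again with the same
// remNode and type accumulates the width, which is how several physical NVLinks
// between two devices become a single logical LINK_NVL entry; the insertion
// step below then keeps each node's links sorted by decreasing width.
//
// Illustrative usage (hypothetical nodes): four NVLinks between gpuA and gpuB
// can be registered either as one call with width 4*nvlSpeed or as four calls
// with width nvlSpeed; both produce one LINK_NVL link of the aggregated width.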
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
// Aggregate links into higher width for NVLink
struct ncclTopoLink* link;
for (link = node->links; link->remNode; link++) {
if (link->remNode == remNode && link->type == type) break;
}
if (link->remNode == NULL) node->nlinks++;
link->type = type;
link->remNode = remNode;
link->width += width;
// Sort links in BW descending order
struct ncclTopoLink linkSave;
memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
while (link != node->links) {
if ((link-1)->width >= linkSave.width) break;
memcpy(link, link-1, sizeof(struct ncclTopoLink));
link--;
}
memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
return ncclSuccess;
}
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch, since this extra level can break the search and lead
// NCCL to make wrong topology decisions.
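// Illustrative sketch of the flattening (hypothetical devices):
//
//   before:  CPU -- PEX(top) -- PEX(sub0) -- GPU0      after:  CPU -- PEX -- GPU0
//                            \_ PEX(sub1) -- GPU1                         \_ GPU1
//
// The sub switches sharing the top switch's device ID are removed and their
// downstream devices are re-attached directly to the top-level switch.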
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
for (int s=0; s<system->nodes[PCI].count; s++) {
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
uint64_t device = pciSwitch->pci.device;
// Only flatten PEX Gen 4 switches in base mode
if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
// Find sub switches with the same device ID.
int64_t* subSwIds;
NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
int subs = 0;
for (int l=0; l<pciSwitch->nlinks; l++) {
struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
// Only fuse sub switches with the same device ID.
if (sub->type != PCI || sub->pci.device != device) continue;
// Save sub switch for later
subSwIds[subs++] = sub->id;
// Remove link to that sub switch
memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink)));
pciSwitch->nlinks--;
// Don't increase l for the next iteration as we just shifted all links by one.
l--;
}
for (int s=0; s<subs; s++) {
// Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
int index;
NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
// Connect all sub PCI devices to the parent switch
for (int l=0; l<sub->nlinks; l++) {
struct ncclTopoNode* remNode = sub->links[l].remNode;
if (remNode == pciSwitch) continue;
// Add link from parent PCI switch -> PCI device
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
pciSwitch->nlinks++;
// Update link from PCI device -> parent PCI switch
for (int rl=0; rl<remNode->nlinks; rl++) {
if (remNode->links[rl].remNode == sub) {
remNode->links[rl].remNode = pciSwitch;
break;
}
}
}
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
}
// Set subdevice to 0x0000 to make sure we don't merge this switch again.
pciSwitch->pci.device = 0x1000c01010000000;
free(subSwIds);
// Restart, as system->nodes[PCI].nodes has changed.
s = 0;
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// Connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
for (int p=0; p<system->nodes[CPU].count; p++) {
if (n == p) continue;
float width;
NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
}
}
return ncclSuccess;
}
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
if (node->type == GPU) {
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
} else if (node->type == CPU) {
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if (node->type == PCI) {
sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
INFO(NCCL_GRAPH, "%s", line);
for (int i=0; i<offset; i++) line[i] = ' ';
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) continue;
if (link->type != LINK_PCI || link->remNode != prevNode) {
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
int nextOffset = strlen(line);
if (link->type == LINK_PCI) {
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if (link->remNode->type == NET) {
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
}
INFO(NCCL_GRAPH, "%s", line);
}
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
char line[1024];
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
INFO(NCCL_GRAPH, "==========================================");
NCCLCHECK(ncclTopoPrintPaths(s));
return ncclSuccess;
}
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
// Shift all links to have upLink as last link
if (upNode) {
int l=0;
while (node->links[l].remNode != upNode) l++;
struct ncclTopoLink upLink;
memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
while (node->links[l+1].remNode) {
memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
l++;
}
memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
}
// Recursively sort the PCI tree
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
}
return ncclSuccess;
}
// We want the graph to be organized in a way that eases/accelerates traversal:
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
// 4. SYS (already the case)
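// ncclTopoSort takes care of (2) and (3): for each node of the PCI tree it
// moves the link going back up to its parent to the end of the link list,
// while ncclTopoConnectNodes already keeps links sorted by decreasing width,
// which places NVLinks first and SYS links last.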
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
return ncclSuccess;
}
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
int dev;
NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
struct ncclTopoNode* net;
NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
const char* str;
NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
if (str) sscanf(str, "0x%lx", &net->net.asic);
else net->net.asic = dev;
ncclDebugNoWarn = NCCL_GRAPH;
int mbps;
if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
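// speed is reported in Mbps; e.g. a 100 Gbps port (speed=100000) converts to
// 100000 / 8000.0 = 12.5 GB/s below.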
net->net.width = mbps / 8000.0;
if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
ncclDebugNoWarn = 0;
NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
return ncclSuccess;
}
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
for (int s=0; s<xmlNic->nSubs; s++) {
struct ncclXmlNode* xmlNet = xmlNic->subs[s];
if (strcmp(xmlNet->name, "net") != 0) continue;
int index;
NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
if (index == -1) continue;
NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
}
return ncclSuccess;
}
ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
// Do not go any further; NVLinks will be added in a second pass
return ncclSuccess;
}
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
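// Illustrative conversion using the table above: a Gen3 ("8 GT/s" -> 60) x16
// link yields 16 * 60 / 80.0 = 12.0 GB/s when connected in ncclTopoAddPci below.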
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
const char* str;
int type;
NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
int64_t busId;
NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
NCCLCHECK(busIdToInt64(str, &busId));
struct ncclTopoNode* node = NULL;
struct ncclXmlNode* xmlGpu = NULL;
NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
if (xmlGpu != NULL) {
type = GPU;
int index;
NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
if (index == -1) return ncclSuccess;
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
}
struct ncclXmlNode* xmlNic = NULL;
NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
if (xmlNic != NULL) {
type = NIC;
// Ignore sub device ID and merge multi-port NICs into one PCI device.
busId &= 0xfffffffffffffff0;
struct ncclTopoNode* nicNode = NULL;
NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
if (nicNode == NULL) {
NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
node = nicNode; // Connect it to parent later on
}
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
} else if (type == PCI) {
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 48;
NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 32;
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 16;
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
if (str) node->pci.device += strtol(str, NULL, 0);
for (int s=0; s<xmlPci->nSubs; s++) {
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
}
}
if (node) {
int width, speed;
NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
// Manage cases where the link width/speed was not indicated in /sys
if (width == 0) width = 16;
NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
}
return ncclSuccess;
}
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
int numaId;
NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
struct ncclTopoNode* cpu;
NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
const char* str;
NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
if (str != NULL) {
NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
}
NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
int familyId, modelId;
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
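// Family 6 / model 0x55 (85) corresponds to Skylake-SP and newer server CPUs;
// older Intel parts keep the default (pre-Skylake) QPI width in ncclTopoGetInterCpuWidth.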
cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
} else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
int familyId, modelId;
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
}
}
for (int s=0; s<xmlCpu->nSubs; s++) {
struct ncclXmlNode* node = xmlCpu->subs[s];
if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
if (strcmp(node->name, "nic") == 0) {
struct ncclTopoNode* nic = NULL;
NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
if (nic == NULL) {
NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
}
NCCLCHECK(ncclTopoAddNic(node, system, nic));
}
}
return ncclSuccess;
}
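// Recursively walk the XML tree looking for <nvlink> tags. They sit under their
// GPU's <pci> node, so the bus id of the enclosing node is carried down as
// parentBusId and used to locate the source GPU of each link.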
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
if (strcmp(node->name, "nvlink") == 0) {
struct ncclTopoNode* gpu = NULL;
int64_t pBusId;
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx", pBusId);
return ncclInternalError;
}
int count;
NCCLCHECK(xmlGetAttrInt(node, "count", &count));
const char* targetClass;
NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
int targetType;
NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
struct ncclTopoNode* remote = NULL;
if (targetType == GPU) {
// NVL P2P connection to another GPU
const char* target;
NCCLCHECK(xmlGetAttrStr(node, "target", &target));
int64_t busId;
NCCLCHECK(busIdToInt64(target, &busId));
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
} else if (targetType == CPU) {
// NVL connection to the local CPU
NCCLCHECK(findLocalCpu(gpu, &remote));
} else {
if (system->nodes[NVS].count == 0) {
NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
} else {
remote = system->nodes[NVS].nodes;
}
}
if (remote) {
float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
if (remote->type != GPU) {
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
}
}
} else {
const char* busId;
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
for (int s=0; s<node->nSubs; s++) {
NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
}
}
return ncclSuccess;
}
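// Build the topology system from a parsed XML tree: add each CPU (NUMA node)
// with the PCI/NIC/GPU trees below it, then the NVLink connections, flatten
// BCM Gen4 switches, interconnect the CPUs with SYS links, and finally sort
// each node's links for traversal.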
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
NCCLCHECK(ncclCalloc(topoSystem, 1));
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
for (int s=0; s<topNode->nSubs; s++) {
struct ncclXmlNode* node = topNode->subs[s];
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
}
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
NCCLCHECK(ncclTopoSortSystem(*topoSystem));
return ncclSuccess;
}
NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
// Only set values if not already set
static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
int index;
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
}
return ncclSuccess;
}
static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
int index;
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
}
return ncclSuccess;
}
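// Build the topology for this communicator. The XML either comes from
// NCCL_TOPO_FILE, from the default virtual topology file if present, or is
// detected: GPUs of all ranks on this host and network devices (collnet first,
// then net) are filled in, unused XML branches are trimmed, the result is
// optionally dumped (NCCL_TOPO_DUMP_FILE) and then converted into a
// ncclTopoSystem.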
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
} else {
// Try default XML topology location
NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
}
if (xml->maxIndex == 0) {
// Create top tag
struct ncclXmlNode* top;
NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
}
// Auto-detect GPUs if needed
for (int r=0; r<comm->nRanks; r++) {
if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
}
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (ncclCollNet) {
NCCLCHECK(collNetDevices(&netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
}
}
if (netDevCount == 0) {
NCCLCHECK(ncclNetDevices(&netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
}
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
free(xml);
return ncclSuccess;
}
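// Select a network device for the given GPU rank: keep the NET nodes whose
// path to the GPU has the highest width (ties broken by the best path type),
// then round-robin among those candidates using rr.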
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
int g;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
int minType = PATH_SYS;
float maxWidth = 0;
int count = 0;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) {
maxWidth = path->width;
minType = path->type;
count = 0;
}
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
*id = nets[rr%count];
free(nets);
return ncclSuccess;
}
/****************************/
/* External query functions */
/****************************/
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
*arch = system->nodes[CPU].nodes[0].cpu.arch;
*vendor = system->nodes[CPU].nodes[0].cpu.vendor;
*model = system->nodes[CPU].nodes[0].cpu.model;
return ncclSuccess;
}
NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
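// Compute the CPU affinity to use for a rank: by default the intersection of
// the process' current affinity with the affinity of the CPU (NUMA node)
// closest to the rank's GPU; with NCCL_IGNORE_CPU_AFFINITY=1 the NUMA node's
// affinity is used as-is.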
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
struct ncclTopoNode* cpu = NULL, *gpu = NULL;
for (int g=0; g<system->nodes[GPU].count; g++) {
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
gpu = system->nodes[GPU].nodes+g;
// Find the closest CPU
int cpuIndex = -1, minHops = 0;
for (int c=0; c<system->nodes[CPU].count; c++) {
int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
if (cpuIndex == -1 || nHops < minHops) {
cpuIndex = c;
minHops = nHops;
}
}
cpu = system->nodes[CPU].nodes+cpuIndex;
}
}
if (cpu == NULL) {
WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
return ncclInternalError;
}
// Query the CPU affinity set we were provided
cpu_set_t mask;
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
}
#endif
// Get the affinity of the CPU close to our GPU.
cpu_set_t cpuMask = cpu->cpu.affinity;
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
}
#endif
cpu_set_t finalMask;
if (ncclParamIgnoreCpuAffinity())
// Ignore the CPU affinity set and use the GPU one instead
finalMask = cpuMask;
else
// Use a subset of the GPU affinity set
CPU_AND(&finalMask, &mask, &cpuMask);
memcpy(affinity, &finalMask, sizeof(cpu_set_t));
// If there is a non-empty set, use it to set affinity
if (CPU_COUNT(&finalMask)) {
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
}
return ncclSuccess;
}
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
*count = system->nodes[NET].count;
return ncclSuccess;
}
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
if (system->nodes[GPU].count == 0) return ncclInternalError;
int min, max;
min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
for (int g=1; g<system->nodes[GPU].count; g++) {
min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
}
if (ccMin) *ccMin = min;
if (ccMax) *ccMax = max;
return ncclSuccess;
}