/*************************************************************************
 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
#include "coll_net.h"
// NOTE(review): the names of the next two system headers were lost in the
// copy of this file that was reviewed; restored as in upstream NCCL - confirm.
#include <sys/stat.h>
#include <fcntl.h>
#include "xml.h"
#include "cpuset.h"

#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))

// Printable names, indexed by node type / link type / path type.
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };

/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/

// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
// The path component ending at path[offset] is converted with busIdToInt64()
// and its low 4 bits (the sub-device) are cleared.
// NOTE(review): minOffset is not used, and the backward scan for '/' has no
// lower bound; the caller must guarantee a '/' precedes path[offset].
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
  char* str = path+offset;
  // Remove trailing "/"
  if (*str == '/') str--;
  // Find next /
  while (*str != '/') str--;
  str++;
  int64_t numid;
  NCCLCHECK(busIdToInt64(str, &numid));
  // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
  numid -= numid & 0xf;
  *id = numid;
  return ncclSuccess;
}

// Walk LINK_PCI links (depth first) starting from node until a CPU node is
// reached. *cpu is left NULL when none is found.
static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
  *cpu = NULL;
  if (node->type == CPU) {
    *cpu = node;
    return ncclSuccess;
  }
  for (int l=0; l<node->nlinks; l++) {
    if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
    if (*cpu != NULL) return ncclSuccess;
  }
  return ncclSuccess;
}

int interCpuWidth = 0;
int cpuPciWidth = 0;

// Select the CPU-to-CPU link width from the CPU arch/vendor/model.
// Falls back to LOC_WIDTH when no rule matches.
static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) {
  *width = LOC_WIDTH;
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
    *width = P9_WIDTH;
    return ncclSuccess;
  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
    *width = ARM_WIDTH;
    return ncclSuccess;
  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
    *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
  }
  return ncclSuccess;
}

enum ncclNvLinkDeviceType {
  ncclNvLinkDeviceUnknown,
  ncclNvLinkDeviceGpu,
  ncclNvLinkDeviceSwitch,
  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};

// Look up the node of the given type and id. *node is only written when a
// match is found, so callers must initialize it (to NULL) beforehand.
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  for (int i=0; i<system->nodes[type].count; i++) {
    if (system->nodes[type].nodes[i].id == id) {
      *node = system->nodes[type].nodes+i;
      return ncclSuccess;
    }
  }
  return ncclSuccess;
}

// Append a new node of the given type and id to the system and set its
// type-specific defaults. Fails with ncclInternalError when the per-type
// array already holds NCCL_TOPO_MAX_NODES nodes.
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
    WARN("Error : tried to create too many nodes of type %d", type);
    return ncclInternalError;
  }
  struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
  system->nodes[type].count++;
  n->type = type;
  n->id = id;
  if (type == GPU) {
    // Create link to itself (used in some corner cases)
    n->nlinks=1;
    n->links[0].type = LINK_LOC;
    n->links[0].remNode = n;
    n->links[0].width = LOC_WIDTH;
    n->gpu.dev = NCCL_TOPO_UNDEF;
    n->gpu.rank = NCCL_TOPO_UNDEF;
    n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
  } else if (type == CPU) {
    n->cpu.arch = NCCL_TOPO_UNDEF;
    n->cpu.vendor = NCCL_TOPO_UNDEF;
    n->cpu.model = NCCL_TOPO_UNDEF;
  } else if (type == NET) {
    n->net.asic = 0ULL;
    n->net.port = NCCL_TOPO_UNDEF;
    n->net.width = 0.0;
  }
  *node = n;
  return ncclSuccess;
}

// Remove node `index` of the given type: free its paths, drop every link
// pointing to it, and shift down the nodes stored after it (fixing up the
// remNode pointers which referenced the shifted nodes).
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
  struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
    free(delNode->paths[t]);
    for (int n=0; n<system->nodes[t].count; n++) {
      struct ncclTopoNode* node = system->nodes[t].nodes+n;
      if (node == delNode) continue;
      for (int l=0; l<node->nlinks; l++) {
        // Remove all links to the deleted node (there may be several in a row)
        while (l<node->nlinks && node->links[l].remNode == delNode) {
          memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
          node->nlinks--;
        }
        // Nodes of the same type located after delNode are about to move down by one
        if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
          node->links[l].remNode--;
        }
      }
    }
  }
  memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
  system->nodes[type].count--;
  return ncclSuccess;
}

// Add `width` to the (node -> remNode, type) link, creating the link if it
// does not exist yet, then move it so that links stay sorted by decreasing width.
// NOTE(review): the scan stops on the first link with a NULL remNode and does
// not check the capacity of node->links; it presumably relies on unused slots
// being zeroed and on the array never being full - confirm against topo.h.
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
  // Aggregate links into higher width for NVLink
  struct ncclTopoLink* link;
  for (link = node->links; link->remNode; link++) {
    if (link->remNode == remNode && link->type == type) break;
  }
  if (link->remNode == NULL) node->nlinks++;
  link->type = type;
  link->remNode = remNode;
  link->width += width;

  // Sort links in BW descending order
  struct ncclTopoLink linkSave;
  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
  while (link != node->links) {
    if ((link-1)->width >= linkSave.width) break;
    memcpy(link, link-1, sizeof(struct ncclTopoLink));
    link--;
  }
  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
  return ncclSuccess;
}

// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// NCCL take wrong topology decisions.
// NOTE(review): subSwIds is not released when one of the NCCLCHECK calls below
// returns early, and pciSwitch is used after ncclTopoRemoveNode(), which is
// only valid if sub switches are stored after their parent - confirm.
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
  for (int s=0; s<system->nodes[PCI].count; s++) {
    struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
    uint64_t device = pciSwitch->pci.device;
    // Only flatten PEX Gen 4 switches in base mode
    if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
      // Find sub switches with the same device ID.
      int64_t* subSwIds;
      NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
      int subs = 0;
      for (int l=0; l<pciSwitch->nlinks; l++) {
        struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
        // Only fuse sub switches with the same device ID.
        if (sub->type != PCI || sub->pci.device != device) continue;
        // Save sub switch for later
        subSwIds[subs++] = sub->id;
        // Remove link to that sub switch
        memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink)));
        pciSwitch->nlinks--;
        // Don't increase l for the next iteration as we just shifted all links by one.
        l--;
      }

      for (int s=0; s<subs; s++) {
        // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
        int index;
        NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
        struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
        // Connect all sub PCI devices to the parent switch
        for (int l=0; l<sub->nlinks; l++) {
          struct ncclTopoNode* remNode = sub->links[l].remNode;
          if (remNode == pciSwitch) continue;
          // Add link from parent PCI switch -> PCI device
          memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
          pciSwitch->nlinks++;
          // Update link from PCI device -> parent PCI switch
          for (int rl=0; rl<remNode->nlinks; rl++) {
            if (remNode->links[rl].remNode == sub) {
              remNode->links[rl].remNode = pciSwitch;
              break;
            }
          }
        }
        NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
      }
      // Set subdevice to 0x0000 to make sure we don't merge this switch again.
      pciSwitch->pci.device = 0x1000c01010000000;
      free(subSwIds);
      // Restart, as system->nodes[PCI].nodes has changed.
      s = 0;
    }
  }
  return ncclSuccess;
}

// Create a LINK_SYS link between every ordered pair of distinct CPU nodes,
// using the width computed from the source CPU.
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
  // And connect all CPU nodes together
  for (int n=0; n<system->nodes[CPU].count; n++) {
    for (int p=0; p<system->nodes[CPU].count; p++) {
      if (n == p) continue;
      float width;
      NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
    }
  }
  return ncclSuccess;
}

// Print `node` at column `offset` of `line`, then its links. LINK_PCI links
// are followed recursively (except the one leading back to prevNode); other
// links only print their remote node.
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
  if (node->type == GPU) {
    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
  } else if (node->type == CPU) {
    sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
  } else if (node->type == PCI) {
    sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
  } else {
    sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
  }
  INFO(NCCL_GRAPH, "%s", line);
  // Blank out the prefix so that children are printed indented under this node
  for (int i=0; i<offset; i++) line[i] = ' ';

  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_LOC) continue;
    if (link->type != LINK_PCI || link->remNode != prevNode) {
      sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
      int nextOffset = strlen(line);
      if (link->type == LINK_PCI) {
        NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
      } else {
        if (link->remNode->type == NET) {
          sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
        } else {
          sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
        }
        INFO(NCCL_GRAPH, "%s", line);
      }
    }
  }
  return ncclSuccess;
}

// Log the whole topology (one tree per CPU node), then the paths.
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
  INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
  char line[1024];
  for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
  INFO(NCCL_GRAPH, "==========================================");
  NCCLCHECK(ncclTopoPrintPaths(s));
  return ncclSuccess;
}

// Move the link going up to upNode (if any) to the end of node's link list,
// then recurse into the PCI children.
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    int l=0;
    while (node->links[l].remNode != upNode) l++;
    struct ncclTopoLink upLink;
    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
    while (node->links[l+1].remNode) {
      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
      l++;
    }
    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
  }

  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}

// We want the graph to be organized to ease/accelerate traversal :
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
// 4. SYS (already the case)
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
  return ncclSuccess;
}

// Create a NET node from a <net> XML node and connect it to its NIC in both
// directions. Only "dev" is mandatory; every other attribute has a default.
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  int dev;
  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));

  struct ncclTopoNode* net;
  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
  if (str) sscanf(str, "0x%lx", &net->net.asic);
  else net->net.asic = dev;

  // The attributes below are optional: silence warnings while probing them
  ncclDebugNoWarn = NCCL_GRAPH;
  int mbps;
  if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
  if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
  net->net.width = mbps / 8000.0;
  if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
  if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
  ncclDebugNoWarn = 0;

  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
  NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
  return ncclSuccess;
}

// Add every <net> child of a <nic> XML node which has a "dev" attribute.
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
    if (strcmp(xmlNet->name, "net") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
    if (index == -1) continue;
    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
  }
  return ncclSuccess;
}

// Fill the GPU fields of a node from the "sm", "rank", "dev" and "gdr"
// attributes of a <gpu> XML node (all four are required).
ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
  // Do not go any further, nvlinks will be added in a second pass
  return ncclSuccess;
}

// PCI class string -> node type
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
// PCI link speed string -> per-lane speed
struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane

// Add the node described by a <pci> XML node (GPU, NIC or PCI switch, the
// latter recursively with its children) and connect it to `parent` with a
// LINK_PCI link whose width comes from link_width and link_speed.
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
  const char* str;

  int type;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
  NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

  int64_t busId;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
  NCCLCHECK(busIdToInt64(str, &busId));

  struct ncclTopoNode* node = NULL;
  struct ncclXmlNode* xmlGpu = NULL;
  NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
  if (xmlGpu != NULL) {
    type = GPU;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
    // GPUs without a rank are not part of this communicator: ignore them
    if (index == -1) return ncclSuccess;
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
  }
  struct ncclXmlNode* xmlNic = NULL;
  NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
  if (xmlNic != NULL) {
    type = NIC;
    // Ignore sub device ID and merge multi-port NICs into one PCI device.
    busId &= 0xfffffffffffffff0;
    struct ncclTopoNode* nicNode = NULL;
    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
    if (nicNode == NULL) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
      node = nicNode; // Connect it to parent later on
    }
    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
  } else if (type == PCI) {
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    // Pack vendor/device/subsystem_vendor/subsystem_device into a single 64-bit id
    NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
    if (str) node->pci.device += strtol(str, NULL, 0) << 48;
    NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
    if (str) node->pci.device += strtol(str, NULL, 0) << 32;
    NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
    if (str) node->pci.device += strtol(str, NULL, 0) << 16;
    NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
    if (str) node->pci.device += strtol(str, NULL, 0);

    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
    }
  }

  if (node) {
    int width, speed;
    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

    // Manage cases where speed was not indicated in /sys
    if (width == 0) width = 16;
    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
  }
  return ncclSuccess;
}

struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
// x86 CPUID vendor strings are 12 characters long, hence the two-space padding
// around "Shanghai" (the whitespace had been collapsed in the reviewed copy).
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { "  Shanghai  ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };

// Create a CPU node (id = NUMA id) from a <cpu> XML node, then add its <pci>
// children and attach its <nic> children to a NIC node with id 0.
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
  int numaId;
  NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
  struct ncclTopoNode* cpu;
  NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
  if (str != NULL) {
    NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
  }

  NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
  NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
    NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
    NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
    } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
    }
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
    struct ncclXmlNode* node = xmlCpu->subs[s];
    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
    if (strcmp(node->name, "nic") == 0) {
      // NICs listed directly under a CPU all share the NIC node with id 0
      struct ncclTopoNode* nic = NULL;
      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
      if (nic == NULL) {
        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
      }
      NCCLCHECK(ncclTopoAddNic(node, system, nic));
    }
  }
  return ncclSuccess;
}

// Second pass over the XML tree: for each <nvlink> node, connect the GPU whose
// bus id is parentBusId to its target (another GPU, the local CPU, or the
// single NVS node). Other nodes are traversed recursively, their "busid"
// attribute (when present) becoming the parent bus id of their children.
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
  if (strcmp(node->name, "nvlink") == 0) {
    struct ncclTopoNode* gpu = NULL;
    int64_t pBusId;
    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
    if (gpu == NULL) {
      WARN("Add NVLink error : could not find GPU %lx", pBusId);
      return ncclInternalError;
    }
    int count;
    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
    const char* targetClass;
    NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
    int targetType;
    NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
    struct ncclTopoNode* remote = NULL;
    if (targetType == GPU) {
      // NVL P2P connection to another GPU
      const char* target;
      NCCLCHECK(xmlGetAttrStr(node, "target", &target));
      int64_t busId;
      NCCLCHECK(busIdToInt64(target, &busId));
      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
    } else if (targetType == CPU) {
      // NVL connection to the local CPU
      NCCLCHECK(findLocalCpu(gpu, &remote));
    } else {
      if (system->nodes[NVS].count == 0) {
        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
      } else {
        remote = system->nodes[NVS].nodes;
      }
    }
    if (remote) {
      float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
      NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
      // The reverse GPU -> GPU link is added when the remote GPU's own nvlink node is processed
      if (remote->type != GPU) {
        NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
      }
    }
  } else {
    const char* busId;
    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
    }
  }
  return ncclSuccess;
}

// Build a topology system from the <system> tag of an XML description:
// CPUs (with their PCI trees), NVLinks, BCM switch flattening, inter-CPU
// links, and finally link sorting. *topoSystem is allocated here.
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
  NCCLCHECK(ncclCalloc(topoSystem, 1));
  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(xml, "system", &topNode));
  for (int s=0; s<topNode->nSubs; s++) {
    struct ncclXmlNode* node = topNode->subs[s];
    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
  }
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));

  NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
  NCCLCHECK(ncclTopoSortSystem(*topoSystem));

  return ncclSuccess;
}

NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);

// Only set values if not already set
static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
  int index;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
  if (index == -1) {
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
    snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
  }
  return ncclSuccess;
}

// Same as xmlInitAttrInt, for a 64-bit value written in hexadecimal.
static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
  int index;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
  if (index == -1) {
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
    snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
  }
  return ncclSuccess;
}

// Fill `xml` (from NCCL_TOPO_FILE or the default location, completed by
// auto-detection of the local GPUs and NICs), optionally dump it to
// NCCL_TOPO_DUMP_FILE, and convert it into a topology system.
// `xml` is owned by the caller, which releases it whatever the outcome.
static ncclResult_t ncclTopoGetSystemImpl(struct ncclComm* comm, struct ncclXml* xml, struct ncclTopoSystem** system) {
  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
    INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
  } else {
    // Try default XML topology location
    NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
  }
  if (xml->maxIndex == 0) {
    // Create top tag
    struct ncclXmlNode* top;
    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
  }

  // Auto-detect GPUs if needed
  for (int r=0; r<comm->nRanks; r++) {
    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
      struct ncclXmlNode* node;
      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
      if (node == NULL) continue;
      NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
    }
  }
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
  // NOTE(review): everything from here down to the NCCL_TOPO_DUMP_FILE test was
  // missing from the reviewed copy of this file and has been restored to match
  // upstream NCCL (it sets the attributes read by ncclTopoAddNet) - confirm.
  int netDevCount = 0;
  if (ncclCollNet) {
    NCCLCHECK(collNetDevices(&netDevCount));
    for (int n=0; n<netDevCount; n++) {
      ncclNetProperties_t props;
      NCCLCHECK(collNetGetProperties(n, &props));
      struct ncclXmlNode* netNode;
      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
    }
  }
  if (netDevCount == 0) {
    NCCLCHECK(ncclNetDevices(&netDevCount));
  }
  for (int n=0; n<netDevCount; n++) {
    ncclNetProperties_t props;
    NCCLCHECK(ncclNetGetProperties(n, &props));
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
  }

  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECK(ncclTopoTrimXml(xml));

  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
  }

  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
  return ncclSuccess;
}

// Detect the topology of the local node and build the corresponding system.
// The temporary XML description is released on success and on failure alike.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclXml* xml;
  NCCLCHECK(ncclCalloc(&xml, 1));
  ncclResult_t res = ncclTopoGetSystemImpl(comm, xml, system);
  free(xml);
  return res;
}

// Select a NET device for `rank`: among the NET nodes having the widest path
// to the rank's GPU (ties broken by the smallest path type), return the id of
// the (rr modulo number of candidates)-th one.
// *id is set to -1 when no NET node qualifies.
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
  int g;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
  int minType = PATH_SYS;
  float maxWidth = 0;
  int count = 0;
  int* nets;
  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
    if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) {
      // Better candidate found: restart the list
      maxWidth = path->width;
      minType = path->type;
      count = 0;
    }
    if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
  }
  if (count == 0) {
    // No candidate (e.g. no NET node at all): rr%count would divide by zero
    *id = -1;
    free(nets);
    return ncclSuccess;
  }
  *id = nets[rr%count];
  free(nets);
  return ncclSuccess;
}

/****************************/
/* External query functions */
/****************************/

// Report the arch/vendor/model of the first CPU node.
// NOTE(review): reads nodes[CPU].nodes[0] without checking the CPU count.
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
  *arch = system->nodes[CPU].nodes[0].cpu.arch;
  *vendor = system->nodes[CPU].nodes[0].cpu.vendor;
  *model = system->nodes[CPU].nodes[0].cpu.model;
  return ncclSuccess;
}

NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);

// Compute the CPU affinity to use for `rank`: the affinity of the CPU closest
// (fewest hops) to the rank's GPU, intersected with the affinity of the
// calling thread unless NCCL_IGNORE_CPU_AFFINITY is set.
// Returns ncclInternalError when the GPU or a CPU cannot be found.
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
  struct ncclTopoNode* cpu = NULL, *gpu = NULL;
  for (int g=0; g<system->nodes[GPU].count; g++) {
    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
      gpu = system->nodes[GPU].nodes+g;
      // Find closer CPU
      int cpuIndex = -1, minHops = 0;
      for (int c=0; c<system->nodes[CPU].count; c++) {
        int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
        if (cpuIndex == -1 || nHops < minHops) {
          cpuIndex = c;
          minHops = nHops;
        }
      }
      // Without any CPU node cpuIndex is still -1: do not build a pointer from it
      if (cpuIndex != -1) cpu = system->nodes[CPU].nodes+cpuIndex;
    }
  }
  if (cpu == NULL) {
    WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
    return ncclInternalError;
  }

  // Query the CPU affinity set we were provided
  cpu_set_t mask;
  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
  }
#endif

  // Get the affinity of the CPU close to our GPU.
  cpu_set_t cpuMask = cpu->cpu.affinity;

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
  }
#endif

  cpu_set_t finalMask;
  if (ncclParamIgnoreCpuAffinity())
    // Ignore the CPU affinity set and use the GPU one instead
    finalMask = cpuMask;
  else
    // Use a subset of the GPU affinity set
    CPU_AND(&finalMask, &mask, &cpuMask);

  memcpy(affinity, &finalMask, sizeof(cpu_set_t));

  // If there is a non empty set, use it to set affinity
  if (CPU_COUNT(&finalMask)) {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
  }
  return ncclSuccess;
}

// Return the number of NET nodes in the system.
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
  *count = system->nodes[NET].count;
  return ncclSuccess;
}

// Return the lowest and highest CUDA compute capability over all GPU nodes.
// Either output pointer may be NULL. Fails when the system has no GPU.
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
  if (system->nodes[GPU].count == 0) return ncclInternalError;
  int min, max;
  min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
  for (int g=1; g<system->nodes[GPU].count; g++) {
    min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
    max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
  }
  if (ccMin) *ccMin = min;
  if (ccMax) *ccMax = max;
  return ncclSuccess;
}