Minimize the performance impact of the device kernel profiling support when the profiler plugin is not loaded. Reduce the overhead of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Fix the exchange of enhanced connection establishment (ECE) options to address potential slowdowns on networks utilizing RoCE. Test whether cuMem host allocations work and, if not, disable them. These allocations, enabled by default since NCCL 2.24 when the CUDA driver version is at least 12.6, rely on NUMA support, which is not available by default under Docker. We recommend invoking Docker with "--cap-add SYS_NICE" to enable it. Fix an initialization error when running with NCCL_NET_GDR_C2C=1 on multiple MNNVL domains with non-uniform network configurations across nodes. Fix the printing of sub-seconds in the debug log when using a custom NCCL_DEBUG_TIMESTAMP_FORMAT setting.
134 lines
6.0 KiB
C
134 lines
6.0 KiB
C
/*************************************************************************
|
|
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_GRAPH_H_
|
|
#define NCCL_GRAPH_H_
|
|
|
|
#include "nccl.h"
|
|
#include "device.h"
|
|
#include <limits.h>
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <sched.h>
|
|
|
|
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
|
|
|
|
struct ncclTopoSystem;
|
|
// Build the topology
|
|
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile=NULL);
|
|
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
|
|
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
|
|
|
|
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
|
|
void ncclTopoFree(struct ncclTopoSystem* system);
|
|
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
|
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
|
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
|
ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink);
|
|
|
|
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
|
|
|
|
// Query topology
|
|
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
|
|
ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
|
|
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
|
|
/*
 * Mode values for the gdrMode argument that ncclTopoCheckGdr() takes by
 * pointer.
 * NOTE(review): the precise meaning of each mode is decided by the
 * implementation, which is not visible from this header -- confirm there
 * before relying on the per-value notes below.
 */
enum ncclTopoGdrMode {
  ncclTopoGdrModeDisable = 0,  // presumably "GDR not used" -- confirm
  ncclTopoGdrModeDefault = 1,
  ncclTopoGdrModePci = 2,
  ncclTopoGdrModeNum = 3       // equals the number of modes listed above; presumably a count, not a mode
};
|
|
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode);
|
|
ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush);
|
|
ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail);
|
|
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
|
|
int ncclPxnDisable(struct ncclComm* comm);
|
|
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
|
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
|
|
|
|
// Find CPU affinity
|
|
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
|
|
|
|
// Integer identifiers for CPU architecture, vendor and model.
// NOTE(review): presumably these are the values ncclTopoCpuType() stores
// through its arch / vendor / model pointer arguments -- confirm against the
// implementation.

// CPU architecture
#define NCCL_TOPO_CPU_ARCH_X86 1
#define NCCL_TOPO_CPU_ARCH_POWER 2
#define NCCL_TOPO_CPU_ARCH_ARM 3
#define NCCL_TOPO_CPU_ARCH_MIXED 4

// CPU vendor
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
#define NCCL_TOPO_CPU_VENDOR_AMD 2
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
#define NCCL_TOPO_CPU_VENDOR_MIXED 4

// CPU model. YONGFENG reuses the value 1 also used by INTEL_BDW, so a model
// value is presumably only meaningful together with its vendor -- confirm.
#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1
#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2
#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3
#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4
#define NCCL_TOPO_CPU_MODEL_YONGFENG 1
|
|
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
|
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
|
|
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
|
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
|
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev);
|
|
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
|
|
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
|
|
|
|
#define NCCL_TOPO_MAX_NODES 256
|
|
|
|
// Init search. Needs to be done before calling ncclTopoCompute
|
|
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
|
|
|
// Graph pattern identifiers.
// NOTE(review): presumably the values stored in ncclTopoGraph::pattern -- confirm.
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct
|
|
struct ncclTopoGraph {
|
|
// Input / output
|
|
int id; // ring : 0, tree : 1, collnet : 2, nvls : 3, collnetDirect : 4
|
|
int pattern;
|
|
int crossNic;
|
|
int collNet;
|
|
int minChannels;
|
|
int maxChannels;
|
|
// Output
|
|
int nChannels;
|
|
float bwIntra;
|
|
float bwInter;
|
|
float latencyInter;
|
|
int typeIntra;
|
|
int typeInter;
|
|
int sameChannels;
|
|
int nHops;
|
|
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
|
|
int64_t inter[MAXCHANNELS*2];
|
|
};
|
|
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
|
|
|
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
|
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs);
|
|
|
|
struct ncclTopoRanks {
|
|
int ringRecv[MAXCHANNELS];
|
|
int ringSend[MAXCHANNELS];
|
|
int ringPrev[MAXCHANNELS];
|
|
int ringNext[MAXCHANNELS];
|
|
int treeToParent[MAXCHANNELS];
|
|
int treeToChild0[MAXCHANNELS];
|
|
int treeToChild1[MAXCHANNELS];
|
|
int nvlsHeads[MAXCHANNELS];
|
|
int nvlsHeadNum;
|
|
};
|
|
|
|
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
|
|
|
|
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
|
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);
|
|
|
|
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
|
|
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time);
|
|
|
|
#endif
|