nccl/src/init.cc
Kamil Iskra 3ea7eedf3b NCCL 2.27.5-1
Improvements for GB200 systems
* Optimize the network performance by alternating the direction of the
  rings and the NIC to GPU assignment across communicators to limit
  unnecessary sharing.
* Fix the detection of C2C links in case GPU Direct RDMA is disabled
  between a GPU and a NIC.
* Fix PXN support on MNNVL systems, where NCCL would try (and fail) to
  share regular host memory across multiple nodes.
* Fix P2C (PXN over C2C), which is now preferred over regular PXN.  This
  support is currently preliminary and is disabled by default; use
  NCCL_PXN_C2C=1 to enable.

Further reduce the overheads of CUDA graph capturing, which increased in
NCCL 2.26.2 for large graphs.

Optimize the network performance on DGX B200 systems by adjusting the
bandwidths provided to the graph search algorithm.

Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8.

Restore the plugin name handling logic to make it possible to specify a
path to the plugin (Issue #1732).

Restore the ability to change NCCL_COLLNET_ENABLE during execution
(Issue #1741).

Add an example tuner plugin with CSV-based overrides.

Remove an x86 dependency from the example profiler.
2025-06-18 10:34:47 -07:00

/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "channel.h"
#include "nvmlwrap.h"
#include "gdrwrap.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
#include "net.h"
#include "coll_net.h"
#include "enqueue.h"
#include "graph.h"
#include "argcheck.h"
#include "tuner.h"
#include "ras.h"
#include "profiler.h"
#include "mnnvl.h"
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "param.h"
#include "nvtx_payload_schemas.h"
#include "utils.h"
#define STR2(v) #v
#define STR(v) STR2(v)
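// Illustrative note: STR2/STR is the usual two-step stringification idiom, so the macro
// argument is expanded before being turned into a string literal. Assuming NCCL_MAJOR is
// defined as 2, STR(NCCL_MAJOR) expands to "2" rather than "NCCL_MAJOR".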
#if CUDART_VERSION >= 9020
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
#endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1);
NCCL_PARAM(WinEnable, "WIN_ENABLE", 1);
NCCL_PARAM(CollnetEnable, "COLLNET_ENABLE", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT);
static ncclResult_t commReclaim(ncclComm_t comm);
// GDRCOPY support: Off by default
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
// GDRCOPY support
gdr_t ncclGdrCopy = NULL;
ncclResult_t initGdrCopy() {
if (ncclParamGdrCopyEnable() == 1) {
ncclGdrCopy = ncclGdrInit();
}
return ncclSuccess;
}
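// Note: GDRCOPY initialization is best-effort; if ncclGdrInit() fails, ncclGdrCopy stays
// NULL and callers such as devCommSetup() fall back to cudaHost memory for the work FIFO.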
static ncclResult_t initResult = ncclSuccess;
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static void initOnceFunc() {
initEnv();
initGdrCopy();
// Always initialize bootstrap network
NCCLCHECKGOTO(bootstrapNetInit(), initResult, exit);
initNvtxRegisteredEnums();
exit:;
}
static ncclResult_t ncclInit() {
pthread_once(&initOnceControl, initOnceFunc);
return initResult;
}
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
ncclResult_t ncclGetVersion(int* version) {
if (version == NULL) return ncclInvalidArgument;
*version = NCCL_VERSION_CODE;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
NCCLCHECK(ncclInit());
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
struct ncclBootstrapHandle handle;
NCCLCHECK(bootstrapGetUniqueId(&handle));
// ncclUniqueId and bootstrapHandle don't have the same size and alignment,
// so reset to 0 to avoid undefined data
memset(out, 0, sizeof(*out));
// copy to avoid alignment mismatch
memcpy(out, &handle, sizeof(handle));
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES));
return ncclSuccess;
}
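// Illustrative usage sketch (not part of this file): a typical multi-process setup has
// rank 0 create the ID and distribute it out-of-band before every rank calls
// ncclCommInitRank. MPI_Bcast below is just one possible transport, shown as an assumption.
//
//   ncclUniqueId id;
//   if (myRank == 0) ncclGetUniqueId(&id);
//   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
//   ncclComm_t comm;
//   ncclCommInitRank(&comm, nRanks, id, myRank);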
// Prevent compiler from optimizing out these operations
#ifdef __clang__
#define NCCL_NO_OPTIMIZE __attribute__((optnone))
#else
#define NCCL_NO_OPTIMIZE __attribute__((optimize("O0")))
#endif
void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
// Important that this does not trash intraComm0.
comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1;
comm->startMagic = comm->endMagic = 0;
}
#undef NCCL_NO_OPTIMIZE
static ncclResult_t ncclDestructorFnFree(struct ncclDestructor* dtor) {
free(dtor->obj);
return ncclSuccess;
}
void ncclCommPushFree(struct ncclComm* comm, void* obj) {
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
dtor->fn = ncclDestructorFnFree;
dtor->obj = obj;
dtor->next = comm->destructorHead;
comm->destructorHead = dtor;
}
static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) {
NCCLCHECK(ncclCudaFree(dtor->obj));
return ncclSuccess;
}
void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
dtor->fn = ncclDestructorFnCudaFree;
dtor->obj = obj;
dtor->next = comm->destructorHead;
comm->destructorHead = dtor;
}
static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) {
NCCLCHECK(ncclCudaHostFree(dtor->obj));
return ncclSuccess;
}
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) {
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
dtor->fn = ncclDestructorFnCudaHostFree;
dtor->obj = obj;
dtor->next = comm->destructorHead;
comm->destructorHead = dtor;
}
static ncclResult_t ncclDestructorFnCudaGdrFree(struct ncclDestructor* dtor) {
NCCLCHECK(ncclGdrCudaFree(dtor->obj));
return ncclSuccess;
}
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) {
struct ncclDestructor* dtor = ncclMemoryStackAlloc<struct ncclDestructor>(&comm->memPermanent);
dtor->fn = ncclDestructorFnCudaGdrFree;
dtor->obj = handle;
dtor->next = comm->destructorHead;
comm->destructorHead = dtor;
}
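// Illustrative note: the ncclCommPush*Free helpers build an intrusive LIFO list of
// destructors in comm->memPermanent; commFree() later walks destructorHead and invokes
// each dtor->fn. A minimal sketch of the pattern, assuming a buffer obtained from
// ncclCalloc elsewhere during init:
//
//   int* buf;
//   NCCLCHECK(ncclCalloc(&buf, n));
//   ncclCommPushFree(comm, buf);   // freed automatically when the comm is destroyed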
static ncclResult_t commFree(ncclComm_t comm) {
int abort = 0;
/* commFree() should not involve any sync among ranks. */
if (comm == NULL)
return ncclSuccess;
if (comm->symmetricSupport && comm->symDevComm.base) {
NCCLCHECK(ncclCommSymmetricFreeInternal(comm, comm->baseUCSymPtr + comm->rank * comm->baseStride));
}
NCCLCHECK(ncclRasCommFini(comm));
/* in commReclaim, we have guaranteed that only the last rank calling ncclCommDestroy() will
* free all intra-process communicators; therefore, we only need to focus on local
* resource cleanup in commFree(). */
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
if (comm->proxyState->threadUDS) {
// UDS support
PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join");
}
}
if (comm->memPool) CUDACHECK(cudaMemPoolDestroy(comm->memPool));
delete[] comm->userRedOps;
free(comm->connectSend);
free(comm->connectRecv);
free(comm->peerInfo);
if (comm->topo)
ncclTopoFree(comm->topo);
if (comm->nodeRanks) {
for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
free(comm->nodeRanks);
}
free(comm->rankToNode);
free(comm->rankToLocalRank);
free(comm->collNetHeads);
free(comm->clique.ranks);
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
for (int channel=0; channel<MAXCHANNELS; channel++)
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks, 1, comm->localRanks));
if (comm->sharedRes) {
if (ncclAtomicRefCountDecrement(&comm->sharedRes->refCount) == 0) {
for (int c=0; c<MAXCHANNELS; c++) {
if (comm->sharedRes->peers[c]) free(comm->sharedRes->peers[c]);
if (comm->sharedRes->devPeers[c]) ncclCudaFree(comm->sharedRes->devPeers[c]);
}
free(comm->sharedRes->tpRankToLocalRank);
NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent));
CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent));
NCCLCHECK(ncclProxyDestroy(comm));
free(comm->sharedRes);
}
}
if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm));
struct ncclDestructor* dtor = comm->destructorHead;
while (dtor != nullptr) {
NCCLCHECK(dtor->fn(dtor));
dtor = dtor->next;
}
ncclMemoryStackDestruct(&comm->memScoped);
ncclMemoryStackDestruct(&comm->memPermanent);
abort = *comm->abortFlag;
if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) {
free(comm->abortFlag);
NCCLCHECK(ncclCudaHostFree((void*)comm->abortFlagDev));
free(comm->abortFlagRefCount);
}
free((void*)comm->config.netName);
free(comm->topParentRanks);
free(comm->topParentLocalRanks);
free(comm->gproxyConn);
NCCLCHECK(ncclRegCleanup(comm));
if (comm->symmetricSupport) {
NCCLCHECK(ncclNvlsSymmetricFinalize(comm));
NCCLCHECK(ncclIpcSymmetricFinalize(comm));
}
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
commPoison(comm); // poison comm before free to avoid comm reuse.
NCCLCHECK(ncclProfilerPluginFinalize(comm));
NCCLCHECK(ncclNetFinalize(comm));
ncclCudaContextDrop(comm->context);
free(comm);
return ncclSuccess;
}
NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0);
// GDRCOPY support: when FIFO_ENABLE is set, the work FIFO is located in GDR-mapped CUDA memory
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
#define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20)
NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT);
NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX);
enum ncclLaunchMode ncclParamLaunchMode;
NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1);
// Detect DMA-BUF support
static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL || ncclCudaLibraryInit() != ncclSuccess) return ncclInternalError;
#if CUDA_VERSION >= 11070
int flag = 0;
CUdevice dev;
int cudaDriverVersion;
CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion));
if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
// Query device to see if DMA-BUF support is available
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
if (flag == 0) return ncclInternalError;
INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev);
return ncclSuccess;
#endif
return ncclInternalError;
}
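// Note: a non-ncclSuccess return here simply means "DMA-BUF not usable"; commAlloc()
// converts the result into the boolean comm->dmaBufSupport rather than failing init.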
ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
/* comm must be ready, or error will be reported */
ncclResult_t ret = ncclSuccess;
if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) {
ncclGroupJobAbort(comm->groupJob);
} else {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
if (ret == ncclInProgress) {
WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
ret = ncclInvalidArgument;
goto exit;
}
/* if ret is not ncclInProgress, we just keep it. */
}
exit:
return ret;
}
static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
if (ndev < 1) {
WARN("invalid device count (%d) requested", ndev);
return ncclInvalidArgument;
}
if (rank >= ndev || rank < 0) {
WARN("rank %d exceeds ndev=%d", rank, ndev);
return ncclInvalidArgument;
}
ncclMemoryStackConstruct(&comm->memPermanent);
ncclMemoryStackConstruct(&comm->memScoped);
comm->destructorHead = nullptr;
comm->rank = rank;
comm->nRanks = ndev;
NCCLCHECK(ncclNetInit(comm));
INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
if (parent && parent->shareResources) {
if (parent->ncclNet != comm->ncclNet) {
WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
return ncclInvalidUsage;
}
}
// Try to create a CUDA object right away. If there is something wrong with
// the device we're on (failure cause #1), better to know it early.
CUDACHECK(cudaGetDevice(&comm->cudaDev));
NCCLCHECK(ncclCudaContextTrack(&comm->context));
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
nvmlDevice_t nvmlDev;
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->busId, busId));
NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&comm->nvmlDev));
comm->compCap = ncclCudaCompCap();
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
for (int i = 0; i < ncclGroupTaskTypeNum; i++) {
comm->groupNext[i] = reinterpret_cast<struct ncclComm*>(0x1);
}
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
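// Note: connectSend/connectRecv hold one bitmask per peer; bit c being set means channel c
// still needs a send/recv connection to that peer (see the NVB preconnect logic below).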
// Mark channels as uninitialized.
for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
if (parent == NULL || !parent->shareResources) {
struct ncclSharedResources* sharedRes = NULL;
NCCLCHECK(ncclCalloc(&sharedRes, 1));
/* Most attributes are assigned later in initTransportsRank(). */
sharedRes->owner = comm;
sharedRes->tpNRanks = comm->nRanks;
NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming));
CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming));
comm->sharedRes = sharedRes;
sharedRes->refCount = 1;
} else {
comm->sharedRes = parent->sharedRes;
ncclAtomicRefCountIncrement(&parent->sharedRes->refCount);
}
if (comm->topParentRanks == NULL) {
NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
for (int i = 0; i < comm->nRanks; ++i)
comm->topParentRanks[i] = i;
}
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
do {
cudaMemPoolProps props = {};
props.allocType = cudaMemAllocationTypePinned;
props.handleTypes = cudaMemHandleTypeNone;
props.location.type = cudaMemLocationTypeDevice;
props.location.id = comm->cudaDev;
CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
uint64_t releaseThreshold = ~uint64_t(0);
CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
} while (0);
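// Note: the maximal release threshold above keeps the pool from trimming its memory back
// to the driver on stream/event synchronization, so allocations can be reused cheaply.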
ncclIntruQueueConstruct(&comm->eventCallbackQueue);
return ncclSuccess;
}
static ncclResult_t devCommSetup(ncclComm_t comm) {
ncclResult_t ret = ncclSuccess;
int nRanks = comm->nRanks;
struct ncclDevCommAndChannels tmpCommAndChans;
struct ncclDevCommAndChannels *devCommAndChans = NULL;
struct ncclNvmlCCStatus ccStatus;
bool ccEnable;
cudaStream_t deviceStream;
memset(&tmpCommAndChans, '\0', sizeof(tmpCommAndChans));
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail);
ncclCommPushCudaFree(comm, devCommAndChans);
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail);
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail);
comm->devComm = &devCommAndChans->comm;
tmpCommAndChans.comm.rank = comm->rank;
tmpCommAndChans.comm.nRanks = nRanks;
tmpCommAndChans.comm.node = comm->node;
tmpCommAndChans.comm.nNodes = comm->nNodes;
tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
tmpCommAndChans.comm.isAllNvlink = comm->isAllNvlink;
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
}
tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize;
tmpCommAndChans.comm.channels = &devCommAndChans->channels[0];
comm->workArgsBytes = std::min<size_t>(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch));
memset(&ccStatus, 0, sizeof(ccStatus));
ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE);
if (ccEnable) {
comm->workFifoBytes = 0;
} else {
comm->workFifoBytes = ncclParamWorkFifoBytes();
if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) {
WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes);
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
}
comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30);
}
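// Illustrative example: NCCL_WORK_FIFO_BYTES=1572864 (1.5 MiB) is not a power of two and
// would be reset to the 1 MiB default above; values are additionally capped at 1 GiB.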
if (comm->rank == 0) {
INFO(NCCL_INIT, "CC %s, workFifoBytes %d", ccEnable ? "On" : "Off", comm->workFifoBytes);
}
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
// The workFifoBuf lives in GDR mapped CUDA memory.
NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle), ret, fail);
ncclCommPushCudaGdrFree(comm, comm->workFifoBufGdrHandle);
} else {
// The workFifoBuf lives in cudaHost memory.
comm->workFifoBufGdrHandle = nullptr;
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoBuf, comm->workFifoBytes), ret, fail);
ncclCommPushCudaHostFree(comm, comm->workFifoBuf);
comm->workFifoBufDev = comm->workFifoBuf;
}
comm->workFifoProduced = 0;
comm->workFifoProducedLastRecorded = 0;
comm->workFifoConsumed = 0;
// Alloc profiler counters for the kernel
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail);
tmpCommAndChans.comm.workStarted = comm->profiler.workStarted;
tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted;
ncclCommPushCudaHostFree(comm, comm->profiler.workStarted);
ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted);
if (comm->collNetDenseToUserRank != nullptr) {
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail);
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank);
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail);
}
for (int c=0; c < MAXCHANNELS; c++) {
tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers;
tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks;
tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;
if (comm->channels[c].ring.userRanks != nullptr) {
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail);
}
}
NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail);
exit:
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false));
NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream));
return ret;
fail:
goto exit;
}
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
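// Illustrative expansion (exact values depend on the build, NCCL_SUFFIX assumed empty):
// "NCCL version 2.27.5+cuda12.8"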
static void showVersion() {
if (ncclDebugLevel == NCCL_LOG_VERSION || ncclDebugLevel == NCCL_LOG_WARN) {
VERSION("%s", VERSION_STRING);
} else {
INFO(NCCL_ALL,"%s", VERSION_STRING);
}
}
NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1);
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
cudaDeviceProp prop;
info->rank = comm->rank;
info->cudaDev = comm->cudaDev;
info->nvmlDev = comm->nvmlDev;
NCCLCHECK(ncclGetVersion(&info->version));
info->hostHash=getHostHash()+commHash;
info->pidHash=getPidHash()+commHash;
info->cuMemSupport = ncclCuMemEnable();
CUDACHECK(cudaGetDeviceProperties(&prop, comm->cudaDev));
info->totalGlobalMem = ROUNDUP(prop.totalGlobalMem, (1L << 32));
// Get the device MAJOR:MINOR of /dev/shm so we can use that
// information to decide whether we can use SHM for inter-process
// communication in a container environment
struct stat statbuf;
SYSCHECK(stat("/dev/shm", &statbuf), "stat");
info->shmDev = statbuf.st_dev;
info->busId = comm->busId;
NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport));
info->comm = comm;
info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;
// MNNVL support
{
// MNNVL: Request the fabric UUID and partition info
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
nvmlDevice_t nvmlDev;
NCCLCHECK(int64ToBusId(info->busId, busId));
NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
(void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo);
if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
if (ncclParamMNNVLUUID() != -1) {
((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID();
((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID();
}
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
info->busId,
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
}
}
return ncclSuccess;
}
static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
NCCLCHECK(initChannel(comm, channelId));
struct ncclRing* ring = &comm->channels[channelId].ring;
// Find our ring-distance from rank zero and reorganize ranks to start with rank.
int ixZero=0, ixRank=0;
for (int i=0; i < nranks; i++) {
if (ringRanks[i] == 0) ixZero = i;
if (ringRanks[i] == rank) ixRank = i;
}
ring->index = (ixRank-ixZero + nranks)%nranks;
for (int i=0; i<nranks; i++) {
ring->userRanks[i] = ringRanks[(i+ixRank)%nranks];
}
return ncclSuccess;
}
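// Worked example (illustrative): with ringRanks = {2, 0, 3, 1} and rank = 3,
// ixZero = 1 and ixRank = 2, so ring->index = (2 - 1 + 4) % 4 = 1 and the reordered
// userRanks become {3, 1, 2, 0}, i.e. the ring as seen starting from this rank.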
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */
NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */
static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
}
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
// Make sure P2P chunksize is not larger than coll chunksize.
if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (comm->sharedRes->owner != comm) {
/* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
} else {
comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize;
}
INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize);
return ncclSuccess;
}
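// Worked example (illustrative, assuming NCCL_STEPS == 8 and the default 4 MiB Simple
// buffer): the cap above requires p2pChunkSize * 8 <= 4 MiB, i.e. at most 512 KiB per
// chunk, so the default NVLink chunk size (512 KiB) fits exactly while larger values
// would be clamped.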
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
#define TIMER_INIT_TOTAL 0
#define TIMER_INIT_KERNELS 1
#define TIMER_INIT_BOOTSTRAP 2
#define TIMER_INIT_ALLGATHER 3
#define TIMER_INIT_TOPO 4
#define TIMER_INIT_GRAPHS 5
#define TIMER_INIT_CONNECT 6
#define TIMER_INIT_ALLOC 7
#define TIMERS_INIT_COUNT 8
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
// We use 2 AllGathers
// 1. { peerInfo, comm, compCap}
// 2. { nChannels, graphInfo, topoRanks }
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
int nNodes = 1;
cpu_set_t affinitySave;
struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING];
struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
struct graphInfo {
int pattern;
int nChannels;
int sameChannels;
float bwIntra;
float bwInter;
int typeIntra;
int typeInter;
int crossNic;
};
struct allGatherInfo {
struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
struct ncclTopoRanks topoRanks;
int cpuArch;
int cpuVendor;
int localRanks;
};
int nChannelsOrig;
struct allGatherInfo *allGather3Data = NULL;
struct ncclTopoRanks** allTopoRanks = NULL;
int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
int *rings = NULL;
int* nvbPeers = NULL;
struct ncclProxyConnector proxyConn;
int* pxnPeers = NULL;
int *topParentLocalRanks = NULL;
int p2pLevel = -1;
timers[TIMER_INIT_ALLGATHER] = clockNano();
// AllGather1 - begin
NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
__atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
comm->cuMemSupport = 1;
for (int i = 0; i < nranks; i++) {
if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
ret = ncclInvalidUsage;
goto fail;
}
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
ret = ncclInvalidUsage;
goto fail;
}
}
// AllGather1 - end
timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
// Check for MNNVL support
NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
}
do {
// Compute intra-process ranks
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
comm->nvlsRegSupport = 1;
for (int i = 0; i < nranks; i++) {
if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
&& (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
// Rank is in same process
if (intraProcRanks == 0) intraProcRank0 = i;
if (i == rank) intraProcRank = intraProcRanks;
intraProcRanks++;
if (intraProcRank0 == rank && rank != i) {
comm->peerInfo[i].comm->intraNext = comm->intraNext;
comm->intraNext = comm->peerInfo[i].comm;
}
}
if (comm->nvlsRegSupport) {
for (int j = i + 1; j < nranks; j++) {
if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
comm->nvlsRegSupport = 0;
break;
}
}
}
}
// Buffer Registration is not supported with MNNVL
if (comm->MNNVL) comm->nvlsRegSupport = 0;
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
intraProcRank, intraProcRanks, intraProcRank0);
ret = ncclInternalError;
goto fail;
}
struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
assert(intraProcRank==0 ? comm==comm0 : true);
comm->intraComm0 = comm0;
comm->intraRank = intraProcRank;
comm->intraRanks = intraProcRanks;
comm->intraBarrierPhase = 0;
comm->intraBarrierCounter = 0;
comm->intraBarrierGate = 0;
} while(0);
timers[TIMER_INIT_TOPO] = clockNano();
// Dump XML if requested by user
const char* dumpXmlFile;
dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (dumpXmlFile) {
NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
}
// Topo detection / System graph creation
NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
// Compute paths between GPUs and NICs
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
// Remove inaccessible GPUs and unused NICs
NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
// Recompute paths after trimming
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
// Init search
NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
// Decide on comm's CPU architecture.
NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
// Print final topology
NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
// Set affinity to a CPU local to our GPU, so that all memory we allocate
// on the host is local.
NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
if (CPU_COUNT(&comm->cpuAffinity)) {
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
}
// Determine local CollNet support
if (!collNetSupport(comm)) {
comm->config.collnetEnable = 0;
}
// Determine local Nvls support
NCCLCHECK(ncclNvlsInit(comm));
timers[TIMER_INIT_GRAPHS] = clockNano();
// Get rings and trees
memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
ringGraph->id = 0;
ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
ringGraph->minChannels = 1;
ringGraph->maxChannels = MAXCHANNELS/2;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);
memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
treeGraph->id = 1;
treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
treeGraph->minChannels = ringGraph->nChannels;
treeGraph->maxChannels = ringGraph->nChannels;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);
memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
collNetChainGraph->id = 2;
collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
collNetChainGraph->collNet = 1;
collNetChainGraph->minChannels = ringGraph->nChannels;
collNetChainGraph->maxChannels = ringGraph->nChannels;
memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
collNetDirectGraph->id = 4;
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
collNetDirectGraph->collNet = 1;
collNetDirectGraph->minChannels = 1;
collNetDirectGraph->maxChannels = MAXCHANNELS;
if (comm->config.collnetEnable) {
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
}
memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
nvlsGraph->id = 3;
nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
nvlsGraph->minChannels = 1;
nvlsGraph->maxChannels = MAXCHANNELS;
if (comm->nvlsSupport) {
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
}
timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
// Initialize num P2P LL buffers for this communicator
comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
if (comm->rank == ncclParamGraphDumpFileRank()) {
struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
}
// Because timers[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
// we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
timers[TIMER_INIT_CONNECT] = clockNano();
// AllGather3 - begin
NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
}
allGather3Data[rank].cpuArch = comm->cpuArch;
allGather3Data[rank].cpuVendor = comm->cpuVendor;
comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
// Determine nNodes, firstRanks, ...
NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
for (int r=0; r<nranks; r++) {
int node;
int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
if (node == comm->nNodes) {
comm->nNodes++;
nodesFirstRank[node] = firstRank;
// Record tree pattern of each node as they can be different depending on sm arch
nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
}
comm->rankToNode[r] = node;
if (comm->cpuArch != allGather3Data[r].cpuArch &&
comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
}
if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
}
}
// Alert the user to the presence of mixed CPUs. In the past this has caused
// locks in some collective routines. This may help debug issues in the future.
if (rank==0) {
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
}
if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
}
}
// Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
for (int r=0; r<comm->nRanks; r++) {
int node = comm->rankToNode[r];
comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
comm->nodeRanks[node].localRanks++;
}
// Allocate ranks arrays for each node
for (int n=0; n<comm->nNodes; n++) {
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
comm->nodeRanks[n].localRanks = 0;
}
// And fill the ranks arrays
for (int r=0; r<comm->nRanks; r++) {
int node = comm->rankToNode[r];
comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
}
comm->node = comm->rankToNode[rank];
comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
comm->localRank = comm->rankToLocalRank[rank];
comm->localRanks = comm->nodeRanks[comm->node].localRanks;
TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
comm->localRank, comm->localRanks, comm->localRankToRank[0]);
ret = ncclInternalError;
goto fail;
}
INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
nChannelsOrig = comm->nChannels;
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
for (int i=0; i<nranks; i++) {
allTopoRanks[i] = &allGather3Data[i].topoRanks;
// Make sure we align all ranks so that the tuning is consistent across ranks
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
}
comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
}
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
if (comm->nChannels < nChannelsOrig) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
}
// Determine CollNet support after all-gather now that we know nNodes and each node localRanks
if (comm->config.collnetEnable == 1) {
int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
if (comm->nNodes < collNetNodeThreshold) {
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
comm->config.collnetEnable = 0;
}
}
NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
comm->isOneRPN = (comm->maxLocalRanks == 1);
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
// AllGather3 - end
timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
char line[1024];
line[0]='\0';
for (int c=0; c<comm->nChannels; c++) {
struct ncclTree* tree = &comm->channels[c].tree;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
// Compute nChannels per peer for p2p
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
/* By this point, all of the comm's info should be known. We can initialize shared resources and
* map localRanks to top parent local ranks. NOTE: this sharedRes init must happen before
* all proxy operations. */
if (comm->sharedRes->owner == comm) {
comm->sharedRes->tpNLocalRanks = comm->localRanks;
comm->sharedRes->magic = comm->magic;
comm->sharedRes->tpNChannels = comm->nChannels;
comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
}
NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
for (int i = 0; i < comm->localRanks; ++i) {
int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
}
comm->topParentLocalRanks = topParentLocalRanks;
// Profiler plugin context has to be initialized before proxy thread
NCCLCHECK(ncclProfilerPluginInit(comm));
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
// Launch the proxy service thread; after this, proxy calls can be used.
if (parent && parent->shareResources) {
comm->proxyState = parent->sharedRes->proxyState;
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
} else {
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
}
NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
timers[TIMER_INIT_CONNECT] = clockNano();
do { // Build p2p schedule
int node = comm->node;
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
int local = comm->localRank;
int nLocals = comm->maxLocalRanks;
struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
bool flat = false;
for (int node = 0; node < nNodes; node++) {
if (nodeRanks[node].localRanks != nLocals) {
flat = true;
nNodes = 1; node = 0;
nLocals = nRanks; local = rank;
break;
}
}
int nNodesPow2 = pow2Up(nNodes);
int nLocalsPow2 = pow2Up(nLocals);
comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
uint32_t nodeRound = 0;
uint32_t nodeDelta = 0;
int round = 0;
// When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
// Since that formula only produces valid permutations when N is a pow of 2,
// we let N = pow2Up(n) and filter out results greater-eq to n.
// Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
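// Illustrative check of the formula above: the triangular numbers (x*x+x)/2 for x = 0..7
// are 0, 1, 3, 6, 10, 15, 21, 28; taken mod 16 they give 0, 1, 3, 6, 10, 15, 5, 12,
// matching the start of the 16-rank sequence listed above.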
do {
if (nodeDelta < nNodes) { // Filter nonsensical node deltas
int sendNode = (node + nodeDelta) % nNodes;
int recvNode = (node - nodeDelta + nNodes) % nNodes;
uint32_t localRound = 0;
uint32_t localDelta = 0;
do {
if (localDelta < nLocals) { // Filter nonsensical node-local deltas
int sendLocal = (local + localDelta) % nLocals;
int recvLocal = (local - localDelta + nLocals) % nLocals;
comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
round += 1;
}
localRound += 1;
localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
} while (localRound != nLocalsPow2);
}
nodeRound += 1;
nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
} while (nodeRound != nNodesPow2);
if (round != nRanks) {
WARN("P2p schedule creation has bugs.");
ret = ncclInternalError;
goto fail;
}
} while (0);
comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
if (comm->runtimeConn) {
for (int c=0; c<comm->nChannels; c++) {
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
}
// Attempt to setup NVLS, may silently fail and disable NVLS
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
// Check if we can setup CollNet
if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
} else {
for (int c=0; c<comm->nChannels; c++) {
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
}
NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
// Connect Trees
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
// Connect PAT only for communicators with 1 GPU per node
if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
// Attempt to setup NVLS, may silently fail and disable NVLS
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
// And NVLS trees if needed
NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
// Check if we can setup CollNet
if (comm->config.collnetEnable) {
ncclCollNetSetup(comm, parent, graphs);
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
}
}
// Connect to local net proxy
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN
if (ncclPxnDisable(comm) == 0) {
int nranks;
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
}
}
if (ncclParamNvbPreconnect()) {
// Connect p2p when using NVB path
int nvbNpeers;
NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
for (int r=0; r<nvbNpeers; r++) {
int peer = nvbPeers[r];
int sendRound=0, recvRound=0;
while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId;
channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
comm->connectSend[peer] |= (1UL<<channelId);
}
channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
comm->connectRecv[peer] |= (1UL<<channelId);
}
}
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
}
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
enum ncclLaunchMode mode, modeOld;
if (str && strcasecmp(str, "GROUP") == 0) {
mode = ncclLaunchModeGroup;
} else {
mode = ncclLaunchModeParallel;
}
// In theory we could be racing with other communicators not associated with
// this one if the user is connecting to multiple ncclUniqueId's concurrently.
modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
}
}
comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
comm->baseStride = 0;
// Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
// launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT];
/* Local intra-node barrier */
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
exit:
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
/* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
* attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
* properly cleaned up. */
if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
free(allTopoRanks);
free(nodesTreePatterns);
free(nodesFirstRank);
free(allGather3Data);
free(rings);
free(nvbPeers);
free(pxnPeers);
return ret;
fail:
goto exit;
}
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT);
// Match config max/minCTAs
NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
#define NCCL_COMMINIT_FUNCNAME_LEN 128
struct ncclCommInitRankAsyncJob {
struct ncclAsyncJob base;
struct ncclComm* comm;
struct ncclComm** newcomm;
int cudaDev;
// For ncclCommInitRank
int nranks, myrank, nId;
ncclUniqueId* commId;
// For ncclCommSplit
struct ncclComm* parent;
int color, key;
int splitCount;
// For Shrink
int* excludeRanksList;
int excludeRanksCount;
// Name of the calling function
char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
};
struct ncclCommFinalizeAsyncJob {
struct ncclAsyncJob base;
ncclComm_t comm;
};
NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(CommShrinkShareResources, "COMM_SHRINK_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
typedef struct{
int key;
int color;
} commSplitInfo;
static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
int nRanks = 0, myRank = 0;
ncclResult_t ret = ncclSuccess;
commSplitInfo* info = NULL;
NCCLCHECKGOTO(ncclCalloc(&info, parent->nRanks), ret, fail);
// Compute nRanks, my rank and the ranks (of the original comm) before and after me
info[parent->rank].color = color;
info[parent->rank].key = key;
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, info, sizeof(commSplitInfo)), ret, fail);
// Negative color does not create a new comm. Return now.
if (color == NCCL_SPLIT_NOCOLOR) goto exit;
memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks);
for (int i = 0; i < parent->nRanks; i++) {
if (info[i].color != color) continue;
// Find where to insert this rank
int insert = 0;
while (insert < nRanks && info[parentRanksRet[insert]].key <= info[i].key) insert++;
// Shift ranks by one after insert
for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1];
// Insert our rank
parentRanksRet[insert] = i;
nRanks++;
}
for (int i = 0; i < nRanks; i++) {
if (parentRanksRet[i] == parent->rank) myRank = i;
}
*nRanksRet = nRanks;
*myRankRet = myRank;
exit:
free(info);
return ret;
fail:
goto exit;
}
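// Worked example (illustrative): with parent->nRanks = 4, colors {0, 1, 0, 1} and
// keys {0, 0, 1, 1}, a caller with color 0 ends up with parentRanksRet = {0, 2},
// nRanks = 2, and myRank 0 or 1 depending on whether its parent rank is 0 or 2;
// ranks with the other color build their own group {1, 3} the same way.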
static ncclResult_t getParentRanks(int parentRanks, int parentRank, int* excludeRanksList, int excludeRanksCount, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
int count = 0, j = 0;
for (int i = 0; i < parentRanks; i++) {
// we assume excludeRanksList is sorted
if (j < excludeRanksCount && excludeRanksList[j] == i) {
j++;
continue;
}
if (i == parentRank) *myRankRet = count;
parentRanksRet[count++] = i;
}
*nRanksRet = parentRanks - excludeRanksCount;
return ncclSuccess;
}
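// Worked example (illustrative): with parentRanks = 4 and excludeRanksList = {2}
// (excludeRanksCount = 1), the surviving ranks map to parentRanksRet = {0, 1, 3},
// nRanksRet = 3, and a caller whose parentRank is 3 gets myRankRet = 2.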
static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
ncclComm_t comm = job->comm;
ncclResult_t res = ncclSuccess;
int archMajor, archMinor;
size_t maxLocalSizeBytes = 0;
int cudaDev = job->cudaDev;
int* parentRanks = NULL;
int cudaArch;
int maxSharedMem = 0;
double sum_timers = 0;
uint64_t timers[TIMERS_INIT_COUNT] = {0};
unsigned long long commIdHash;
timers[TIMER_INIT_TOTAL] = clockNano();
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
cudaArch = 100*archMajor + 10*archMinor;
timers[TIMER_INIT_KERNELS] = clockNano();
NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));
// Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zu", maxLocalSizeBytes);
CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes));
}
timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
if (job->parent) {
NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
if (job->excludeRanksCount) {
NCCLCHECKGOTO(getParentRanks(job->parent->nRanks, job->parent->rank, job->excludeRanksList, job->excludeRanksCount, &job->nranks, &job->myrank, parentRanks), res, fail);
} else {
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
}
timers[TIMER_INIT_ALLOC] = clockNano();
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
// child hash obtained from (parent hash, split count, color)
uint64_t hacc[2] = {1, 1};
eatHash(hacc, &job->parent->commHash);
eatHash(hacc, &job->splitCount);
eatHash(hacc, &job->color);
comm->commHash = digestHash(hacc);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
// debug info, no commId was used
commIdHash = 0;
} else {
timers[TIMER_INIT_ALLOC] = clockNano();
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
// obtain a unique hash using the first commId
comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
}
comm->cudaArch = cudaArch;
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail);
if (comm->tuner) {
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext));
}
// update communicator state
comm->initState = ncclSuccess;
timers[TIMER_INIT_TOTAL] = clockNano() - timers[TIMER_INIT_TOTAL];
// Trace this call for replay tool
if (job->parent) {
/* unlink child abort flag. */
__atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
} else {
// The replay tool uses the name ncclCommInitRank for all init variants.
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
}
sum_timers = 0.0;
for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
sum_timers += (timers[it] / 1e9);
INFO(NCCL_INIT | NCCL_PROFILE,
"Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
"connections %.2f, rest %.2f)",
job->funcName, comm->rank, comm->nRanks,
timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
exit:
if (job->newcomm) {
/* assign it to user pointer. */
__atomic_store_n(job->newcomm, comm, __ATOMIC_RELEASE);
}
free(parentRanks);
return res;
fail:
comm->initState = res;
goto exit;
}
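// Fill in a config field with its default value when the user left it undefined; otherwise log the
// user-provided value at NCCL_ENV level.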
#define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \
if (config->field == undef) { \
config->field = defvalue; \
} else { \
INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \
}
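// Apply environment overrides (NCCL_COMM_BLOCKING, NCCL_CGA_CLUSTER_SIZE, NCCL_MIN/MAX_CTAS,
// NCCL_NET, NCCL_COLLNET_ENABLE, ...) on top of the communicator config, then sanitize the result:
// CTA counts are capped to MAXCHANNELS and invalid values are reset to their defaults.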
static ncclResult_t envConfigOverride(ncclComm_t comm) {
ncclResult_t ret = ncclSuccess;
const char* tmpNetName = comm->config.netName;
const char* envNetName;
int blockingEnv;
int cgaClusterSizeEnv;
int minCTAsEnv;
int maxCTAsEnv;
int splitShareEnv;
const char* collnetEnableEnv;
int ctaPolicyEnv;
int shrinkShareEnv;
int nvlsCTAsEnv;
/* override configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv == 0 || blockingEnv == 1)
comm->config.blocking = blockingEnv;
cgaClusterSizeEnv = ncclParamCGAClusterSize();
if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
comm->config.cgaClusterSize = cgaClusterSizeEnv;
} else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
minCTAsEnv = ncclParamMinCTAs();
if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
if (minCTAsEnv <= 0)
INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs);
else
comm->config.minCTAs = minCTAsEnv;
}
maxCTAsEnv = ncclParamMaxCTAs();
if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
if (maxCTAsEnv <= 0)
INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs);
else
comm->config.maxCTAs = maxCTAsEnv;
}
envNetName = ncclGetEnv("NCCL_NET");
if (envNetName)
tmpNetName = envNetName;
if (tmpNetName != NULL) {
int netNameLen = strlen(tmpNetName) + 1;
comm->config.netName = (char*)malloc(netNameLen);
memcpy((void*)comm->config.netName, tmpNetName, netNameLen);
} else {
comm->config.netName = NULL;
}
splitShareEnv = ncclParamCommSplitShareResources();
if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) {
comm->config.splitShare = splitShareEnv;
}
shrinkShareEnv = ncclParamCommShrinkShareResources();
if (shrinkShareEnv != NCCL_CONFIG_UNDEF_INT) {
comm->config.shrinkShare = shrinkShareEnv;
}
// NCCL_COLLNET_ENABLE needs to be reloaded each time for comm init
// since users might change the env on the fly to enable/disable collnet
collnetEnableEnv = ncclGetEnv("NCCL_COLLNET_ENABLE");
if (collnetEnableEnv != NULL) {
int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0);
if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) {
comm->config.collnetEnable = collnetEnableInt;
INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt);
}
}
ctaPolicyEnv = ncclParamCtaPolicy();
if (ctaPolicyEnv != NCCL_CONFIG_UNDEF_INT) {
comm->config.CTAPolicy = ctaPolicyEnv;
}
nvlsCTAsEnv = ncclParamNvlsChannels();
if (nvlsCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->config.nvlsCTAs = nvlsCTAsEnv;
}
/* cap channels if needed */
if (comm->config.minCTAs > MAXCHANNELS) {
INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS);
comm->config.minCTAs = MAXCHANNELS;
}
if (comm->config.maxCTAs > MAXCHANNELS) {
INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS);
comm->config.maxCTAs = MAXCHANNELS;
}
if (comm->config.minCTAs > comm->config.maxCTAs) {
INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs);
comm->config.minCTAs = comm->config.maxCTAs;
}
if (comm->config.splitShare != 1 && comm->config.splitShare != 0) {
INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare);
comm->config.splitShare = 0;
}
if (comm->config.collnetEnable != 1 && comm->config.collnetEnable != 0) {
INFO(NCCL_ENV, "collnetEnable %d is not a valid value 0/1, set it to 0", comm->config.collnetEnable);
comm->config.collnetEnable = 0;
}
if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY) {
INFO(NCCL_ENV, "CTAPolicy %d is not a valid value, set it to %d", comm->config.CTAPolicy, NCCL_CTA_POLICY_DEFAULT);
comm->config.CTAPolicy = NCCL_CTA_POLICY_DEFAULT;
}
if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT && comm->config.nvlsCTAs <= 0) {
INFO(NCCL_ENV, "nvlsCTAs %d is not a valid value, NCCL will decide the default value automatically", comm->config.nvlsCTAs);
comm->config.nvlsCTAs = NCCL_CONFIG_UNDEF_INT;
}
return ret;
}
static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parent) {
memcpy(&childComm->config, &parent->config, sizeof(ncclConfig_t));
NCCLCHECK(envConfigOverride(childComm));
return ncclSuccess;
}
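// Validate and normalize a user-supplied ncclConfig_t: copy at most sizeof(ncclConfig_t) bytes,
// check the initializer magic, fall back to defaults for fields newer than the caller's struct
// version, reject invalid values, apply the platform defaults, and finally let environment
// variables override the result.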
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
ncclResult_t ret = ncclSuccess;
/* config must not be NULL in this function */
ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr;
size_t realSize;
internalConfig.magic = 0;
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto fail;
}
/* check version. */
if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) {
internalConfigPtr->blocking = defaultConfig.blocking;
}
if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) {
internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize;
internalConfigPtr->minCTAs = defaultConfig.minCTAs;
internalConfigPtr->maxCTAs = defaultConfig.maxCTAs;
internalConfigPtr->netName = defaultConfig.netName;
}
if (internalConfigPtr->version < NCCL_VERSION(2, 25, 0)) {
internalConfigPtr->trafficClass = defaultConfig.trafficClass;
}
if (internalConfigPtr->version < NCCL_VERSION(2, 27, 0)) {
internalConfigPtr->collnetEnable = defaultConfig.collnetEnable;
internalConfigPtr->CTAPolicy = defaultConfig.CTAPolicy;
internalConfigPtr->shrinkShare = defaultConfig.shrinkShare;
internalConfigPtr->nvlsCTAs = defaultConfig.nvlsCTAs;
}
}
/* check input config attributes; NCCL_CONFIG_UNDEF_INT means user-undefined, in which case NCCL's default value is used. */
if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) {
WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize);
ret = ncclInvalidArgument;
goto fail;
}
if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->minCTAs <= 0) ||
(internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->maxCTAs <= 0) ||
(internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) {
WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->splitShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->splitShare != 0 && internalConfigPtr->splitShare != 1) {
WARN("Invalid config splitShare attribute value %d", internalConfigPtr->splitShare);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->collnetEnable != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->collnetEnable < 0 || internalConfigPtr->collnetEnable > 1)) {
WARN("Invalid config collnetEnable attribute value %d", internalConfigPtr->collnetEnable);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->CTAPolicy != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->CTAPolicy < NCCL_CTA_POLICY_DEFAULT ||
internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY)) {
WARN("Invalid config policy attribute value %d", internalConfigPtr->CTAPolicy);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->shrinkShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->shrinkShare != 0 && internalConfigPtr->shrinkShare != 1) {
WARN("Invalid config shrinkShare attribute value %d", internalConfigPtr->shrinkShare);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->nvlsCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->nvlsCTAs <= 0) {
WARN("Invalid config nvlsCTAs attribute value %d", internalConfigPtr->nvlsCTAs);
ret = ncclInvalidArgument;
goto fail;
}
/* default config values can be tuned per platform. */
NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, commName, NCCL_CONFIG_UNDEF_PTR, NULL, "Comm name", "%s");
NCCL_CONFIG_DEFAULT(internalConfigPtr, collnetEnable, NCCL_CONFIG_UNDEF_INT, 0, "Collnet enable", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, CTAPolicy, NCCL_CONFIG_UNDEF_INT, NCCL_CTA_POLICY_DEFAULT, "CTA policy flags", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, shrinkShare, NCCL_CONFIG_UNDEF_INT, 0, "shrinkShare", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlsCTAs, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "nvlsCTAs", "%d");
/* assign config to communicator */
comm->config.blocking = internalConfigPtr->blocking;
comm->config.cgaClusterSize = internalConfigPtr->cgaClusterSize;
comm->config.minCTAs = internalConfigPtr->minCTAs;
comm->config.maxCTAs = internalConfigPtr->maxCTAs;
comm->config.netName = internalConfigPtr->netName;
comm->config.splitShare = internalConfigPtr->splitShare;
comm->config.trafficClass = internalConfigPtr->trafficClass;
comm->config.commName = internalConfigPtr->commName;
comm->config.collnetEnable = internalConfigPtr->collnetEnable;
comm->config.CTAPolicy = internalConfigPtr->CTAPolicy;
comm->config.shrinkShare = internalConfigPtr->shrinkShare;
comm->config.nvlsCTAs = internalConfigPtr->nvlsCTAs;
NCCLCHECKGOTO(envConfigOverride(comm), ret, fail);
exit:
return ret;
fail:
goto exit;
}
static void ncclCommInitJobFree(void* _job) {
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)_job;
free(job->commId);
free(_job);
}
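// Common entry point behind ncclCommInitRank, ncclCommInitRankConfig, ncclCommInitRankScalable and
// ncclCommInitAll: allocate the communicator shell and its abort flags, parse the config, copy the
// user's ncclUniqueId handles, optionally start a bootstrap root (rank 0 with NCCL_COMM_ID set),
// and launch the asynchronous init job.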
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId, ncclUniqueId* commId, int myrank, int cudaDev, ncclConfig_t *config, const char funcName[]) {
if (nId <= 0 || nId > nranks) {
WARN("improper usage of ncclCommInitRank: nId = %d, nranks=%d", nId, nranks);
return ncclInvalidArgument;
}
ncclResult_t res = ncclSuccess;
const char* commIdEnv = NULL;
ncclComm_t comm = NULL;
struct ncclCommInitRankAsyncJob* job = NULL;
bool launchedJob = false;
// first call ncclInit, this will setup the environment
NCCLCHECKGOTO(ncclInit(), res, fail);
if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, showVersion);
}
// Make sure the CUDA runtime is initialized.
CUDACHECKGOTO(cudaFree(NULL), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail);
NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail);
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
WARN("Invalid rank requested : %d/%d", myrank, nranks);
res = ncclInvalidArgument;
goto fail;
}
NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->abortFlag, 1), res, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->abortFlagDev, 1), res, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->abortFlagRefCount, 1), res, fail);
comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption.
*comm->abortFlagRefCount = 1;
NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail);
/* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */
comm->initState = ncclInProgress;
*newcomm = comm;
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->nId = nId;
job->comm = comm;
job->nranks = nranks;
job->myrank = myrank;
job->cudaDev = cudaDev;
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", funcName);
// Copy the commIds so that comm init can run asynchronously and to avoid alignment issues when
// casting from ncclUniqueId to ncclBootstrapHandle: the two types don't have the same alignment
// requirements, so the array of ids coming from the user might not be properly aligned to be cast
// into a ncclBootstrapHandle. Copying into freshly allocated memory guarantees proper alignment.
NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
commIdEnv = ncclGetEnv("NCCL_COMM_ID");
if (commIdEnv && myrank == 0) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commIdEnv);
if (nId > 1) {
INFO(NCCL_INIT | NCCL_ENV, "NCCL_COMM_ID cannot be used with more than one ncclUniqueId");
job->nId = 1;
}
// start the bootstrap root before bootstrapping, use only the first handle
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
}
launchedJob = true;
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);
exit:
return ncclGroupErrCheck(res);
fail:
if (job && !launchedJob) ncclCommInitJobFree(job);
if (comm) {
free(comm->abortFlag);
if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
free(comm->abortFlagRefCount);
free(comm);
}
if (newcomm) *newcomm = NULL;
goto exit;
}
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
NVTX3_RANGE(NcclNvtxParamsCommInitRank)
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
(void)ncclCudaLibraryInit();
int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
CUDACHECK(cudaGetDevice(&cudaDev));
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
ncclResult_t ret = ncclSuccess;
int totalnDev;
int *gpuFlags = NULL;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
int oldDev = 0;
NVTX3_RANGE(NcclNvtxParamsCommInitAll);
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
(void)ncclCudaLibraryInit();
CUDACHECK(cudaGetDevice(&oldDev));
NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
if (ndev < 0) {
WARN("Invalid device count requested : %d", ndev);
ret = ncclInvalidArgument;
goto fail;
}
CUDACHECKGOTO(cudaGetDeviceCount(&totalnDev), ret, fail);
if (devlist) {
NCCLCHECKGOTO(ncclCalloc(&gpuFlags, totalnDev), ret, fail);
for (int i = 0; i < ndev; ++i) {
/* invalid device check. */
if (devlist[i] < 0 || devlist[i] >= totalnDev) {
WARN("Invalid device %d (totalnDev=%d)", devlist[i], totalnDev);
ret = ncclInvalidArgument;
goto fail;
}
/* duplicate device check. */
if (gpuFlags[devlist[i]] != 0) {
ret = ncclInvalidUsage;
goto fail;
}
gpuFlags[devlist[i]] = 1;
}
free(gpuFlags);
gpuFlags = nullptr;
}
ncclUniqueId uniqueId;
NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail);
NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail);
for (int i=0; i<ndev; i++) {
// Ignore return codes; we need to call ncclGroupEnd to clean up anyway
int dev = devlist ? devlist[i] : i;
CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
ncclCommInitRankDev(comms+i, ndev, 1, &uniqueId, i, dev, &config, __func__);
}
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, fail);
NVTX3_RANGE_ADD_PAYLOAD(CommInitAll, NcclNvtxParamsCommInitAllSchema,
NVTX3_PAYLOAD(comms[0]->commHash, ndev));
exit:
(void)cudaSetDevice(oldDev);
free(gpuFlags);
return ret;
fail:
goto exit;
}
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
if (nextState < 0 || nextState >= ncclNumResults || comm == NULL) {
WARN("ncclCommSetAsyncError: error comm %p sets state %d", comm, nextState);
return ncclInvalidArgument;
}
__atomic_store_n(&comm->asyncResult, nextState, __ATOMIC_RELEASE);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr = NULL;
NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig);
NCCLCHECK(ncclGroupStartInternal());
(void)ncclCudaLibraryInit();
CUDACHECK(cudaGetDevice(&cudaDev));
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (newcomm && *newcomm) {
if (!(*newcomm)->config.blocking) {
(void) ncclCommGetAsyncError(*newcomm, &ret);
}
NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
}
return ret;
fail:
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
goto exit;
}
NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable);
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr = NULL;
NCCLCHECK(ncclGroupStartInternal());
(void)ncclCudaLibraryInit();
CUDACHECK(cudaGetDevice(&cudaDev));
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, nId, commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (newcomm && *newcomm) {
if (!(*newcomm)->config.blocking) {
(void) ncclCommGetAsyncError(*newcomm, &ret);
}
NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
}
return ret;
fail:
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
goto exit;
}
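// Synchronous part of communicator teardown, run from an async job: if init succeeded, drain the
// shared host/device streams, flush event and graph callbacks (waiting until no captured graph
// still references this comm) and run legacy IPC cleanup callbacks; in all cases stop the proxy.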
static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_;
ncclComm_t comm = job->comm;
ncclResult_t ret = ncclSuccess;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
if (comm->initState == ncclSuccess) {
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->hostStream)) != ncclSuccess) {
WARN("commDestroySync: comm %p rank %d sync hostStream error %d\n", comm, comm->rank, ret);
}
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)) != ncclSuccess) {
WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret);
}
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, true), ret, fail);
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
// And keep polling until all graphs referencing us die.
while (comm->localPersistentRefs != 0) {
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
}
while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
struct ncclCommCallback* cb = ncclIntruQueueDequeue(&comm->legacyRegCleanupQueue);
if (cb->fn(comm, cb) != ncclSuccess) {
WARN("Legacy IPC cleanup callback failed comm %p (rank = %d) cb %p", comm, comm->rank, cb);
}
}
}
if ((ret = ncclProxyStop(comm)) != ncclSuccess) {
WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret);
}
exit:
return ret;
fail:
goto exit;
}
static ncclResult_t commCleanup(ncclComm_t comm) {
CUDACHECK(cudaSetDevice(comm->cudaDev));
if (comm->tuner != NULL) {
NCCLCHECK(comm->tuner->destroy(comm->tunerContext));
NCCLCHECK(ncclTunerPluginUnload(comm));
}
NCCLCHECK(commFree(comm));
return ncclSuccess;
}
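// Finalize a communicator: mark it finalized and launch commDestroySync asynchronously to flush
// outstanding work; local resources are freed later by ncclCommDestroy/ncclCommAbort.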
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
ncclResult_t ncclCommFinalize(ncclComm_t comm) {
NVTX3_RANGE(NcclNvtxParamsCommFinalize);
ncclResult_t ret = ncclSuccess;
struct ncclCommFinalizeAsyncJob *job = NULL;
NCCLCHECK(ncclGroupStartInternal());
if (comm == NULL) goto exit;
/* wait comm ready before finalize. */
NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail);
/* prevent double finalize. */
if (comm->finalizeCalled) {
ret = ncclInvalidArgument;
goto fail;
}
comm->finalizeCalled = true;
/* launch async thread to finalize comm. */
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commDestroySync, NULL, free, comm), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (comm) {
if (!comm->config.blocking) {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
}
NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema,
NVTX3_PAYLOAD(comm->commHash));
}
return ret;
fail:
if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
goto exit;
}
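// Reclaim resources for the communicators of this process: the last comm in the intra-process
// chain to be destroyed/aborted runs commDestroySync for any comm whose ncclCommFinalize was never
// called, then frees the local resources of every comm in the chain.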
static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_;
ncclComm_t comm = job->comm;
ncclResult_t ret = ncclSuccess;
if (comm->intraComm0 != NULL) {
int curRankCnt;
int curRank; /* Debug info */
int intraRanks = comm->intraRanks;
ncclComm_t intracomm0 = comm->intraComm0;
int *finalizeRankCnt = &intracomm0->finalizeRankCnt;
assert(intracomm0 != NULL && finalizeRankCnt != NULL);
curRankCnt = __atomic_add_fetch(finalizeRankCnt, 1, __ATOMIC_ACQ_REL);
if (curRankCnt == intraRanks) {
ncclComm_t curIntraComm;
ncclComm_t nextIntraComm = intracomm0;
/* this is the last call to ncclCommDestroy/Abort, we need to make sure all comms
* in the process have been finalized before we free local resources. */
while (nextIntraComm) {
curIntraComm = nextIntraComm;
curRank = curIntraComm->rank;
nextIntraComm = nextIntraComm->intraNext;
if (curIntraComm->finalizeCalled == false) {
struct ncclCommFinalizeAsyncJob job;
job.comm = curIntraComm;
/* every comm aborts, commDestroySync should not be blocked. */
if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess)
WARN("commReclaim: comm %p (rank = %d) in commDestroySync, error %d", curIntraComm, curRank, ret);
}
}
/* free local resources. */
nextIntraComm = intracomm0;
while (nextIntraComm) {
curIntraComm = nextIntraComm;
curRank = curIntraComm->rank;
nextIntraComm = nextIntraComm->intraNext;
if ((ret = commCleanup(curIntraComm)) != ncclSuccess) {
// We pass a freed pointer, but we don't dereference; we merely print its value, so it's OK.
// coverity[pass_freed_arg]
WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret);
}
}
}
}
return ncclSuccess;
}
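// Destroy a communicator: mark it destroyed, wait for the init thread to finish, and launch the
// asynchronous reclaim job. Unlike ncclCommAbort, the abort flags are not set.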
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
return ncclSuccess;
}
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank,
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
// Try and prevent a double free of the comm struct (user error)
if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) {
WARN("comm %p has already been destroyed", comm);
return ncclInvalidArgument;
}
comm->destroyFlag = 1;
/* init thread must be joined before we destroy the comm. */
NCCLCHECK(ncclCommEnsureReady(comm));
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
exit:
return res;
fail:
goto exit;
}
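// Propagate `value` to the host- and device-visible abort flags, including those of a child
// communicator still under construction (if any).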
static ncclResult_t setCommAbortFlags(ncclComm_t comm, int value) {
// Set abort flags
if (comm->childAbortFlag != nullptr) {
__atomic_store_n(comm->childAbortFlag, value, __ATOMIC_RELEASE);
__atomic_store_n(comm->childAbortFlagDev, value, __ATOMIC_RELEASE);
}
__atomic_store_n(comm->abortFlag, value, __ATOMIC_RELEASE);
__atomic_store_n(comm->abortFlagDev, value, __ATOMIC_RELEASE);
return ncclSuccess;
}
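// Abort a communicator: set the abort flags so that any outstanding device or proxy work can bail
// out, then launch the same asynchronous reclaim job as ncclCommDestroy; teardown errors are ignored.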
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
ncclResult_t ncclCommAbort(ncclComm_t comm) {
NVTX3_RANGE(NcclNvtxParamsCommAbort);
if (comm == NULL) {
return ncclSuccess;
}
// Ask anything that might still be running on the device to quit
NCCLCHECK(setCommAbortFlags(comm,1));
comm->destroyFlag = 1;
/* init thread must be joined before we destroy the comm,
* and we should ignore the init error here. */
(void)ncclCommEnsureReady(comm);
// once the comm is ready, we can access ranks etc
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
exit:
return ncclSuccess;
fail:
goto exit;
}
static void childCommCleanupJob(void* job) {
struct ncclCommInitRankAsyncJob* initJob = (struct ncclCommInitRankAsyncJob*)job;
if (initJob->excludeRanksList) free(initJob->excludeRanksList);
free(job);
}
// Initialize a child communicator; shared path for both ncclCommSplit and ncclCommShrink.
static ncclResult_t ncclCommInitChildComm(ncclComm_t comm, ncclComm_t* newcomm, bool isShrink, int flags, int color, int key, int* excludeRanksList, int excludeRanksCount,
ncclConfig_t* config, const char* caller) {
struct ncclCommInitRankAsyncJob *job = NULL;
struct ncclComm* childComm = NCCL_COMM_NULL;
ncclResult_t res = ncclSuccess;
int oldDev;
CUDACHECK(cudaGetDevice(&oldDev));
NCCLCHECKGOTO(CommCheck(comm, caller, "comm"), res, exit);
NCCLCHECKGOTO(PtrCheck(newcomm, caller, "newcomm"), res, exit);
if (isShrink) {
NCCLCHECKGOTO(PtrCheck(excludeRanksList, caller, "excludeRanksList"), res, exit);
NCCLCHECKGOTO(excludeRanksCount > 0 ? ncclSuccess : ncclInvalidArgument, res, exit);
// excludeRanksList may not be sorted, need to sort it
qsort(excludeRanksList, excludeRanksCount, sizeof(int), compareInts);
// ranks in excludeRanksList should not call into this function
NCCLCHECKGOTO(bsearch(&comm->rank, excludeRanksList, excludeRanksCount, sizeof(int), compareInts) ? ncclInvalidArgument : ncclSuccess, res, exit);
}
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, exit);
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, exit);
/* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
*newcomm = NCCL_COMM_NULL;
if (!isShrink && color == NCCL_SPLIT_NOCOLOR) {
INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank);
} else {
NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail);
childComm->startMagic = childComm->endMagic = NCCL_MAGIC;
// Set the shareResource field, this is used throughout the init and must be reset every time.
// If we shrink, we only reuse resources if we shrink in the default mode
comm->shareResources = isShrink ? (!(flags & NCCL_SHRINK_ABORT) && comm->config.shrinkShare) : comm->config.splitShare;
if (comm->shareResources) {
childComm->abortFlag = comm->abortFlag;
childComm->abortFlagDev = comm->abortFlagDev;
childComm->abortFlagRefCount = comm->abortFlagRefCount;
comm->childAbortFlag = NULL;
ncclAtomicRefCountIncrement(comm->abortFlagRefCount);
} else {
NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlag, 1), res, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&childComm->abortFlagDev, 1), res, fail);
NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlagRefCount, 1), res, fail);
/* temporarily used to abort everything during child comm init. */
comm->childAbortFlag = childComm->abortFlag;
comm->childAbortFlagDev = childComm->abortFlagDev;
*childComm->abortFlagRefCount = 1;
}
if (config == NULL) {
NCCLCHECKGOTO(copyCommConfig(childComm, comm), res, fail);
} else {
NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail);
}
/* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
childComm->initState = ncclInternalError;
}
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = childComm;
job->newcomm = newcomm;
job->parent = comm;
job->color = color;
job->key = key;
if (excludeRanksList) {
// need to copy the list of ranks to exclude because the job is async
job->excludeRanksCount = excludeRanksCount;
NCCLCHECKGOTO(ncclCalloc(&job->excludeRanksList, excludeRanksCount), res, fail);
memcpy(job->excludeRanksList, excludeRanksList, excludeRanksCount * sizeof(int));
} else {
// each split has to lead to a unique comm, so increment the splitCount
job->splitCount = ++comm->splitCount;
job->excludeRanksList = NULL;
}
job->cudaDev = comm->cudaDev;
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", caller);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, /*undo=*/NULL, /*destructor=*/childCommCleanupJob, comm), res, fail);
exit:
(void)cudaSetDevice(oldDev);
return res;
fail:
if (childComm) {
if (!comm->shareResources) {
if (childComm->abortFlag) free(childComm->abortFlag);
if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev);
if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount);
}
free(childComm);
}
if (newcomm) *newcomm = NULL;
goto exit;
}
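// Usage sketch (illustrative only; the 'excluded' ranks are hypothetical): surviving ranks shrink a
// communicator to drop failed ranks. Excluded ranks must not call ncclCommShrink. With
// NCCL_SHRINK_ABORT the parent's outstanding work is asked to quit first; NCCL_SHRINK_DEFAULT
// leaves it untouched and allows resource sharing when shrinkShare is enabled.
//   int excluded[2] = {3, 7};
//   ncclComm_t smaller;
//   ncclResult_t rc = ncclCommShrink(comm, excluded, 2, &smaller, /*config=*/NULL, NCCL_SHRINK_DEFAULT);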
NCCL_API(ncclResult_t, ncclCommShrink, ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t *newcomm, ncclConfig_t* config, int shrinkFlags) {
NVTX3_RANGE(NcclNvtxParamsCommShrink)
ncclResult_t res = ncclSuccess;
NCCLCHECK(ncclGroupStartInternal());
// Handle error mode by setting abort flags and waiting for kernels to complete and unset the flags to avoid bootstrap issues
if (shrinkFlags & NCCL_SHRINK_ABORT) {
NCCLCHECKGOTO(setCommAbortFlags(comm, 1), res, exit);
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), res, exit);
NCCLCHECKGOTO(setCommAbortFlags(comm, 0), res, exit);
}
NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/true, shrinkFlags, /*color=*/0, /*key=*/comm->rank, excludeRanksList, excludeRanksCount, config, __func__), res, exit);
if (*newcomm) NVTX3_RANGE_ADD_PAYLOAD(CommShrink, NcclNvtxParamsCommShrinkSchema, NVTX3_PAYLOAD(comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, excludeRanksCount));
exit:
(void)ncclGroupErrCheck(res);
NCCLCHECK(ncclGroupEndInternal());
return res;
}
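// Usage sketch (illustrative only; 'myRank' stands for this rank's index in comm): split a
// communicator into two halves by rank parity. Ranks passing NCCL_SPLIT_NOCOLOR take part in the
// collective split but do not get a new communicator; 'key' orders ranks within each new comm.
//   ncclComm_t half;
//   ncclResult_t rc = ncclCommSplit(comm, /*color=*/myRank % 2, /*key=*/myRank, &half, /*config=*/NULL);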
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
NVTX3_RANGE(NcclNvtxParamsCommSplit)
ncclResult_t res = ncclSuccess;
NCCLCHECK(ncclGroupStartInternal());
NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit);
if (*newcomm)
NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));
exit:
(void)ncclGroupErrCheck(res);
NCCLCHECK(ncclGroupEndInternal());
return res;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
switch (code) {
case ncclSuccess : return "no error";
case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)";
case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)";
case ncclInternalError : return "internal error - please report this issue to the NCCL developers";
case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)";
case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)";
case ncclRemoteError : return "remote process exited or there was a network error";
case ncclInProgress : return "NCCL operation in progress";
default : return "unknown result code";
}
}
/* Returns a human-readable message of the last error that occurred.
* comm is currently unused and can be set to NULL
*/
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
const char* ncclGetLastError(ncclComm_t comm) {
return ncclLastError;
}
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
NCCLCHECK(CommCheck(comm, "ncclGetAsyncError", "comm"));
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
*asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
/* if there is linked group job, we should complete it. */
if (*asyncError == ncclSuccess && comm->groupJob) {
NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
comm->groupJob = NULL;
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(CommCheck(comm, "CommCount", "comm"));
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
/* init thread must be joined before we access the attributes of comm. */
NCCLCHECK(ncclCommEnsureReady(comm));
*count = comm->nRanks;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
NCCLCHECK(ncclCommEnsureReady(comm));
*devid = comm->cudaDev;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(CommCheck(comm, "CommUserRank", "comm"));
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
NCCLCHECK(ncclCommEnsureReady(comm));
*rank = comm->rank;
return ncclSuccess;
}