From e261ecea18786f777ded7506ce3ea976f1d50c08 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Tue, 10 Sep 2024 05:57:10 -0700
Subject: [PATCH 1/2] 2.23.4-1

Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already available.
 * Add an option to use ncclNet instead of sockets to perform bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On Linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when Linux doesn't know which NUMA node a device is attached to
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
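To make the intent of the scalable init API concrete, here is a minimal usage sketch (not part of this patch). It assumes the prototype added in src/nccl.h.in takes an array of unique IDs, i.e. ncclCommInitRankScalable(comm, nranks, rank, nIds, ids, config), and it uses MPI only as a hypothetical out-of-band channel to distribute the IDs; error checking and MPI setup are omitted.

    /* Sketch: several ranks each create one unique ID, every rank receives
     * all of them, and all ranks pass the full array to the scalable init. */
    #include <stdlib.h>
    #include <mpi.h>
    #include <nccl.h>

    ncclComm_t initScalable(int rank, int nranks, int nRoots) {
      ncclUniqueId* ids = (ncclUniqueId*)malloc(nRoots * sizeof(ncclUniqueId));
      for (int r = 0; r < nRoots; r++) {
        int owner = r * (nranks / nRoots);           // illustrative owner of root r
        if (rank == owner) ncclGetUniqueId(&ids[r]); // multiple ranks create IDs
        MPI_Bcast(&ids[r], sizeof(ncclUniqueId), MPI_BYTE, owner, MPI_COMM_WORLD);
      }
      ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
      ncclComm_t comm;
      // Assumed argument order; the authoritative prototype is in the nccl.h.in diff.
      ncclCommInitRankScalable(&comm, nranks, rank, nRoots, ids, &config);
      free(ids);
      return comm;
    }

With a single ID (nRoots == 1) this reduces to the existing ncclCommInitRank path; spreading ranks across several bootstrap roots is what keeps the bootstrap time roughly constant as the job grows.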
--- ext-profiler/example/Makefile | 16 + ext-profiler/example/event.c | 30 + ext-profiler/example/event.h | 167 +++++ ext-profiler/example/nccl/common.h | 15 + ext-profiler/example/nccl/err.h | 19 + ext-profiler/example/nccl/profiler.h | 18 + ext-profiler/example/nccl/profiler_v1.h | 150 ++++ ext-profiler/example/nccl/types.h | 21 + ext-profiler/example/plugin.c | 492 ++++++++++++ ext-profiler/example/print_event.c | 277 +++++++ ext-profiler/example/print_event.h | 13 + ext-tuner/example/nccl/tuner.h | 3 +- makefiles/common.mk | 7 + makefiles/version.mk | 4 +- src/bootstrap.cc | 950 ++++++++++++++++++------ src/collectives.cc | 1 + src/debug.cc | 32 +- src/device/all_gather.h | 59 +- src/device/all_reduce.h | 115 ++- src/device/broadcast.h | 15 +- src/device/common.h | 16 +- src/device/common_kernel.h | 18 +- src/device/generate.py | 11 +- src/device/network/unpack/unpack.h | 6 +- src/device/op128.h | 49 +- src/device/primitives.h | 12 +- src/device/prims_ll.h | 47 +- src/device/prims_ll128.h | 10 +- src/device/prims_simple.h | 681 +++++++++++------ src/device/reduce.h | 3 + src/device/reduce_kernel.h | 56 +- src/device/reduce_scatter.h | 56 +- src/device/sendrecv.h | 13 +- src/enqueue.cc | 525 ++++++++++--- src/graph/connect.cc | 51 +- src/graph/paths.cc | 108 ++- src/graph/rings.cc | 28 +- src/graph/search.cc | 108 ++- src/graph/topo.cc | 181 ++--- src/graph/topo.h | 7 +- src/graph/tuning.cc | 80 +- src/graph/xml.cc | 7 +- src/group.cc | 58 +- src/include/alloc.h | 145 +++- src/include/bitops.h | 11 + src/include/bootstrap.h | 4 +- src/include/checks.h | 61 +- src/include/collectives.h | 486 ++++++++++++ src/include/comm.h | 131 +++- src/include/cudawrap.h | 2 + src/include/device.h | 23 +- src/include/graph.h | 7 +- src/include/nccl_common.h | 3 +- src/include/nvtx.h | 27 +- src/include/p2p.h | 31 +- src/include/profiler.h | 58 +- src/include/proxy.h | 41 +- src/include/register.h | 13 +- src/include/shm.h | 47 +- src/include/timer.h | 14 +- src/include/transport.h | 10 +- src/include/utils.h | 1 - src/init.cc | 422 ++++++++--- src/misc/argcheck.cc | 4 + src/misc/cudawrap.cc | 28 +- src/misc/ipcsocket.cc | 23 +- src/misc/nvmlwrap.cc | 4 + src/misc/param.cc | 28 +- src/misc/profiler.cc | 595 ++++++++++++--- src/misc/shmutils.cc | 13 +- src/misc/socket.cc | 16 +- src/misc/tuner.cc | 2 + src/misc/utils.cc | 21 +- src/nccl.h.in | 7 + src/net.cc | 9 +- src/proxy.cc | 304 ++++++-- src/register.cc | 25 +- src/transport.cc | 108 +-- src/transport/coll_net.cc | 39 +- src/transport/generic.cc | 23 + src/transport/net.cc | 135 ++-- src/transport/net_ib.cc | 344 ++++++--- src/transport/net_socket.cc | 54 +- src/transport/nvls.cc | 76 +- src/transport/p2p.cc | 564 ++++++++++++-- src/transport/shm.cc | 414 ++++++++--- 86 files changed, 6943 insertions(+), 1965 deletions(-) create mode 100644 ext-profiler/example/Makefile create mode 100644 ext-profiler/example/event.c create mode 100644 ext-profiler/example/event.h create mode 100644 ext-profiler/example/nccl/common.h create mode 100644 ext-profiler/example/nccl/err.h create mode 100644 ext-profiler/example/nccl/profiler.h create mode 100644 ext-profiler/example/nccl/profiler_v1.h create mode 100644 ext-profiler/example/nccl/types.h create mode 100644 ext-profiler/example/plugin.c create mode 100644 ext-profiler/example/print_event.c create mode 100644 ext-profiler/example/print_event.h diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile new file mode 100644 index 0000000..ee8e0cf --- /dev/null +++ 
b/ext-profiler/example/Makefile @@ -0,0 +1,16 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +NCCL_HOME := ../../build +INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl +PLUGIN_SO := libnccl-profiler.so + +default: $(PLUGIN_SO) + +$(PLUGIN_SO): plugin.c event.c print_event.c + $(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +clean: + rm -f $(PLUGIN_SO) diff --git a/ext-profiler/example/event.c b/ext-profiler/example/event.c new file mode 100644 index 0000000..717fe86 --- /dev/null +++ b/ext-profiler/example/event.c @@ -0,0 +1,30 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "event.h" + +int taskEventQueueEmpty(struct group* g) { + return g->eventHead == NULL; +} + +void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) { + event->next = NULL; + if (g->eventHead) g->eventTail->next = event; + else g->eventHead = event; + g->eventTail = event; +} + +struct taskEventBase* taskEventQueueHead(struct group* g) { + return g->eventHead; +} + +struct taskEventBase* taskEventQueueDequeue(struct group* g) { + struct taskEventBase* tmp = g->eventHead; + g->eventHead = g->eventHead->next; + if (g->eventHead == NULL) g->eventTail = NULL; + return tmp; +} diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h new file mode 100644 index 0000000..7432808 --- /dev/null +++ b/ext-profiler/example/event.h @@ -0,0 +1,167 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef EVENT_H_ +#define EVENT_H_ + +#include +#include +#include +#include "profiler.h" + +#define MAX_CHANNELS 32 +#define MAX_STEPS 16 + +#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted) +#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted) +#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait) +#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait) + +#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1) +#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1) +#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1) +#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1) + +#define PROXY_OP_SEND_STATE_IDX(state) (state - PROXY_OP_SEND_STATE_OFFSET) +#define PROXY_OP_RECV_STATE_IDX(state) (state - PROXY_OP_RECV_STATE_OFFSET) +#define PROXY_STEP_SEND_STATE_IDX(state) (state - PROXY_STEP_SEND_STATE_OFFSET) +#define PROXY_STEP_RECV_STATE_IDX(state) (state - PROXY_STEP_RECV_STATE_OFFSET) + +#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) +#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? 
NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) + +#define MAX_COMM_CLIQUES (32 * 8) + +struct proxyOp; + +struct proxyStep { + uint8_t type; // type of event: network transfer + int step; // network transfer id in given channel + int isSend; // send/recv channel operation + double timestamp[MAX_PROXY_STEP_STATES]; + double startTs; + double stopTs; + struct proxyOp* parent; +}; + +struct proxyOp { + uint8_t type; // type of event: proxy operation + uint8_t channelId; // channel id for this proxy operation + pid_t pid; + int rank; + int peer; // peer rank for this proxy operation + int nSteps; // total number of network transfers for this proxy operation + int chunkSize; // chunk size for this proxy operation + int isSend; // send/recv channel operation + size_t transSize; // transfer data size for this proxy operation + struct { + int steps; // completed steps for this proxy operation state + double timestamp; + } states[MAX_PROXY_OP_STATES]; + double startTs; + double stopTs; + int stepCount; // last processed network operation for this proxy operation + struct proxyStep step[MAX_STEPS]; // array of network transfer events + struct taskEventBase* parent; // parent event p2p/collective +}; + +struct group; +struct context; + +struct proxyCtrl { + uint8_t type; + struct context* ctx; // profiler context + double startTs; + double stopTs; + int state; + int appended; // appended proxy operations +}; + +// task level event base structure +struct taskEventBase { + uint8_t type; // event type: collective/p2p + int rank; // rank of the operation in NCCL communicator + const char* name; // FIXME: unused + uint64_t commHash; // communicator identifier + uint8_t func; // ncclFunc* + int refCount; // number of references for this operation + struct group* parent; // parent event group + struct taskEventBase* next; // next top level event in group + double startTs; + double stopTs; +}; + +struct collective { + struct taskEventBase base; // base structure for this event + uint64_t seqNumber; // sequence number for this collective in communicator + void const* sendBuff; + void* recvBuff; + size_t count; + size_t trafficBytes; + int root; + uint8_t datatype; + uint8_t nMaxChannels; + uint8_t algo; + uint8_t proto; + int op; + int nWarps; + int isCollnet; + int isNvls; + struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events + struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events +}; + +struct p2p { + struct taskEventBase base; // base structure for this event + uint8_t func; + void const* buff; + size_t count; + uint8_t datatype; + int peer; + struct proxyOp op; +}; + +struct group { + uint8_t type; + struct context* ctx; // profiler context + int groupId; + int refCount; + struct taskEventBase* eventHead; // queue head for task events + struct taskEventBase* eventTail; // queue tail for task events + double startTs; + double stopTs; + struct group* next; // next group event in queue +}; + +// arrays for different event objects +struct context { + int groupPoolSize; + int groupPoolBase; + int groupPoolIndex; + struct group* groupPool; + + int collPoolSize; + int collPoolBase; + int collPoolIndex; + struct collective* collPool; + + int p2pPoolSize; + int p2pPoolBase; + int p2pPoolIndex; + struct p2p* p2pPool; + + int proxyCtrlPoolSize; + int proxyCtrlPoolBase; + int proxyCtrlPoolIndex; + struct proxyCtrl* proxyCtrlPool; +}; + +int taskEventQueueEmpty(struct group* g); +void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event); +struct 
taskEventBase* taskEventQueueHead(struct group* g); +struct taskEventBase* taskEventQueueDequeue(struct group* g); + +#endif diff --git a/ext-profiler/example/nccl/common.h b/ext-profiler/example/nccl/common.h new file mode 100644 index 0000000..9129252 --- /dev/null +++ b/ext-profiler/example/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-profiler/example/nccl/err.h b/ext-profiler/example/nccl/err.h new file mode 100644 index 0000000..6443924 --- /dev/null +++ b/ext-profiler/example/nccl/err.h @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h new file mode 100644 index 0000000..db7bc3f --- /dev/null +++ b/ext-profiler/example/nccl/profiler.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +#include +#include + +#include "common.h" +#include "err.h" + +#include "profiler_v1.h" + +#endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h new file mode 100644 index 0000000..8724a1c --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v1.h @@ -0,0 +1,150 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_V1_H_ +#define NCCL_PROFILER_V1_H_ + +#include + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileNumEvents = ( 6), +}; + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_v1_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event 
state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v1_t; + +typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v1_t ncclProfiler_t; + +#endif diff --git a/ext-profiler/example/nccl/types.h b/ext-profiler/example/nccl/types.h new file mode 100644 index 0000000..f43fdc1 --- /dev/null +++ b/ext-profiler/example/nccl/types.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_TYPES_H_ +#define NCCL_TYPES_H_ + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, +} ncclDataType_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c new file mode 100644 index 0000000..f9de608 --- /dev/null +++ b/ext-profiler/example/plugin.c @@ -0,0 +1,492 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "event.h" +#include "print_event.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +static int initialized; // initialization counter for profiler +static double startTime; // profiler start time + +static int groupPoolSize = 16; +static int collPoolSize = 16; +static int p2pPoolSize = 1024; +static int proxyCtrlPoolSize = 16; +static int detachPoolSize = 128; +static int detachPoolBase; +static int detachPoolIndex; +static int detachPoolDone; +static struct proxyOp* detachPool; + +static double freq = -1; +__hidden void calibrate() { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t timeCycles = __rdtsc(); + double time = - tv.tv_sec*1e6 - tv.tv_usec; + uint64_t total = 0ULL; + for (int i = 0; i < 10000; i++) total += __rdtsc(); + gettimeofday(&tv, NULL); + timeCycles = __rdtsc() - timeCycles; + time += tv.tv_sec*1e6 + tv.tv_usec; + freq = timeCycles / time; +} + +__hidden double gettime(void) { + return __rdtsc() / freq; +} + +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; +static pid_t pid; + +__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) { + pthread_mutex_lock(&lock); + if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { + // first thread initializes event mask, environment and detach pool + __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED); + if (getenv("NCCL_PROFILE_EVENT_MASK")) { + __atomic_store_n(eActivationMask, atoi(getenv("NCCL_PROFILE_EVENT_MASK")), __ATOMIC_RELAXED); + } + if (getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) { + groupPoolSize = atoi(getenv("NCCL_PROFILE_GROUP_POOL_SIZE")); + } + if (getenv("NCCL_PROFILE_COLL_POOL_SIZE")) { + collPoolSize = atoi(getenv("NCCL_PROFILE_COLL_POOL_SIZE")); + } + if (getenv("NCCL_PROFILE_P2P_POOL_SIZE")) { + p2pPoolSize = atoi(getenv("NCCL_PROFILE_P2P_POOL_SIZE")); + } + if (getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) { + proxyCtrlPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")); + } + if (getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) { + detachPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")); + } + // detach pool is used to store PXN proxyOps and is shared among threads + detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool)); + if (detachPool == NULL) { + pthread_mutex_unlock(&lock); + return ncclSystemError; + } + // Pid of the process initializing the profiler first. + // This is compared against the pid of proxyOp events + // to figure out if they have a parent event in this + // process address space. 
+ pid = getpid(); + + // calibrate and start timer + calibrate(); + startTime = gettime(); + } + pthread_mutex_unlock(&lock); + + // pre-allocate memory for event object pools in dedicated profiler context + struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); + ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); + if (ctx->groupPool == NULL) goto fail; + + ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool)); + if (ctx->collPool == NULL) goto fail; + + ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool)); + if (ctx->p2pPool == NULL) goto fail; + + ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool)); + if (ctx->proxyCtrlPool == NULL) goto fail; + + *context = ctx; + return ncclSuccess; + +fail: + // cleanup resources + if (ctx->proxyCtrlPool) free(ctx->proxyCtrlPool); + if (ctx->p2pPool) free(ctx->p2pPool); + if (ctx->collPool) free(ctx->collPool); + if (ctx->groupPool) free(ctx->groupPool); + free(ctx); + if (detachPool) free(detachPool); + return ncclSystemError; +} + +__hidden ncclResult_t exampleProfilerFinalize(void* context) { + FILE* fh = NULL; + char filename[PATH_MAX] = { 0 }; + char hostname[64] = { 0 }; + gethostname(hostname, 64); + const char* dump = getenv("NCCL_PROFILE_DUMP_FILE"); + if (dump) { + sprintf(filename, "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid)); + fh = fopen(filename, "w"); + fprintf(fh, "[\n"); + } + + // print last N groups/collectives/p2ps + struct context* ctx = (struct context *)context; + int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0; + int end = ctx->groupPoolIndex; + for (int i = start; i < end; i++) { + printEvent(fh, &ctx->groupPool[i%groupPoolSize]); + } + + start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0; + end = ctx->proxyCtrlPoolIndex; + for (int i = start; i < end; i++) { + printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]); + } + + free(ctx->groupPool); + free(ctx->collPool); + free(ctx->p2pPool); + free(ctx->proxyCtrlPool); + free(ctx); + + // last thread cleans up shared detach pool + if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) { + start = (detachPoolIndex - detachPoolSize >= 0) ? 
detachPoolIndex - detachPoolSize : 0; + end = detachPoolIndex; + for (int i = start; i < end; i++) { + printEvent(fh, &detachPool[i%detachPoolSize]); + } + free(detachPool); + } + + if (fh) fprintf(fh, "{}]\n"); + if (fh) fclose(fh); + + return ncclSuccess; +} + +__hidden void updateEvent(void* handle); + +__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) { + *eHandle = NULL; + struct context* ctx = (struct context *)context; + if (eDescr->type == ncclProfileGroup) { + struct group* event; + int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED); + if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) { + // if there are available group events grab one + event = &ctx->groupPool[groupId%groupPoolSize]; + while (!taskEventQueueEmpty(event)) { + struct taskEventBase* base = taskEventQueueDequeue(event); + if (base->type == ncclProfileColl) { + struct collective* c = (struct collective *)base; + // reset event proxyOps & proxySteps + memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS); + memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS); + // release collective events in the group and return them to the collective pool + __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); + } else if (base->type == ncclProfileP2p) { + struct p2p* p = (struct p2p *)base; + // reset event proxyOp and proxySteps + memset(&p->op, 0, sizeof(struct proxyOp)); + // release p2p events in the group and return them to the p2p pool + __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED); + } + } + } else { + // else drop this event + __atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileGroup; + __atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED); + event->ctx = ctx; + event->groupId = groupId; + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "GroupStart"); + } else if (eDescr->type == ncclProfileColl) { + // the parent might be null if we run out of events + struct group* parent = (struct group *)eDescr->parentObj; + if (parent == NULL) return ncclSuccess; + + struct collective* event; + int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED); + if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) { + // if there are available collective events grab one + event = &ctx->collPool[collId%collPoolSize]; + } else { + // else drop this event + __atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + + event->base.type = ncclProfileColl; + event->base.rank = eDescr->rank; + event->base.name = eDescr->coll.name; + event->base.commHash = eDescr->coll.commHash; + event->base.func = eDescr->coll.func; + event->base.startTs = gettime() - startTime; + event->base.parent = parent; + event->seqNumber = eDescr->coll.seqNumber; + event->sendBuff = eDescr->coll.sendBuff; + event->recvBuff = eDescr->coll.recvBuff; + event->count = eDescr->coll.count; + event->root = eDescr->coll.root; + event->datatype = eDescr->coll.datatype; + event->op = eDescr->coll.op; + event->trafficBytes = eDescr->coll.trafficBytes; + event->nMaxChannels = eDescr->coll.nMaxChannels; + event->nWarps = eDescr->coll.nWarps; + event->algo = eDescr->coll.algo; + event->proto = eDescr->coll.proto; + event->isCollnet = eDescr->coll.isCollnet; + event->isNvls = eDescr->coll.isNvls; + *eHandle = event; + taskEventQueueEnqueue(parent, (struct 
taskEventBase *)event); + // increment the group ref counter so the event will stay open + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "CollStart"); + } else if (eDescr->type == ncclProfileP2p) { + // the parent might be null if we run out of events + struct group* parent = (struct group *)eDescr->parentObj; + if (parent == NULL) return ncclSuccess; + + struct p2p* event; + int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED); + if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) { + // if there are available p2p events grab one + event = &ctx->p2pPool[p2pId%p2pPoolSize]; + } else { + // else drop this event + __atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + + event->base.type = ncclProfileP2p; + event->base.rank = eDescr->rank; + event->base.name = eDescr->p2p.name; + event->base.commHash = eDescr->p2p.commHash; + event->base.func = eDescr->p2p.func; + event->base.next = parent->eventHead; + event->base.startTs = gettime() - startTime; + event->base.parent = parent; + event->buff = eDescr->p2p.buff; + event->count = eDescr->p2p.count; + event->datatype = eDescr->p2p.datatype; + event->peer = eDescr->p2p.peer; + *eHandle = event; + // increment the group ref counter so the event will stay open + taskEventQueueEnqueue(parent, (struct taskEventBase *)event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "P2pStart"); + } else if (eDescr->type == ncclProfileProxyCtrl) { + int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED); + struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize]; + event->type = ncclProfileProxyCtrl; + event->ctx = ctx; + event->startTs = gettime() - startTime; + *eHandle = event; + } else if (eDescr->type == ncclProfileProxyOp) { + // the eventBase might be null if we run out of events + struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj; + if (eventBase == NULL) return ncclSuccess; + + if (eDescr->proxyOp.pid != pid) { + // PXN captured proxyOp events + struct proxyOp* event; + int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED); + if ((detachId - detachPoolBase) < detachPoolSize) { + // if there are available detached proxyOp events grab one + event = &detachPool[detachId%detachPoolSize]; + } else { + // else drop this event + __atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + + event->type = ncclProfileProxyOp; + event->channelId = eDescr->proxyOp.channelId; + event->pid = eDescr->proxyOp.pid; + event->rank = eDescr->rank; + event->peer = eDescr->proxyOp.peer; + event->nSteps = eDescr->proxyOp.nSteps; + event->chunkSize = eDescr->proxyOp.chunkSize; + event->isSend = eDescr->proxyOp.isSend; + event->startTs = gettime() - startTime; + event->parent = NULL; + *eHandle = event; + debugEvent(event, "PxnProxyOpStart"); + return ncclSuccess; + } + + if (eventBase->type == ncclProfileColl) { + struct collective* parent = (struct collective *)eDescr->parentObj; + struct proxyOp* event = (eDescr->proxyOp.isSend) ?
&parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId]; + event->type = ncclProfileProxyOp; + event->channelId = eDescr->proxyOp.channelId; + event->pid = eDescr->proxyOp.pid; + event->rank = eDescr->rank; + event->peer = eDescr->proxyOp.peer; + event->nSteps = eDescr->proxyOp.nSteps; + event->chunkSize = eDescr->proxyOp.chunkSize; + event->isSend = eDescr->proxyOp.isSend; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "ProxyOpStart"); + } else { // ncclProfileP2p + struct p2p* parent = (struct p2p *)eDescr->parentObj; + struct proxyOp* event = &parent->op; + event->type = ncclProfileProxyOp; + event->channelId = eDescr->proxyOp.channelId; + event->pid = eDescr->proxyOp.pid; + event->rank = eDescr->rank; + event->peer = eDescr->proxyOp.peer; + event->nSteps = eDescr->proxyOp.nSteps; + event->chunkSize = eDescr->proxyOp.chunkSize; + event->isSend = eDescr->proxyOp.isSend; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "ProxyOpStart"); + } + } else if (eDescr->type == ncclProfileProxyStep) { + // the parent might be null if we run out of events + struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj; + if (parent == NULL) return ncclSuccess; + + int s = parent->stepCount++ % MAX_STEPS; + struct proxyStep* event = &parent->step[s]; + event->type = ncclProfileProxyStep; + event->step = eDescr->proxyStep.step; + event->isSend = parent->isSend; + event->parent = parent; + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "ProxyStepStart"); + } + return ncclSuccess; +} + +void updateEvent(void* handle) { + uint8_t type = *(uint8_t *)handle; + if (type == ncclProfileGroup) { + struct group* event = (struct group *)handle; + if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) { + event->stopTs = gettime() - startTime; + // return group event to the pool + __atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED); + } + debugEvent(event, "GroupStop"); + } else if (type == ncclProfileColl) { + struct collective* event = (struct collective *)handle; + if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) { + event->base.stopTs = gettime() - startTime; + debugEvent(event, "CollStop"); + updateEvent(event->base.parent); + return; + } + debugEvent(event, "CollStop"); + } else if (type == ncclProfileP2p) { + struct p2p* event = (struct p2p *)handle; + if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) { + event->base.stopTs = gettime() - startTime; + debugEvent(event, "P2pStop"); + updateEvent(event->base.parent); + return; + } + debugEvent(event, "P2pStop"); + } else if (type == ncclProfileProxyOp) { + struct proxyOp* event = (struct proxyOp *)handle; + event->stopTs = gettime() - startTime; + if (event->pid != pid) { + // only for proxyOps that don't have a parent collective/p2p (i.e., PXN) + int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1; + if (done == detachPoolSize) { + // reset the event completed (done) counter + __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED); + // update the base pointer to the top of the pool + int index = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED); + __atomic_store_n(&detachPoolBase, index, __ATOMIC_RELAXED); + } + debugEvent(event, "ProxyOpStop"); + 
return; + } + updateEvent(event->parent); + debugEvent(event, "ProxyOpStop"); + } else if (type == ncclProfileProxyStep) { + struct proxyStep* event = (struct proxyStep *)handle; + event->stopTs = gettime() - startTime; + debugEvent(event, "ProxyStepStop"); + } else if (type == ncclProfileProxyCtrl) { + struct proxyCtrl* event = (struct proxyCtrl *)handle; + event->stopTs = gettime() - startTime; + debugEvent(event, "ProxyCtrlStop"); + } +} + +__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) { + // the event handle might be null if we run out of events + if (eHandle == NULL) return ncclSuccess; + + uint8_t type = *(uint8_t *)eHandle; + if (type == ncclProfileGroup) { + // stopping the group event in NCCL core does not + // mean the group has completed. It means the group + // was submitted/enqueued so we need to keep the event open + struct group* event = (struct group *)eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileColl) { + // stopping the collective event in NCCL core does not + // mean the collective has completed. It means the collective + // was submitted/enqueued so we need to keep the event open + struct collective* event = (struct collective *)eHandle; + event->base.stopTs = gettime() - startTime; + return ncclSuccess; + } + updateEvent(eHandle); + return ncclSuccess; +} + +__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) { + // the event handle might be null if we run out of events + if (eHandle == NULL) return ncclSuccess; + + debugEvent(eHandle, "RecordEventState"); + uint8_t type = *(uint8_t *)eHandle; + if (type == ncclProfileProxyOp) { + struct proxyOp* event = (struct proxyOp *)eHandle; + int steps = event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps; + if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == steps) return ncclSuccess; + event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps = eStateArgs->proxyOp.steps; + event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].timestamp = gettime() - startTime; + event->transSize = eStateArgs->proxyOp.transSize; + } else if (type == ncclProfileProxyStep) { + struct proxyStep* event = (struct proxyStep *)eHandle; + event->timestamp[event->isSend ? PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState)] = gettime() - startTime; + } else if (type == ncclProfileProxyCtrl) { + struct proxyCtrl* event = (struct proxyCtrl *)eHandle; + if (eState == ncclProfilerProxyCtrlAppendEnd) { + event->appended = eStateArgs->proxyCtrl.appendedProxyOps; + } + event->state = eState; + } + return ncclSuccess; +} + +ncclProfiler_v1_t ncclProfiler_v1 = { + "Example-profiler", + exampleProfilerInit, + exampleProfilerStartEvent, + exampleProfilerStopEvent, + exampleProfilerRecordEventState, + exampleProfilerFinalize, +}; diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c new file mode 100644 index 0000000..490ba7c --- /dev/null +++ b/ext-profiler/example/print_event.c @@ -0,0 +1,277 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "profiler.h" +#include "event.h" +#include "print_event.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden const char* ncclFuncToString(int func) { + switch(func) { + case 0: + return "ncclBroadcast"; + case 1: + return "ncclReduce"; + case 2: + return "ncclAllGather"; + case 3: + return "ncclReduceScatter"; + case 4: + return "ncclAllReduce"; + case 5: + return "ncclSendRecv"; + case 6: + return "ncclSend"; + case 7: + return "ncclRecv"; + } + return NULL; +} + +__hidden const char* ncclAlgoToString(int algo) { + switch(algo) { + case 0: + return "Tree"; + case 1: + return "Ring"; + case 2: + return "CollnetDirect"; + case 3: + return "CollnetChain"; + case 4: + return "Nvls"; + case 5: + return "NvlsTree"; + } + return NULL; +} + +__hidden const char* ncclProtoToString(int proto) { + switch(proto) { + case 0: + return "LL"; + case 1: + return "LL128"; + case 2: + return "Simple"; + } + return NULL; +} + +// FIXME: chrome tracing asynchronous events (used below) allow event nesting for events that have the same id and category +// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a +// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET) +static __thread int groupId; +__hidden void printGroupEventHeader(FILE* fh, struct group* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n", + "Group", groupId, getpid(), 1, event->startTs, event->groupId); +} + +__hidden void printGroupEventTrailer(FILE* fh, struct group* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Group", groupId++, getpid(), 1, event->stopTs); +} + +static __thread int collId; +__hidden void printCollEventHeader(FILE* fh, struct collective* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", + ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels); +} + +__hidden void printCollEventTrailer(FILE* fh, struct collective* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs); +} + +static __thread int p2pId; +__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n", + ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); +} + +__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+ ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs); +} + +static __thread int proxyOpId; +__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) { + if (event->isSend) { + int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted); + int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait); + int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted); + int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n", + "Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp); + } else { + int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted); + int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived); + int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted); + int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n", + "Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp); + } +} + +__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + event->isSend ? 
"Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs); +} + +static __thread int proxyStepId; +__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { + if (event->isSend) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "SendWait", proxyStepId++, getpid(), 1, event->stopTs); + } else { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", + "RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, 
\"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs); + } +} + +static __thread int proxyCtrlId; +__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { + const char* str; + if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) { + str = "Idle"; + } else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) { + str = "Sleep"; + } else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) { + str = "Append"; + } + if (event->state == ncclProfilerProxyCtrlAppendEnd) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n", + str, proxyCtrlId, getpid(), 1, event->startTs, event->appended); + } else { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + str, proxyCtrlId, getpid(), 1, event->startTs); + } + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + str, proxyCtrlId++, getpid(), 1, event->stopTs); +} + +//#define DEBUG_EVENTS +void debugEvent(void* eHandle, const char* tag) { +#ifdef DEBUG_EVENTS + char filename[64] = { 0 }; + sprintf(filename, "EventDebug-%d", getpid()); + FILE* fh = fopen(filename, "a+"); + uint8_t type = *(uint8_t *)eHandle; + if (type == ncclProfileGroup) { + struct group* event = (struct group *)eHandle; + fprintf(fh, "Group event %p tag = %s {\n", event, tag); + fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED)); + fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " stopTs = %f\n", event->stopTs); + fprintf(fh, "}\n"); + } else if (type == ncclProfileColl) { + struct collective* event = (struct collective *)eHandle; + fprintf(fh, "Collective event %p tag = %s {\n", event, tag); + fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); + fprintf(fh, " parent = %p\n", event->base.parent); + for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); + for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + fprintf(fh, " startTs = %f\n", event->base.startTs); + fprintf(fh, " stopTs = %f\n", event->base.stopTs); + fprintf(fh, "}\n"); + } else if (type == ncclProfileP2p) { + struct p2p* event = (struct p2p *)eHandle; + fprintf(fh, "P2p event %p tag = %s {\n", event, tag); + fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); + fprintf(fh, " parent = %p\n", event->base.parent); + fprintf(fh, " op = %p\n", &event->op); + fprintf(fh, " startTs = %f\n", event->base.startTs); + fprintf(fh, " stopTs = %f\n", event->base.stopTs); + fprintf(fh, "}\n"); + } else if (type == ncclProfileProxyOp) { + struct proxyOp* event = (struct proxyOp *)eHandle; + fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag); + fprintf(fh, " type = %s\n", event->isSend ? 
"Send" : "Recv"); + fprintf(fh, " channel = %d\n", event->channelId); + fprintf(fh, " parent = %p\n", event->parent); + fprintf(fh, " rank = %d\n", event->rank); + fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " stopTs = %f\n", event->stopTs); + fprintf(fh, "}\n"); + } else if (type == ncclProfileProxyStep) { + struct proxyStep* event = (struct proxyStep *)eHandle; + fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag); + fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv"); + fprintf(fh, " parent = %p\n", event->parent); + fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " stopTs = %f\n", event->stopTs); + fprintf(fh, "}\n"); + } + fclose(fh); +#endif +} + +void printEvent(FILE* fh, void* handle) { + if (handle == NULL || fh == NULL) return; + uint8_t type = *(uint8_t *)handle; + if (type == ncclProfileGroup) { + struct group* g = (struct group *)handle; + printGroupEventHeader(fh, g); + struct taskEventBase* base = taskEventQueueHead(g); + while (base) { + struct taskEventBase* next = base->next; + printEvent(fh, base); + base = next; + } + printGroupEventTrailer(fh, g); + } else if (type == ncclProfileColl) { + struct collective* c = (struct collective *)handle; + printCollEventHeader(fh, c); + for (int i = 0; i < MAX_CHANNELS; i++) { + printEvent(fh, &c->send[i]); + printEvent(fh, &c->recv[i]); + } + printCollEventTrailer(fh, c); + } else if (type == ncclProfileP2p) { + struct p2p* p = (struct p2p *)handle; + printP2pEventHeader(fh, p); + printEvent(fh, &p->op); + printP2pEventTrailer(fh, p); + } else if (type == ncclProfileProxyOp) { + struct proxyOp* p = (struct proxyOp *)handle; + printProxyOpEventHeader(fh, p); + for (int i = 0; i < MAX_STEPS; i++) { + printEvent(fh, &p->step[i]); + } + printProxyOpEventTrailer(fh, p); + } else if (type == ncclProfileProxyStep) { + struct proxyStep* p = (struct proxyStep *)handle; + printProxyStepEvent(fh, p); + } else if (type == ncclProfileProxyCtrl) { + struct proxyCtrl* p = (struct proxyCtrl *)handle; + printProxyCtrlEvent(fh, p); + } + return; +} diff --git a/ext-profiler/example/print_event.h b/ext-profiler/example/print_event.h new file mode 100644 index 0000000..8e2db4c --- /dev/null +++ b/ext-profiler/example/print_event.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PRINT_EVENT_H_ +#define PRINT_EVENT_H_ + +void debugEvent(void* eHandle, const char* tag); +void printEvent(FILE* fh, void* handle); + +#endif diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index a1f18d3..aafabd7 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -27,7 +27,7 @@ typedef enum { ncclNumFuncs = 8 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 @@ -35,6 +35,7 @@ typedef enum { #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 diff --git a/makefiles/common.mk b/makefiles/common.mk index a037cf3..59e4151 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -10,6 +10,7 @@ VERBOSE ?= 0 KEEP ?= 0 DEBUG ?= 0 ASAN ?= 0 +UBSAN ?= 0 TRACE ?= 0 PROFAPI ?= 1 NVTX ?= 1 @@ -93,6 +94,12 @@ LDFLAGS += -fsanitize=address -static-libasan NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan endif +ifneq ($(UBSAN), 0) +CXXFLAGS += -fsanitize=undefined +LDFLAGS += -fsanitize=undefined -static-libubsan +NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan +endif + ifneq ($(VERBOSE), 0) NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra diff --git a/makefiles/version.mk b/makefiles/version.mk index 9039cb7..bcc0ff3 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 22 -NCCL_PATCH := 3 +NCCL_MINOR := 23 +NCCL_PATCH := 4 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/bootstrap.cc b/src/bootstrap.cc index a7d7754..c1d085e 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -14,6 +14,67 @@ #include "proxy.h" #include "param.h" +#define BOOTSTRAP_N_CHECK_ABORT 10000 +#define BOOTSTRAP_TAG_CONNECT (0x1 << 31) +#define BOOTSTRAP_TAG_ALLGATHER (0x1 << 30) +#define BOOTSTRAP_TAG_COMMSPLIT (0x1 << 29) +#define BOOTSTRAP_TAG_INTRANODE_ALLGATHER (0x1 << 28) + +#define BOOTSTRAP_INIT_TIME_CREATE 0 +#define BOOTSTRAP_INIT_TIME_SEND 1 +#define BOOTSTRAP_INIT_TIME_RECV 2 +#define BOOTSTRAP_INIT_TIME_RING 3 +#define BOOTSTRAP_INIT_TIME_TOTAL 4 +#define BOOTSTRAP_INIT_TIME_DELAY 5 +#define BOOTSTRAP_INIT_TIME_N 6 +#define BOOTSTRAP_INIT_ROOT_WAIT 0 +#define BOOTSTRAP_INIT_ROOT_SEND 1 +#define BOOTSTRAP_INIT_ROOT_RECV 2 +#define BOOTSTRAP_INIT_ROOT_N 3 +#define BOOTSTRAP_PROF_OPEN(time) \ + do { \ + time = clockNano(); \ + } while (0) +#define BOOTSTRAP_PROF_CLOSE(time) \ + do { \ + time = clockNano() - time; \ + } while (0) + +#define BOOTSTRAP_PID(i, n) (((i) + (n)) % (n)) +// returns the first rank associated to the root. 
must have root >=0 +// if root >= n_roots, it does NOT assume periodicity +// e.g. with nRanks=10 and nRoots=3, the roots own ranks {0-3}, {4-6} and {7-9} +static int firstRankFromRoot(int root, int n_ranks, int nRoots) { + return root * (n_ranks / nRoots) + std::min(root, n_ranks % nRoots); +} +// returns the root of a rank, must have rank >=0 +// if rank >= n_ranks, it does NOT assume periodicity +static int rootIdFromRank(int rank, int nRanks, int nRoots) { + int rmr = nRanks % nRoots; // nRanks mod nRoots + int rpr = nRanks / nRoots; // nRanks per root + int D = rmr * (rpr + 1); + if (rank < D) + return rank / (rpr + 1); + else + return (rank - D) / rpr + rmr; +} +// returns the number of child ranks for a root; the root index is periodized +static int nRankFromRoot(int root, int nRanks, int nRoots) { + int ir = BOOTSTRAP_PID(root, nRoots); + int rmr = nRanks % nRoots; // nRanks mod nRoots + int rpr = nRanks / nRoots; // nRanks per root + return rpr + ((ir < rmr) ? 1 : 0); +} +// returns the local id of a given rank for a given root +// the root index is periodized, the rank is not +static int localIdFromRoot(int rank, int root, int nRanks, int nRoots) { + int ir = BOOTSTRAP_PID(root, nRoots); + return rank - firstRankFromRoot(ir, nRanks, nRoots); +} +// returns whether the rank is the first rank associated with the (periodized) root +static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) { + return (rank == firstRankFromRoot(root, nRanks, nRoots)); +} struct bootstrapRootArgs { struct ncclSocket* listenSock; uint64_t magic; @@ -25,6 +86,8 @@ static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; +NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0); + ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { pthread_mutex_lock(&bootstrapNetLock); @@ -53,7 +116,7 @@ ncclResult_t bootstrapNetInit() { char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; sprintf(line, " %s:", bootstrapNetIfName); ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); - INFO(NCCL_INIT, "Bootstrap : Using%s", line); + INFO(NCCL_BOOTSTRAP, "Bootstrap : Using%s", line); bootstrapNetInitDone = 1; } pthread_mutex_unlock(&bootstrapNetLock); @@ -64,40 +127,119 @@ ncclResult_t bootstrapNetInit() { /* Socket Interface Selection type */ enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; -// Additional sync functions -static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) { - NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int))); - NCCLCHECK(ncclSocketSend(sock, data, size)); +// check abort function +static ncclResult_t checkAbort(volatile uint32_t* flag, int* cntr) { + if ((*cntr % BOOTSTRAP_N_CHECK_ABORT) == 0) { + if (flag && __atomic_load_n(flag, __ATOMIC_ACQUIRE)) { + TRACE(NCCL_BOOTSTRAP, "bootstrap: abort called"); + return ncclInternalError; + } + } + *cntr = (*cntr + 1) % BOOTSTRAP_N_CHECK_ABORT; return ncclSuccess; } -static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) { +// send/recv functions +static ncclResult_t netReg(ncclNet_t* net, void* comm, void* data, int size, void** handle) { + NCCLCHECK(net->regMr(comm, data, size, NCCL_PTR_HOST, handle)); + return ncclSuccess; +} +static ncclResult_t netDereg(ncclNet_t* net, void* comm, void** handle) { + NCCLCHECK(net->deregMr(comm, *handle)); + *handle = NULL; + return ncclSuccess; +} +static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int size, void* dataHandle, int tag, void** sendReq, + int* done) { + if (*done) return ncclSuccess; + if (!*sendReq) { +
NCCLCHECK(net->isend(sendComm, data, size, tag, dataHandle, sendReq)); + } + if (*sendReq) { + NCCLCHECK(net->test(*sendReq, done, NULL)); + if (*done) { + *sendReq = NULL; + } + } + return ncclSuccess; +} +static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int size, void* dataHandle, int tag, void** recvReq, + int* done) { + if (*done) return ncclSuccess; + if (!*recvReq) { + NCCLCHECK(net->irecv(recvComm, 1, &data, &size, &tag, &dataHandle, recvReq)); + } + if (*recvReq) { + NCCLCHECK(net->test(*recvReq, done, NULL)); + if (*done) { + *recvReq = NULL; + } + } + return ncclSuccess; +} +static ncclResult_t netSendRecv(ncclNet_t* net, void* sendComm, void* sendData, int sendSize, void* sendDataHandle, void* recvComm, + void* recvData, int recvSize, void* recvDataHandle, int tag, volatile uint32_t* abortFlag) { + int abortCounter = 0; + int doneSend = 0, doneRecv = 0; + void *sendReq = NULL, *recvReq = NULL; + do { + NCCLCHECK(checkAbort(abortFlag, &abortCounter)); + if (!doneRecv) { + NCCLCHECK(netIrecv(net, recvComm, recvData, recvSize, recvDataHandle, tag, &recvReq, &doneRecv)); + } + if (!doneSend) { + NCCLCHECK(netIsend(net, sendComm, sendData, sendSize, sendDataHandle, tag, &sendReq, &doneSend)); + } + } while (!doneSend || !doneRecv); + return ncclSuccess; +} + +// Additional socket based functions, first send the size, then send the message +static ncclResult_t socketSend(struct ncclSocket* sock, void* data, int size) { + NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int))); + if (size > 0) + NCCLCHECK(ncclSocketSend(sock, data, size)); + return ncclSuccess; +} +static ncclResult_t socketRecv(struct ncclSocket* sock, void* data, int size) { int recvSize; NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { WARN("Message truncated : received %d bytes instead of %d", recvSize, size); return ncclInternalError; } - NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size))); + int actualSize = std::min(recvSize, size); + if (actualSize > 0) + NCCLCHECK(ncclSocketRecv(sock, data, actualSize)); return ncclSuccess; } -static ncclResult_t bootstrapNetSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock, void* recvData, int recvSize) { +static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock, + void* recvData, int recvSize) { int senderRecvSize; NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int))); if (senderRecvSize > recvSize) { WARN("Message truncated : received %d bytes instead of %d", senderRecvSize, recvSize); return ncclInternalError; } - NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, recvSize)); + NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, std::min(recvSize, senderRecvSize))); return ncclSuccess; } -struct extInfo { - int rank; - int nranks; - union ncclSocketAddress extAddressListenRoot; - union ncclSocketAddress extAddressListen; +union ringConnectInfo { + union ncclSocketAddress addr; + char handle[NCCL_NET_HANDLE_MAXSIZE]; }; +struct extInfo { + int rank; // rank of the process reaching out + int nranks; // total number of ranks + int iroot; // current root index + int nroots; // total number of roots + union ncclSocketAddress listenRootAddress; // address of my listenSocket for the root + union ringConnectInfo connectInfo; +}; +#define NET_HANDLE(h, rank) ((h) + (rank * NCCL_NET_HANDLE_MAXSIZE)) +#define 
BOOTSTRAP_HANDLE(h, i) ((struct ncclBootstrapHandle*)((char*)h + i * NCCL_UNIQUE_ID_BYTES)) + #include static ncclResult_t setFilesLimit() { @@ -108,95 +250,148 @@ static ncclResult_t setFilesLimit() { return ncclSuccess; } -static void *bootstrapRoot(void* rargs) { +static ncclResult_t rootSend(union ncclSocketAddress* addr, uint64_t magic, union ringConnectInfo* info) { + ncclResult_t res = ncclSuccess; + struct ncclSocket sock; + NCCLCHECKGOTO(ncclSocketInit(&sock, addr, magic, ncclSocketTypeBootstrap), res, fail); + NCCLCHECKGOTO(ncclSocketConnect(&sock), res, fail); + NCCLCHECKGOTO(socketSend(&sock, info, sizeof(union ringConnectInfo)), res, fail); + NCCLCHECK(ncclSocketClose(&sock)); + return res; +fail: + (void)ncclSocketClose(&sock); + return res; +} +static void* bootstrapRoot(void* rargs) { + uint64_t timers[BOOTSTRAP_INIT_ROOT_N] = {0}; struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs; struct ncclSocket* listenSock = args->listenSock; uint64_t magic = args->magic; ncclResult_t res = ncclSuccess; int nranks = 0, c = 0; + int iroot = 0, nroots = 0, localId = 0; + int nrecv = 0, n2send = 0; struct extInfo info; - union ncclSocketAddress *rankAddresses = NULL; - union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange - union ncclSocketAddress *zero = NULL; - NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out); + union ringConnectInfo* rankInfo = NULL; + union ncclSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange + // get zeros for comparison + char zeroHandle[NCCL_NET_HANDLE_MAXSIZE]; + union ncclSocketAddress zeroAddress; + union ringConnectInfo zeroInfo; + memset(&zeroAddress, 0, sizeof(union ncclSocketAddress)); + memset(&zeroHandle, 0, NCCL_NET_HANDLE_MAXSIZE); + memset(&zeroInfo, 0, sizeof(union ringConnectInfo)); setFilesLimit(); - TRACE(NCCL_INIT, "BEGIN"); + TRACE(NCCL_BOOTSTRAP, "BEGIN"); + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_ROOT_WAIT]); /* Receive addresses from all ranks */ do { struct ncclSocket sock; NCCLCHECKGOTO(ncclSocketInit(&sock), res, out); NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); - NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); + NCCLCHECKGOTO(socketRecv(&sock, &info, sizeof(info)), res, out); NCCLCHECKGOTO(ncclSocketClose(&sock), res, out); if (c == 0) { + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_WAIT]); + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_ROOT_RECV]); nranks = info.nranks; - NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out); - NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out); + iroot = info.iroot; + nroots = info.nroots; + // if the number of root > 1, we will receive one extra info from the first local_id of the next root + n2send = nRankFromRoot(iroot, nranks, nroots); + nrecv = n2send + ((nroots > 1) ? 
1 : 0); + NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv * sizeof(union ringConnectInfo)), res, out); + NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out); } - if (nranks != info.nranks) { - WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks); + if (nranks != info.nranks || nroots != info.nroots || iroot != info.iroot) { + WARN("Bootstrap Root : mismatch in info from procs, nranks %d vs %d, nroots %d vs %d, iroot %d vs %d", nranks, info.nranks, nroots, info.nroots, iroot, info.iroot); goto out; } - if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) { + localId = localIdFromRoot(info.rank, iroot, nranks, nroots); + if (memcmp(&zeroAddress, &rankAddressesRoot[localId], sizeof(union ncclSocketAddress)) != 0 || + memcmp(&zeroInfo, &rankInfo[localId], sizeof(union ringConnectInfo)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); goto out; } - - // Save the connection handle for that rank - memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress)); - memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress)); - + // if the previous rank has already checked in, send it the newly received handle; if not, save the handle for later + // if we have more than 1 root, this root does not own the predecessor of local_id = 0 + // if prev is out of [0, n2send), we do not send anything + int prev = (nroots > 1) ? (localId - 1) : BOOTSTRAP_PID(localId - 1, nrecv); + if (prev >= 0 && prev < n2send && memcmp(&zeroAddress, &rankAddressesRoot[prev], sizeof(union ncclSocketAddress)) != 0) { + NCCLCHECKGOTO(rootSend(&rankAddressesRoot[prev], magic, &info.connectInfo), res, out); + } else { + memcpy(&rankInfo[localId], &info.connectInfo, sizeof(union ringConnectInfo)); + } + // if the next rank has checked in, send it the newly received info; if not, save the addr for later + // for nroots >= 1, this root always owns the information of the next connection + // the local_id must be in [0, n2send) otherwise we do not answer + int next = BOOTSTRAP_PID(localId + 1, nrecv); + if (localId >= 0 && localId < n2send && memcmp(&zeroInfo, &rankInfo[next], sizeof(union ringConnectInfo)) != 0) { + NCCLCHECKGOTO(rootSend(&info.listenRootAddress, magic, &rankInfo[next]), res, out); + } else { + memcpy(rankAddressesRoot + localId, &info.listenRootAddress, sizeof(union ncclSocketAddress)); + } ++c; - TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); - } while (c < nranks); - TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks); + TRACE(NCCL_BOOTSTRAP, "Received connect from rank %d total %d/%d", info.rank, c, nrecv); + } while (c < nrecv); + TRACE(NCCL_BOOTSTRAP, "COLLECTED ALL %d HANDLES", nrecv); + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_RECV]); - // Send the connect handle for the next rank in the AllGather ring - for (int r=0; r1 roots we will send the additional one we have received + int next = BOOTSTRAP_PID(r + 1, nrecv); + if (memcmp(&zeroAddress, &rankAddressesRoot[r], sizeof(union ncclSocketAddress)) != 0 && + memcmp(&zeroInfo, &rankInfo[next], sizeof(union ringConnectInfo)) != 0) { + NCCLCHECKGOTO(rootSend(&rankAddressesRoot[r], magic, &rankInfo[next]), res, out); + } } - TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks); - + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_SEND]); + TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "Root timings (wait %f, recv %f, send %f)", timers[BOOTSTRAP_INIT_ROOT_WAIT] / 1e9,
timers[BOOTSTRAP_INIT_ROOT_RECV] / 1e9, timers[BOOTSTRAP_INIT_ROOT_SEND] / 1e9); out: if (listenSock != NULL) { - ncclSocketClose(listenSock); + (void)ncclSocketClose(listenSock); free(listenSock); } - if (rankAddresses) free(rankAddresses); - if (rankAddressesRoot) free(rankAddressesRoot); - if (zero) free(zero); + if (rankInfo) + free(rankInfo); + if (rankAddressesRoot) + free(rankAddressesRoot); free(rargs); - TRACE(NCCL_INIT, "DONE"); + TRACE(NCCL_BOOTSTRAP, "DONE"); return NULL; } ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) { - struct ncclSocket* listenSock; - struct bootstrapRootArgs* args; + ncclResult_t ret = ncclSuccess; + struct ncclSocket* listenSock = NULL; + struct bootstrapRootArgs* args = NULL; pthread_t thread; NCCLCHECK(ncclCalloc(&listenSock, 1)); - NCCLCHECK(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0)); - NCCLCHECK(ncclSocketListen(listenSock)); - NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr)); + NCCLCHECKGOTO(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0), ret, fail); + NCCLCHECKGOTO(ncclSocketListen(listenSock), ret, fail); + NCCLCHECKGOTO(ncclSocketGetAddr(listenSock, &handle->addr), ret, fail); - NCCLCHECK(ncclCalloc(&args, 1)); + NCCLCHECKGOTO(ncclCalloc(&args, 1), ret, fail); args->listenSock = listenSock; args->magic = handle->magic; - NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); + PTHREADCHECKGOTO(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), "pthread_create", ret, fail); ncclSetThreadName(thread, "NCCL BootstrapR"); - NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d - return ncclSuccess; + PTHREADCHECKGOTO(pthread_detach(thread), "pthread_detach", ret, fail); // will not be pthread_join()'d +exit: + return ret; +fail: + if (listenSock) free(listenSock); + if (args) free(args); + goto exit; } ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { @@ -226,218 +421,419 @@ struct unexConn { struct unexConn* next; }; +struct bootstrapRing_t { + union { + struct { + void *sendComm, *recvComm; + ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle; + } net; + struct { + struct ncclSocket recv; + struct ncclSocket send; + } socket; + }; +}; +struct bootstrapListen_t { + struct ncclSocket peerSocket; // socket for peers to contact me in P2P + union { + struct { + int dev; + void* comm; + char handle[NCCL_NET_HANDLE_MAXSIZE]; + } net; + struct ncclSocket socket; // socket to be used for the ring + }; +}; + struct bootstrapState { - struct ncclSocket listenSock; - struct ncclSocket ringRecvSocket; - struct ncclSocket ringSendSocket; - union ncclSocketAddress* peerCommAddresses; - union ncclSocketAddress* peerProxyAddresses; + struct bootstrapRing_t ring; + struct bootstrapListen_t listen; + ncclNet_t* net; uint64_t* peerProxyAddressesUDS; + union ncclSocketAddress* peerProxyAddresses; + union ncclSocketAddress* peerP2pAddresses; struct unexConn* unexpectedConnections; int cudaDev; int rank; int nranks; uint64_t magic; - volatile uint32_t *abortFlag; + volatile uint32_t* abortFlag; }; +#define STATE_RING(s, f) (s->ring.f) +#define STATE_LISTEN(s, f) (s->listen.f) -ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) { +// helper functions +static ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr, + ncclSocketType type) { + 
NCCLCHECK(ncclSocketInit(socket, &bootstrapNetIfAddr, magic, type, comm->abortFlag)); + NCCLCHECK(ncclSocketListen(socket)); + NCCLCHECK(ncclSocketGetAddr(socket, addr)); + return ncclSuccess; +} +static ncclResult_t getUDS(uint64_t* peerUDS) { + uint64_t randId; + NCCLCHECK(getRandomData(&randId, sizeof(randId))); + *peerUDS = getPidHash() + randId; + return ncclSuccess; +} +#define MAX_OOB_DEVS 16 +static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { + static int devOOB = -1; + if (devOOB < 0) { + pthread_mutex_lock(&bootstrapNetLock); + if (devOOB < 0) { + char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME"); + if (userIfEnv && strlen(userIfEnv) > 0) { + INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv); + bool searchNot = userIfEnv && userIfEnv[0] == '^'; + if (searchNot) userIfEnv++; + bool searchExact = userIfEnv && userIfEnv[0] == '='; + if (searchExact) userIfEnv++; + struct netIf userIfs[MAX_OOB_DEVS]; + int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS); + // loop over the device and return the first one matching + int devId = 0; + int nDev = 0; + NCCLCHECK(comm->ncclNet->devices(&nDev)); + while (devId < nDev) { + ncclNetProperties_t props; + comm->ncclNet->getProperties(devId, &props); + // check against user specified HCAs/ports + bool found = matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot; + if (found) { + devOOB = devId; + break; + } + devId++; + } + if (devOOB == -1) { + WARN("no device found matching NCCL_OOB_NET_IFNAME=%s, ignoring", userIfEnv); + goto noEnv; + } + } else { + noEnv: + // default choice is device 0 + devOOB = 0; + } + } + pthread_mutex_unlock(&bootstrapNetLock); + } + *dev = devOOB; + return ncclSuccess; +} + +static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE], + void** sendComm, ncclNetDeviceHandle_t** sendDevHandle, + void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) { + + int abortCounter = 0; + do { + NCCLCHECK(checkAbort(abortFlag, &abortCounter)); + if (!*sendComm) + NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle)); + if (!*recvComm) + NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); + } while (!*sendComm || !*recvComm); + return ncclSuccess; +} +static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag) { + NCCLCHECK(ncclSocketInit(sendSocket, addr, magic, ncclSocketTypeBootstrap, abortFlag)); + NCCLCHECK(ncclSocketConnect(sendSocket)); + NCCLCHECK(ncclSocketInit(recvSocket)); + NCCLCHECK(ncclSocketAccept(recvSocket, listenSock)); + return ncclSuccess; +} +static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state, + union ncclSocketAddress* peerAddresss, + union ncclSocketAddress* peerProxy, uint64_t* peerUDS) { + ncclResult_t res = ncclSuccess; + int rank = comm->rank; + int nRanks = comm->nRanks; + struct bootstrapRingData { + union ncclSocketAddress peerAddress; + union ncclSocketAddress peerProxy; + uint64_t peerUDS; + }* ringData = NULL; + + NCCLCHECK(ncclCalloc(&ringData, nRanks)); + // pack + if (peerAddresss) + memcpy(&(ringData[rank].peerAddress), peerAddresss + rank, sizeof(union ncclSocketAddress)); + if (peerProxy) + memcpy(&(ringData[rank].peerProxy), peerProxy + rank, sizeof(union ncclSocketAddress)); + if (peerUDS) + 
memcpy(&(ringData[rank].peerUDS), peerUDS + rank, sizeof(uint64_t)); + + // allgather + NCCLCHECKGOTO(bootstrapAllGather(state, ringData, sizeof(struct bootstrapRingData)), res, exit); + + // unpack + for (int irank = 0; irank < nRanks; ++irank) { + if (peerAddresss) + memcpy(peerAddresss + irank, &(ringData[irank].peerAddress), sizeof(union ncclSocketAddress)); + if (peerProxy) + memcpy(peerProxy + irank, &(ringData[irank].peerProxy), sizeof(union ncclSocketAddress)); + if (peerUDS) + memcpy(peerUDS + irank, &(ringData[irank].peerUDS), sizeof(uint64_t)); + } + +exit: + free(ringData); + return ncclSuccess; +} + +static ncclResult_t sendToRoot(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct extInfo* info) { + ncclResult_t ret = ncclSuccess; + struct ncclSocket sock; + NCCLCHECK(ncclSocketInit(&sock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, comm->abortFlag)); + NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail); + NCCLCHECKGOTO(socketSend(&sock, info, sizeof(struct extInfo)), ret, fail); + NCCLCHECK(ncclSocketClose(&sock)); + return ret; +fail: + (void)ncclSocketClose(&sock); + return ret; +} + +NCCL_PARAM(StaggerRate, "UID_STAGGER_RATE", 7000); +NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256); + +ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { int rank = comm->rank; int nranks = comm->nRanks; + // char nextPeerHandle[NCCL_NET_HANDLE_MAXSIZE]; struct bootstrapState* state; struct ncclSocket* proxySocket; - ncclSocketAddress nextAddr; struct ncclSocket sock, listenSockRoot; - struct extInfo info = { 0 }; + struct extInfo info = {0}; + union ringConnectInfo nextPeer; + + uint64_t timers[BOOTSTRAP_INIT_TIME_N] = {0}; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; state->nranks = nranks; + state->cudaDev = comm->cudaDev; state->abortFlag = comm->abortFlag; + state->net = comm->ncclNet; comm->bootstrap = state; - comm->magic = state->magic = handle->magic; + comm->magic = state->magic = BOOTSTRAP_HANDLE(handles, 0)->magic; // state and comm magic set to the first magic ID - TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d", rank, nranks); - info.rank = rank; + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_TOTAL]); + // fill up the info info.nranks = nranks; - // Create socket for other ranks to contact me - NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); - NCCLCHECK(ncclSocketListen(&state->listenSock)); - NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen)); - - // Create socket for root to contact me - NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); - NCCLCHECK(ncclSocketListen(&listenSockRoot)); - NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); + info.nroots = nHandles; + // get the ring connection info + memset(&nextPeer, 0, sizeof(union ringConnectInfo)); + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_CREATE]); + if (ncclParamBootstrapNetEnable()) { + // Create net interface for other ranks to contact me (all gather) + NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev))); + NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm))); + memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); + } else { + // create socket for ring neightbor to contact mee + 
NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, socket), &info.connectInfo.addr, ncclSocketTypeBootstrap)); + } + // Create socket for root to contact me using the root's magic + int curr_root = rootIdFromRank(rank, nranks, nHandles); + NCCLCHECK(createListenSocket(comm, BOOTSTRAP_HANDLE(handles, curr_root)->magic, &listenSockRoot, &info.listenRootAddress, ncclSocketTypeBootstrap)); + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_CREATE]); // stagger connection times to avoid an overload of the root - if (nranks > 128) { - long msec = rank; + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_DELAY]); + int nRankRoot = nRankFromRoot(curr_root, nranks, nHandles); + if (nRankRoot > ncclParamStaggerThreshold()) { + // for socket the message rate in microsec + double msg_rate = ncclParamStaggerRate() / 1.0e6; + long musec = localIdFromRoot(rank, curr_root, nranks, nHandles) / msg_rate; struct timespec tv; - tv.tv_sec = msec / 1000; - tv.tv_nsec = 1000000 * (msec % 1000); - TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); - (void) nanosleep(&tv, NULL); + long c_1e6 = 1e6; + tv.tv_sec = musec / c_1e6; + tv.tv_nsec = 1e3 * (musec % c_1e6); + TRACE(NCCL_BOOTSTRAP, "rank %d delaying connection to root by %ld microsec", rank, musec); + (void)nanosleep(&tv, NULL); } + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_DELAY]); // send info on my listening socket to root - NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); - NCCLCHECK(ncclSocketConnect(&sock)); - NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); - NCCLCHECK(ncclSocketClose(&sock)); + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_SEND]); + // send contact info to my own root + info.rank = rank; + info.iroot = curr_root; + NCCLCHECK(sendToRoot(BOOTSTRAP_HANDLE(handles, curr_root), comm, &info)); + // if needed, send the connection info to the previous root + if (nHandles > 1 && isFirstFromRoot(rank, curr_root, nranks, nHandles)) { + int prev_rank = BOOTSTRAP_PID(rank - 1, nranks); + int prev_root = rootIdFromRank(prev_rank, nranks, nHandles); + info.rank = prev_rank + 1; // my rank as seen by the previous root + info.iroot = prev_root; + NCCLCHECK(sendToRoot(BOOTSTRAP_HANDLE(handles, prev_root), comm, &info)); + } + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_SEND]); // get info on my "next" rank in the bootstrap ring from root + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RECV]); NCCLCHECK(ncclSocketInit(&sock)); NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot)); - NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress))); + NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer))); NCCLCHECK(ncclSocketClose(&sock)); NCCLCHECK(ncclSocketClose(&listenSockRoot)); + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RECV]); - NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); - NCCLCHECK(ncclSocketConnect(&state->ringSendSocket)); - // Accept the connect request from the previous rank in the AllGather ring - NCCLCHECK(ncclSocketInit(&state->ringRecvSocket)); - NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock)); + // accept and connect the ring network + if (ncclParamBootstrapNetEnable()) { + NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle, + &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle), + &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), 
state->abortFlag)); + } else { + NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag)); + } // AllGather all listen handlers - NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks)); - NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank)); - NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress))); - - // Create the service proxy + // in case of failure, those resources will be free'd when calling bootstrapDestroy, so we can return immediatly NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); - NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks)); - - // proxy is aborted through a message; don't set abortFlag NCCLCHECK(ncclCalloc(&proxySocket, 1)); - NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag)); - NCCLCHECK(ncclSocketListen(proxySocket)); - NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank)); - NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); - // cuMem UDS support - // Make sure we create a unique UDS socket name - uint64_t randId; - NCCLCHECK(getRandomData(&randId, sizeof(randId))); - state->peerProxyAddressesUDS[rank] = getPidHash()+randId; - NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS))); + NCCLCHECK(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy)); + + NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks)); + NCCLCHECK(getUDS(state->peerProxyAddressesUDS + rank)); + + // create a socket for others to reach out (P2P) + union ncclSocketAddress peerSocketAddress; + NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap)); + NCCLCHECK(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress))); + memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); + + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RING]); + NCCLCHECK(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS)); + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RING]); + + // Create the service proxy and get the UDS NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS)); - TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]); + TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks); + INFO(NCCL_BOOTSTRAP | NCCL_PROFILE, "Bootstrap timings total %f (create %f, send %f, recv %f, ring %f, delay %f)", timers[BOOTSTRAP_INIT_TIME_TOTAL] / 1e9, + timers[BOOTSTRAP_INIT_TIME_CREATE] / 1e9, + timers[BOOTSTRAP_INIT_TIME_SEND] / 1e9, + timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9, + timers[BOOTSTRAP_INIT_TIME_RING] / 1e9, + timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9); return ncclSuccess; } -ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { +ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; int prev, next; - ncclSocketAddress listenAddr, 
tmpAddr; - struct ncclSocket* proxySocket; + union ringConnectInfo info; + union ringConnectInfo nextPeer; + struct ncclSocket* proxySocket = NULL; struct bootstrapState* state; NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail); state->rank = rank; state->nranks = nranks; + state->cudaDev = comm->cudaDev; state->abortFlag = comm->abortFlag; + state->net = comm->ncclNet; comm->bootstrap = state; - comm->magic = state->magic = handle->magic; + comm->magic = state->magic = magic; - prev = parentRanks[(rank-1+nranks)%nranks]; - next = parentRanks[(rank+1)%nranks]; + prev = parentRanks[(rank - 1 + nranks) % nranks]; + next = parentRanks[(rank + 1) % nranks]; - // Setup my sockets for the allgather ring and other p2p connections - NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); - NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); + // create a handle for the others to reach out to me + if (ncclParamBootstrapNetEnable()) { + NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail); + NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail); + memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); + } else { + // create socket for ring neightbor to contact mee + NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, socket), &info.addr, ncclSocketTypeBootstrap)); + } + // create a socket for others to reach out (P2P) + union ncclSocketAddress peerSocketAddress; + NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap)); - // Create socket for other ranks to contact me - NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail); - - // Get addr from next rank - NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail); - NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail); - NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail); - - NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); - NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail); - // Accept the connect request from the previous rank in the AllGather ring - NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail); - - // AllGather all listen handlers - NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail); - memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress)); - NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail); + // Get addr from next rank using the parent's connections + NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail); + NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail); + if (ncclParamBootstrapNetEnable()) { + NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle, + &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle), + &STATE_RING(state, net.recvComm), &STATE_RING(state, 
net.recvDevHandle), state->abortFlag), + ret, fail); + } else { + NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag)); + } + NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)), ret, fail); + memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); if (parent->config.splitShare) { /* map local rank to top parent local rank. */ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; } + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL), ret, fail); } else { - // Create the service proxy NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); - NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail); - NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail); - NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail); - memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress)); - NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail); - // cuMem UDS support NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail); - // Make sure we create a unique UDS socket name - uint64_t randId; - NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail); - state->peerProxyAddressesUDS[rank] = getPidHash()+randId; - NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail); + // Create the service proxy and get the UDS + NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); + NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), ret, fail); + NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), ret, fail); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); } - INFO(NCCL_INIT, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, color, key, prev, next); + TRACE(NCCL_BOOTSTRAP, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, + color, key, prev, next); exit: return ret; fail: + free(proxySocket); goto exit; } -// Bootstrap send/receive functions -// -// We do not keep connections opened with all ranks at all times, and we have no guarantee -// that connections to our unique listen socket will arrive in the same order as we need -// them. Therefore, when establishing a connection, the sender sends a (peer, tag) tuple to -// allow the receiver to identify the flow, and keep it in an unexpected queue if needed. 
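To make the (peer, tag) matching described above concrete: since bootstrap connections can arrive in any order, the sender prefixes every message with its rank and a tag, and the receiver parks non-matching sockets in the unexpected-connection queue until somebody asks for them. A minimal usage sketch, assuming an already initialized comm->bootstrap; the exampleMsg struct and exampleTag value are hypothetical, while bootstrapSend/bootstrapRecv are the functions declared in this file. This is an illustration only, not part of the diff.

  // Exchange a small value with both ring neighbors over the bootstrap network.
  struct exampleMsg { int value; };        // hypothetical payload
  const int exampleTag = 0x1234;           // hypothetical tag; must match on both ends
  struct exampleMsg out = { rank }, in = { -1 };
  int next = (rank + 1) % nranks;
  int prev = (rank - 1 + nranks) % nranks;
  // bootstrapRecv names the peer we expect to hear from; if another rank's
  // connection shows up first, it is queued as "unexpected" and reused later.
  NCCLCHECK(bootstrapSend(comm->bootstrap, next, exampleTag, &out, sizeof(out)));
  NCCLCHECK(bootstrapRecv(comm->bootstrap, prev, exampleTag, &in, sizeof(in)));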
- -ncclResult_t bootstrapConnect(void* commState, int peer, int tag, struct ncclSocket* sock) { +struct socketAckInfo { + int rank; + int tag; +}; +static ncclResult_t socketConnect(void* commState, int peer, int tag, struct ncclSocket* sock) { ncclResult_t ret = ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; - NCCLCHECKGOTO(ncclSocketInit(sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail); + struct socketAckInfo ack = (struct socketAckInfo){.rank = state->rank, .tag = tag}; + NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail); - NCCLCHECKGOTO(bootstrapNetSend(sock, &state->rank, sizeof(int)), ret, fail); - NCCLCHECKGOTO(bootstrapNetSend(sock, &tag, sizeof(int)), ret, fail); + NCCLCHECKGOTO(socketSend(sock, &ack, sizeof(struct socketAckInfo)), ret, fail); return ncclSuccess; fail: - NCCLCHECK(ncclSocketClose(sock)); + (void)ncclSocketClose(sock); return ret; } - ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { ncclResult_t ret = ncclSuccess; struct ncclSocket sock; - TRACE(NCCL_BOOTSTRAP, "Sending to peer=%d tag=%d size=%d", peer, tag, size); - NCCLCHECK(bootstrapConnect(commState, peer, tag, &sock)); - NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, exit); - + NCCLCHECK(socketConnect(commState, peer, tag, &sock)); + NCCLCHECKGOTO(socketSend(&sock, data, size), ret, fail); TRACE(NCCL_BOOTSTRAP, "Sent to peer=%d tag=%d size=%d", peer, tag, size); - -exit: NCCLCHECK(ncclSocketClose(&sock)); return ret; +fail: + (void)ncclSocketClose(&sock); + return ret; } - -ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { +// Bootstrap send/receive functions +static ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { // New unex struct unexConn* unex; NCCLCHECK(ncclCalloc(&unex, 1)); @@ -455,8 +851,7 @@ ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, list->next = unex; return ncclSuccess; } - -ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) { +static ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; *found = 0; @@ -491,10 +886,9 @@ static void unexpectedFree(struct bootstrapState* state) { } // We can't know who we'll receive from, so we need to receive everything at once -ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSocket* sock) { +static ncclResult_t socketAccept(void* commState, int peer, int tag, struct ncclSocket* sock) { ncclResult_t ret = ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; - int newPeer, newTag; // Search unexpected connections first int found; @@ -503,128 +897,203 @@ ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSock // Then look for new connections while (1) { + struct socketAckInfo ack = {0}; NCCLCHECKGOTO(ncclSocketInit(sock), ret, fail); - NCCLCHECKGOTO(ncclSocketAccept(sock, &state->listenSock), ret, fail); - NCCLCHECKGOTO(bootstrapNetRecv(sock, &newPeer, sizeof(int)), ret, fail); - NCCLCHECKGOTO(bootstrapNetRecv(sock, &newTag, sizeof(int)), ret, fail); - if (newPeer == peer && newTag == tag) return 
ncclSuccess; - NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, sock), ret, fail); + NCCLCHECKGOTO(ncclSocketAccept(sock, &STATE_LISTEN(state, peerSocket)), ret, fail); + NCCLCHECKGOTO(socketRecv(sock, &ack, sizeof(struct socketAckInfo)), ret, fail); + if (ack.rank == peer && ack.tag == tag) return ncclSuccess; + NCCLCHECKGOTO(unexpectedEnqueue(state, ack.rank, ack.tag, sock), ret, fail); } return ncclSuccess; fail: - NCCLCHECK(ncclSocketClose(sock)); + (void)ncclSocketClose(sock); return ret; } - // We can't know who we'll receive from, so we need to receive everything at once ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { ncclResult_t ret; struct ncclSocket sock; - NCCLCHECK(bootstrapAccept(commState, peer, tag, &sock)); + NCCLCHECK(socketAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size); - NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, exit); -exit: + NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail); NCCLCHECK(ncclSocketClose(&sock)); return ret; +fail: + (void)ncclSocketClose(&sock); + return ret; } -// Collective algorithms, based on bootstrapSend/Recv, and sometimes bootstrapConnect/Accept - -ncclResult_t bootstrapRingAllGather(struct ncclSocket* prevSocket, struct ncclSocket* nextSocket, int rank, int nranks, char* data, int size) { +static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvComm, int rank, int nranks, char* data, int size, volatile uint32_t* abortFlag) { + ncclResult_t res; + uint64_t tFirst = 0, tRest = 0; + void* sendDataHandle = NULL; + void* recvDataHandle = NULL; + NCCLCHECKGOTO(netReg(net, sendComm, data, nranks * size, &sendDataHandle), res, exit); + NCCLCHECKGOTO(netReg(net, recvComm, data, nranks * size, &recvDataHandle), res, exit); /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from prev * and send previous step's data from (rank-i) to next */ - for (int i=0; irank; int nranks = state->nranks; - TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size); + TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d size %d - AllGather", rank, nranks, size); - NCCLCHECK(bootstrapRingAllGather(&state->ringRecvSocket, &state->ringSendSocket, rank, nranks, (char*)allData, size)); - - TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); - return ncclSuccess; + uint64_t time = 0; + BOOTSTRAP_PROF_OPEN(time); + if (ncclParamBootstrapNetEnable()) { + NCCLCHECKGOTO(netRingAllGather(state->net, STATE_RING(state, net.sendComm), STATE_RING(state, net.recvComm), rank, nranks, (char*)allData, size, state->abortFlag), res, exit); + } else { + NCCLCHECKGOTO(socketRingAllGather(&STATE_RING(state, socket.send), &STATE_RING(state, socket.recv), rank, nranks, (char*)allData, size), res, exit); + } +exit: + BOOTSTRAP_PROF_CLOSE(time); + TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "bootstrapAllGather for %d B done in %f sec: %f MB/sec", size, time / 1e9, (nranks * size / 1e6) / (time / 1e9)); + TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d size %d - AllGather DONE", rank, nranks, size); + return res; } -ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag) { - if (nranks == 1) return ncclSuccess; - TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); - +static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, int nranks, int tag) { + if (nranks == 1) + return ncclSuccess; /* Simple [intra] process 
barrier * * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ int data[1]; - for (int mask=1; maskunexpectedConnections != NULL) { unexpectedFree(state); if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) { @@ -632,26 +1101,31 @@ ncclResult_t bootstrapClose(void* commState) { return ncclInternalError; } } + if (ncclParamBootstrapNetEnable()) { + NCCLCHECK(state->net->closeSend(STATE_RING(state, net.sendComm))); + NCCLCHECK(state->net->closeRecv(STATE_RING(state, net.recvComm))); + NCCLCHECK(state->net->closeListen(STATE_LISTEN(state, net.comm))); + } else { + NCCLCHECK(ncclSocketClose(&STATE_RING(state, socket.send))); + NCCLCHECK(ncclSocketClose(&STATE_RING(state, socket.recv))); + NCCLCHECK(ncclSocketClose(&STATE_LISTEN(state, socket))); + } + // close the p2p socket + NCCLCHECK(ncclSocketClose(&STATE_LISTEN(state, peerSocket))); - NCCLCHECK(ncclSocketClose(&state->listenSock)); - NCCLCHECK(ncclSocketClose(&state->ringSendSocket)); - NCCLCHECK(ncclSocketClose(&state->ringRecvSocket)); - - free(state->peerCommAddresses); + // proxy things are free'd elsewhere + free(state->peerP2pAddresses); free(state); - return ncclSuccess; } ncclResult_t bootstrapAbort(void* commState) { + if (commState == NULL) + return ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; - if (commState == NULL) return ncclSuccess; - NCCLCHECK(ncclSocketClose(&state->listenSock)); - NCCLCHECK(ncclSocketClose(&state->ringSendSocket)); - NCCLCHECK(ncclSocketClose(&state->ringRecvSocket)); - free(state->peerCommAddresses); + // when aborting we need to close the proxy here (maybe?) free(state->peerProxyAddresses); free(state->peerProxyAddressesUDS); - free(state); + NCCLCHECK(bootstrapClose(commState)); return ncclSuccess; } diff --git a/src/collectives.cc b/src/collectives.cc index e21807e..be9468d 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -59,6 +59,7 @@ const char* ncclAlgoToString(int algo) { case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN"; case NCCL_ALGO_NVLS: return "NVLS"; case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE"; + case NCCL_ALGO_PAT: return "PAT"; default: return "Unknown"; } } diff --git a/src/debug.cc b/src/debug.cc index dde8e8f..d21ea3d 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -19,7 +19,7 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV +static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV FILE *ncclDebugFile = stdout; static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; static std::chrono::steady_clock::time_point ncclEpoch; @@ -122,7 +122,7 @@ static void ncclDebugInit() { int c = 0; char debugFn[PATH_MAX+1] = ""; char *dfn = debugFn; - while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) { + while (ncclDebugFileEnv[c] != '\0' && (dfn - debugFn) < PATH_MAX) { if (ncclDebugFileEnv[c++] != '%') { *dfn++ = ncclDebugFileEnv[c-1]; continue; @@ -132,16 +132,24 @@ static void ncclDebugInit() { *dfn++ = '%'; break; case 'h': // %h = hostname - dfn += snprintf(dfn, PATH_MAX, "%s", hostname); + dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%s", hostname); break; case 'p': // %p = 
pid - dfn += snprintf(dfn, PATH_MAX, "%d", pid); + dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%d", pid); break; default: // Echo everything we don't understand *dfn++ = '%'; - *dfn++ = ncclDebugFileEnv[c-1]; + if ((dfn - debugFn) < PATH_MAX) { + *dfn++ = ncclDebugFileEnv[c-1]; + } break; } + if ((dfn - debugFn) > PATH_MAX) { + // snprintf wanted to overfill the buffer: set dfn to the end + // of the buffer (for null char) and it will naturally exit + // the loop. + dfn = debugFn + PATH_MAX; + } } *dfn = '\0'; if (debugFn[0] != '\0') { @@ -181,9 +189,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file tid = syscall(SYS_gettid); } - int cudaDev; + int cudaDev = 0; if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) { - cudaGetDevice(&cudaDev); + (void)cudaGetDevice(&cudaDev); } char buffer[1024]; @@ -207,11 +215,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); - // vsnprintf may return len > sizeof(buffer) in the case of a truncated output. + // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output. // Rewind len so that we can replace the final \0 by \n - if (len > sizeof(buffer)) len = sizeof(buffer)-1; - buffer[len++] = '\n'; - if (len) fwrite(buffer, 1, len, ncclDebugFile); + if (len >= sizeof(buffer)) len = sizeof(buffer)-1; + if (len) { + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); + } } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 8fe2248..fb56e48 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -23,8 +23,11 @@ namespace { T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); + (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { /////////////// begin AllGather steps /////////////// @@ -46,7 +49,7 @@ namespace { rankDest = ringRanks[nranks-j]; offset = dataOffset + rankDest * count; - prims.directRecvCopySend(offset, nelem); + prims.directRecvCopyDirectSend(offset, nelem); } // Make final copy from buffer to dest. @@ -54,7 +57,7 @@ namespace { offset = dataOffset + rankDest * count; // Final wait/copy. 
- prims.directRecv(offset, nelem); + prims.directRecv(offset, offset, nelem); } } } @@ -81,6 +84,31 @@ struct RunWorkColl +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + using Proto = ProtoSimple<1, 1>; + const int nranks = ncclShmem.comm.nRanks; + const int rank = ncclShmem.comm.rank; + size_t count, channelOffset, channelCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); + + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + Primitives, 0, Proto, 0> prims + (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg); + + PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int last = 0; + while (!last) { + int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; + size_t inpIx, outIx; + patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); + prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend); + } + } +}; + template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { @@ -165,7 +193,7 @@ struct RunWorkColl __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, - int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes + int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag ) { static_assert(SlicePerChunk==1, "require: SlicePerChunk==1"); static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1"); @@ -203,19 +231,22 @@ struct RunWorkColl (tid, tn, 0, nullptr, false, /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* { - return (char*)srcPtrs[src] + railAllOffset; + return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset; }, /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* { return d < outIsDst ? outbuf + userOneBeg + : work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? 
(char*)dstPtrs[d-outIsDst] + userOneBeg : (char*)dstPtrs[d-outIsDst] + railAllOffset; }, delta); + } railAllOffset += delta; node += 1; } @@ -281,15 +312,15 @@ struct RunWorkColl deposit output + send to bcast - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr, - /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff, + /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat); + prims.template process(scat, work->direct, 0); } } return; @@ -299,15 +330,15 @@ struct RunWorkColl deposit output - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr, - /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff, + /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat); + prims.template process(scat, 0, work->direct); } return; } diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 293138f..36b8d32 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -23,8 +23,11 @@ namespace { int nelem; int chunk; + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
+ // coverity[callee_ptr_arith:FALSE] Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); + (tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t remCount = channelCount - elemOffset; @@ -41,7 +44,7 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.send(offset, nelem); + prims.directSend(offset, offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j = 2; j < nranks; ++j) { @@ -49,7 +52,7 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.recvReduceSend(offset, nelem); + prims.directRecvReduceDirectSend(offset, offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final @@ -58,7 +61,7 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true); + prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*postOp=*/true); // k-2 steps: copy to next GPU for (int j = 1; j < nranks - 1; ++j) { @@ -66,7 +69,7 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecvCopySend(offset, nelem); + prims.directRecvCopyDirectSend(offset, nelem); } // Make final copy from buffer to dest. @@ -75,7 +78,7 @@ namespace { offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecv(offset, nelem); + prims.directRecv(offset, offset, nelem); } } @@ -90,34 +93,34 @@ namespace { int nelem; { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) - Primitives, /*Direct=*/0, Proto, 0> prims - (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg); + Primitives, /*Direct=*/1, Proto, 0> prims + (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); + prims.directRecvReduceCopy(offset, offset, nelem, /*postOp=*/true); } } else if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.send(offset, nelem); + prims.directSend(offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.recvReduceSend(offset, nelem); + prims.directRecvReduceDirectSend(offset, offset, nelem); } } } { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims - (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); + (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 
0, 0, 0, work); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -129,14 +132,14 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, nelem); + prims.directRecv(offset, offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvCopySend(offset, nelem); + prims.directRecvCopyDirectSend(offset, nelem); } } } @@ -164,11 +167,11 @@ namespace { if (tree->up == -1) { // Reduce and broadcast. Max number of recv is 2, max number of send is 2 Primitives, /*Direct=*/1, Proto, 0> - prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); + prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true); + prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*doPost=*/true); } } else if (tid < nthreadsSplit) { @@ -180,40 +183,46 @@ namespace { * into DirectRecv and DirectSend capabilities, this ctor would have both=0, * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ + // Coverity reports that the callee treats &tree->up as an array. However, due to the use of + // FanAsymmetric, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> - prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth); + prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.send(offset, nelem); + prims.directSend(offset, offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.recvReduceSend(offset, nelem); + prims.directRecvReduceDirectSend(offset, offset, nelem); } } } else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) + // Coverity reports that the callee treats &tree->up as an array. However, due to the use of + // FanAsymmetric<1, n>, only the first element is ever accessed, so it's fine. 
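// --- Editor's sketch (illustration only, not part of the patch) ---
// The ring all-reduce loop above issues, per chunk group: one send, (k-2) recvReduceSend, one
// recvReduceCopySend, (k-2) recvCopySend and one final recv (now in their direct* variants),
// i.e. 2*(nranks-1) transfers in each direction. Below is a hedged, host-side sketch of the
// textbook chunk schedule this corresponds to; it uses generic chunk indexing, not NCCL's exact
// ring->userRanks mapping.
#include <cstdio>
int main() {
  const int k = 4;  // number of ranks in the ring
  for (int r = 0; r < k; r++) {
    // reduce-scatter half: after k-1 steps rank r holds the fully reduced chunk (r+1)%k
    for (int s = 0; s < k - 1; s++)
      printf("RS step %d: rank %d sends chunk %d, receives+reduces chunk %d\n",
             s, r, (r - s + k) % k, (r - s - 1 + k) % k);
    // all-gather half: the reduced chunks then travel once more around the ring
    for (int s = 0; s < k - 1; s++)
      printf("AG step %d: rank %d sends chunk %d, receives chunk %d\n",
             s, r, (r + 1 - s + k) % k, (r - s + k) % k);
  }
  return 0;
}
// --- end sketch ---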
+ // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff, - work->redOpArg, 1*Proto::MaxGroupWidth); + work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, nelem); + prims.directRecv(offset, offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvCopySend(offset, nelem); + prims.directRecvCopyDirectSend(offset, nelem); } } } @@ -264,9 +273,9 @@ struct RunWorkColl= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter - Primitives, /*Direct=*/1, Proto, 0> + Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff, - work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work); + work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); @@ -276,12 +285,15 @@ struct RunWorkCollheadRank, direct->shift); } } + // Coverity complains about a possible overrun inside the destructor of "prims", but that's actually + // a false positive. + // coverity[overrun-call:FALSE] } else if (tid >= tidStartReduce && direct->out != -1) { if (hasDn) { // Reduce, send to network - Primitives, /*Direct=*/1, Proto, 0> + Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff, - work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work); + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -323,6 +335,9 @@ struct RunWorkColl= tidStartBcast && tid < tidStartScatter && direct->out != -1) { if (hasDn) { // Recv from network, broadcast + // Coverity complains about a possible overrun inside the class below, but that's actually + // a false positive. + // coverity[identity_transfer:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff, work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); @@ -382,7 +397,7 @@ struct RunWorkCollnHeads*chunkSize); - int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); + int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T)); if (tid < tidEndScatter) { // Scatter @@ -456,6 +471,9 @@ struct RunWorkColl; + // Coverity complains about a possible overrun inside the class below, but that's actually + // a false positive. + // coverity[identity_transfer:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); @@ -467,6 +485,9 @@ struct RunWorkColl; + // Coverity complains about a possible overrun inside the class below, but that's actually + // a false positive. 
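// --- Editor's sketch (illustration only, not part of the patch) ---
// lastChunkSize above is computed as alignUp(divUp(remCount, nHeads), align): split the remaining
// elements evenly across the NVLS heads, then round each head's share up to an alignment boundary.
// A hedged, self-contained illustration of that arithmetic; the helper definitions and constants
// below are for the example only, not NCCL's implementations.
#include <cstdio>
#include <cstdint>
static uint64_t divUp(uint64_t x, uint64_t y) { return (x + y - 1) / y; }
static uint64_t alignUp(uint64_t x, uint64_t a) { return divUp(x, a) * a; }
int main() {
  uint64_t remCount = 10000, nHeads = 3, alignElts = 16384 / sizeof(float); // 4096 elements
  printf("per-head share %llu -> aligned %llu\n",
         (unsigned long long)divUp(remCount, nHeads),
         (unsigned long long)alignUp(divUp(remCount, nHeads), alignElts));
  // divUp(10000, 3) = 3334, alignUp(3334, 4096) = 4096
  return 0;
}
// --- end sketch ---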
+ // coverity[identity_transfer:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); @@ -479,6 +500,9 @@ struct RunWorkCollheadRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + // Coverity complains about a possible overrun inside the class below, but that's actually + // a false positive. + // coverity[identity_transfer:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); @@ -564,6 +588,9 @@ struct RunWorkColl; + // Coverity reports that the callee treats &treeUp as an array. However, due to the use of + // FanAsymmetric<3, 1>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); @@ -579,6 +606,9 @@ struct RunWorkCollheadRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + // Coverity reports that the callee treats &treeUp as an array. However, due to the use of + // FanAsymmetric<1, 3>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); @@ -639,21 +669,21 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, - work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); - prims.send(offset, nelem); + prims.directSend(offset, offset, nelem); } } } else { Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, - work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); - prims.recvReduceSend(offset, nelem); + prims.directRecvReduceDirectSend(offset, offset, nelem); } } } @@ -668,40 +698,49 @@ struct RunWorkColl, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, - work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); - prims.recv(offset, nelem, /*postOp*/true); + prims.directRecv(offset, offset, nelem, /*postOp*/true); } } } else { + // Coverity reports that the callee treats &send as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
+ // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, - work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); - prims.recvCopyDirectSend(offset, nelem, /*postOp*/true); + prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true); } } } else { + // Coverity reports that the callee treats &send as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, - work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work); if (send == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); - prims.directRecv(offset, nelem); + prims.directRecv(offset, offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); - prims.directRecvCopySend(offset, nelem); + prims.directRecvCopyDirectSend(offset, nelem); } } } diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 7026adc..851b01d 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -24,8 +24,11 @@ namespace { T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; - Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
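// --- Editor's sketch (illustration only, not part of the patch) ---
// The broadcast.h hunk that follows picks one primitive per position on the ring: the root sends
// (or copy-sends when out of place), the rank whose successor is the root only receives, and every
// other rank receives, copies to its output, and forwards. Hedged host-side listing of that
// role selection, generic names only:
#include <cstdio>
int main() {
  const int nranks = 4, root = 0;
  for (int rank = 0; rank < nranks; rank++) {
    int nextRank = (rank + 1) % nranks;
    const char* op = (rank == root)     ? "send / copySend"
                   : (nextRank == root) ? "recv (end of ring)"
                                        : "recvCopySend (forward)";
    printf("rank %d: %s\n", rank, op);
  }
  return 0;
}
// --- end sketch ---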
+ // coverity[callee_ptr_arith:FALSE] + Primitives, 1, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -33,14 +36,14 @@ namespace { if (rank == root) { if (inputBuf == outputBuf) { - prims.send(offset, nelem); + prims.directSend(offset, offset, nelem); } else { - prims.copySend(offset, offset, nelem); + prims.directCopySend(offset, offset, nelem); } } else if (nextRank == root) { - prims.recv(offset, nelem); + prims.directRecv(offset, offset, nelem); } else { - prims.recvCopySend(offset, nelem); + prims.directRecvCopyDirectSend(offset, nelem); } } } diff --git a/src/device/common.h b/src/device/common.h index 5fa7be9..967421b 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -97,7 +97,7 @@ __device__ inline void barrier_sync_aligned(int name, int nThreads) { __device__ inline bool barrier_red_or(bool vote, int name) { int ans; - asm("{ .reg .pred p;" + asm volatile("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred p, %2, p; " " selp.s32 %0, 1, 0, p; }" @@ -106,7 +106,7 @@ __device__ inline bool barrier_red_or(bool vote, int name) { } __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) { int ans; - asm("{ .reg .pred p;" + asm volatile("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred p, %2, %3, p; " " selp.s32 %0, 1, 0, p; }" @@ -115,7 +115,7 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) { } __device__ inline bool barrier_red_or_aligned(bool vote, int name) { int ans; - asm("{ .reg .pred p;" + asm volatile("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred.aligned p, %2, p; " " selp.s32 %0, 1, 0, p; }" @@ -137,9 +137,9 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by int offset = 16*tid; if (offset < bytes) { uint64_t a=0, b=0; - asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); + asm volatile("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset) : "memory"); uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst); - asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b)); + asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b) : "memory"); } } @@ -300,6 +300,9 @@ struct RunWorkBatch { if (work->nWarps != workPrev->nWarps) __syncthreads(); } int subtn = work->nWarps*WARP_SIZE; + // Coverity reports a possible thread divergence due to not all threads participating in the collective. + // However, the code ensures that the participation is on a per-warp basis. + // coverity[device_thread_diverged:FALSE] if (tid < subtn) RunWorkColl().run(tid, subtn, work); } } @@ -348,6 +351,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a default: { int subtid = tid - 2*WARP_SIZE; int subtn = tn - 2*WARP_SIZE; + // Coverity reports a possible thread divergence due to not all threads participating in the collective. + // However, the code ensures that the participation is on a per-warp basis. 
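// --- Editor's sketch (illustration only, not part of the patch) ---
// Many hunks in this patch turn plain `asm(...)` into `asm volatile(... : "memory")`. `volatile`
// keeps the compiler from removing, duplicating, or hoisting the statement, and the "memory"
// clobber keeps it from reordering surrounding loads/stores across it, which matters when the
// access is part of a polling/flag protocol. Hedged device-code fragment (compile with nvcc,
// shown without a launching kernel); `peekFlag` is a hypothetical helper.
#include <cstdint>
__device__ uint64_t peekFlag(const uint64_t* ptr) {
  uint64_t v;
  asm volatile("ld.volatile.global.u64 %0, [%1];"
               : "=l"(v)
               : "l"(ptr)
               : "memory");  // without volatile + "memory", the load could be cached or moved
  return v;
}
// --- end sketch ---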
+ // coverity[device_thread_diverged:FALSE] loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x); } break; } diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h index e82c947..f932f51 100644 --- a/src/device/common_kernel.h +++ b/src/device/common_kernel.h @@ -69,6 +69,8 @@ __device__ __forceinline__ void reduceCopyPacks( minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind; #pragma unroll for (int d=0; d < MinDsts; d++) + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; // We dictate loop termination condition according to whether partial hunks @@ -93,13 +95,17 @@ __device__ __forceinline__ void reduceCopyPacks( #pragma unroll (MinSrcs-1 + !(MinSrcs-1)) for (int s=1; s < MinSrcs; s++) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_begin] BytePack tmp[Unroll]; + // coverity[dead_error_line] RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. - acc[u] = applyLoadMultimem(redFn, minSrcs[s]); + // coverity[dead_error_line] + tmp[u] = applyLoadMultimem(redFn, minSrcs[s]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). tmp[u] = ld_volatile_global(minSrcs[s]); @@ -108,6 +114,7 @@ __device__ __forceinline__ void reduceCopyPacks( } #pragma unroll Unroll for (int u=0; u < Unroll; u++) { + // coverity[dead_error_line] if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]); acc[u] = applyReduce(redFn, acc[u], tmp[u]); } @@ -116,6 +123,8 @@ __device__ __forceinline__ void reduceCopyPacks( for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) { uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind; BytePack tmp[Unroll]; + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { @@ -125,6 +134,8 @@ __device__ __forceinline__ void reduceCopyPacks( } #pragma unroll Unroll for (int u=0; u < Unroll; u++) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]); acc[u] = applyReduce(redFn, acc[u], tmp[u]); } @@ -139,7 +150,10 @@ __device__ __forceinline__ void reduceCopyPacks( #pragma unroll (MinDsts + !MinDsts) for (int d=0; d < MinDsts; d++) { #pragma unroll Unroll + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_begin] for (int u=0; u < Unroll; u++) { + // coverity[dead_error_condition] if (d < MultimemDsts) { multimem_st_global(minDsts[d], acc[u]); } else { @@ -161,6 +175,8 @@ __device__ __forceinline__ void reduceCopyPacks( #pragma unroll for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk; #pragma unroll + // Yes, for some template arguments this code will be unreachable. That's fine. 
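// --- Editor's sketch (illustration only, not part of the patch) ---
// The common_kernel.h hunk above makes the multimem path load into tmp[u] instead of acc[u]: for
// every source after the first, the value must be read into a temporary and then reduced into the
// accumulator, otherwise the partial result collected so far would be overwritten. Hedged scalar
// analogue of that reduce-copy loop (generic names, host-side):
#include <cstdio>
int main() {
  const float src0[4] = {1, 2, 3, 4}, src1[4] = {10, 20, 30, 40};
  float acc[4];
  for (int u = 0; u < 4; u++) acc[u] = src0[u];  // the first source initializes the accumulator
  for (int u = 0; u < 4; u++) {
    float tmp = src1[u];                         // load each secondary source into a temporary...
    acc[u] = acc[u] + tmp;                       // ...then reduce into acc, never overwrite it
  }
  printf("acc[0..3] = %g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]);
  return 0;
}
// --- end sketch ---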
+ // coverity[dead_error_line] for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk; threadBytesBehind += nWarps*BytePerHunk; threadBytesAhead -= nWarps*BytePerHunk; diff --git a/src/device/generate.py b/src/device/generate.py index d0feee1..a0d2259 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -7,7 +7,7 @@ all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","Send all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"] all_protos = ["LL","LL128","SIMPLE"] -all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"] +all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"] ################################################################################ # The first command line argument is the path to the directory to generate and @@ -74,11 +74,11 @@ else: ################################################################################ algos_of_coll = { - "AllGather": ["RING","COLLNET_DIRECT","NVLS"], - "AllReduce": all_algos, + "AllGather": ["RING","COLLNET_DIRECT","NVLS","PAT"], + "AllReduce": ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"], "Broadcast": ["RING"], "Reduce": ["RING"], - "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"], + "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS","PAT"], "SendRecv": [None] } @@ -253,6 +253,9 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f: cudart, _ = required_cuda(*kfn) sym = paste("_", "ncclDevKernel", *kfn) if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) + # __global__ below gets removed by the host compiler, which results in + # Coverity diagnosing a specifiers inconsistency. + out("// coverity[declaration]\n") out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym) if cudart != 0: out("#endif\n") out("\n") diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index b213fbe..e760998 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -19,10 +19,10 @@ inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) { #if __CUDA_ARCH__ >= 700 asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];" - : "=l"(v) : "l"(ptr)); + : "=l"(v) : "l"(ptr) : "memory"); #else asm volatile("ld.volatile.global.u64 {%0}, [%1];" - : "=l"(v) : "l"(ptr)); + : "=l"(v) : "l"(ptr) : "memory"); #endif } @@ -226,6 +226,8 @@ inline __device__ void ncclNetDeviceUnpackInner( int PPW = ppw(nbytes, nw); + // Coverity reports a potential overflow but in reality PPW is tiny so there's no need to store it in an uint64_t. 
+ // coverity[overflow_before_widen] for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) { uint64_t iter_meta_cnt = meta_cnt - meta_s; diff --git a/src/device/op128.h b/src/device/op128.h index b2f8227..b2e519d 100644 --- a/src/device/op128.h +++ b/src/device/op128.h @@ -11,28 +11,28 @@ inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" - : "=l"(v0), "=l"(v1) : "l"(ptr)); + : "=l"(v0), "=l"(v1) : "l"(ptr) : "memory"); } inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" - :: "l"(v0), "l"(v1), "l"(ptr)); + :: "l"(v0), "l"(v1), "l"(ptr) : "memory"); } inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { uint64_t* shmemAsmPtr; - asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); + asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr) : "memory"); return shmemAsmPtr; } inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" - : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); + : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr) : "memory"); } inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" - :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); + :: "l"(v0), "l"(v1), "l"(shmemAsmPtr) : "memory"); } template @@ -48,20 +48,20 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1 // Produce 4 bytes of sub-register type by reading 2 4-byte // aligned values and shifting. uint32_t lo, hi; - asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0)); - asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1)); + asm volatile("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0) : "memory"); + asm volatile("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1) : "memory"); tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast(ptr))%4)); } } else if(sizeof(T) == 4) { #pragma unroll for(int e=0; e < 4; e++) - asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e)); + asm volatile("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e) : "memory"); } else /*sizeof(T)==8*/ { #pragma unroll for(int e=0; e < 2; e++) - asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e)); + asm volatile("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e) : "memory"); } v0 = tmp8[0]; v1 = tmp8[1]; @@ -146,6 +146,9 @@ struct BytePackOf> { template __device__ __forceinline__ typename BytePackOf::Pack toPack(T value) { union { typename BytePackOf::Pack p; T v; }; + // Coverity recommends the use of std::move here but, given that T is a POD + // scalar, a plain copy will be just as efficient. + // coverity[copy_assignment_call] v = value; return p; } @@ -183,7 +186,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad template<> \ __device__ __forceinline__ BytePack ld_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ - asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \ + asm volatile("ld." #space "." 
#data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \ BytePack ans; \ ans.native = tmp; \ return ans; \ @@ -191,7 +194,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad template<> \ __device__ __forceinline__ BytePack ld_volatile_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ - asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \ + asm volatile("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \ BytePack ans; \ ans.native = tmp; \ return ans; \ @@ -212,7 +215,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad template<> \ __device__ __forceinline__ BytePack ld_relaxed_gpu_global(uintptr_t addr) { \ data_cxx_ty tmp; \ - asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \ + asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); \ BytePack ans; \ ans.native = tmp; \ return ans; \ @@ -242,18 +245,18 @@ DEFINE_ld_st__size(8, uint64_t, b64, l) template<> \ __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ - asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \ + asm volatile("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \ return ans; \ } \ template<> \ __device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ - asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \ + asm volatile("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \ return ans; \ } \ template<> \ __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \ - asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \ + asm volatile("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \ } DEFINE_ld_st_16__space(global, uintptr_t, l) DEFINE_ld_st_16__space(shared, uint32_t, r) @@ -262,7 +265,7 @@ DEFINE_ld_st_16__space(shared, uint32_t, r) template<> __device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) { BytePack<16> ans; - asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr)); + asm volatile("ld." 
PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory"); return ans; } template<> @@ -277,33 +280,33 @@ __device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePa __device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) { uint64_t ans; - asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); return ans; } __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 - asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); #else - asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); #endif return ans; } __device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 - asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); #else - asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); #endif return ans; } __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 - asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); #else - asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); #endif return ans; } diff --git a/src/device/primitives.h b/src/device/primitives.h index 01cad70..1913640 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -115,19 +115,25 @@ struct PrimitivesWithoutDirect { __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } - __device__ void directRecv(intptr_t outIx, int eltN) { + __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->copySend(inpIx, outIx, eltN, postOp); } - __device__ void directRecvCopySend(intptr_t outIx, int eltN) { + __device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->recvCopySend(outIx, eltN, /*postOp=*/false); } - __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + __device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); } + __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { + static_cast(this)->recvReduceSend(inpIx, eltN); + } + __device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t 
inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { + static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); + } }; #include "prims_simple.h" diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 4a6f9e2..1a1307f 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -101,7 +101,7 @@ class Primitives: uint32_t data1, flag1, data2, flag2; int spins = 0; do { - asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory"); if (checkAbort(spins, 0)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); @@ -112,9 +112,11 @@ class Primitives: __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) { #pragma unroll for (int i=BeginIx; i < MaxRecv; i++) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] if (i < fan.nrecv()) { union ncclLLFifoLine* src = recvPtr(i) + offset; - asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); } } } @@ -123,7 +125,7 @@ class Primitives: uint32_t flag = recvFlag(i); int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { - asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); if (checkAbort(spins, 0)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); @@ -131,7 +133,7 @@ class Primitives: } __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory"); } static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T); @@ -145,13 +147,13 @@ class Primitives: uint64_t u8; }; if(sizeof(U) == 1) - asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src)); + asm volatile("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src) : "memory"); else if(sizeof(U) == 2) - asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src)); + asm volatile("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src) : "memory"); else if(sizeof(U) == 4) - asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src)); + asm volatile("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src) : "memory"); else - asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src)); + asm volatile("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src) : "memory"); return elt; } @@ -165,13 +167,13 @@ class Primitives: }; elt = val; if(sizeof(U) == 1) - asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4)); + asm volatile("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4) : "memory"); else if(sizeof(U) == 2) 
- asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2)); + asm volatile("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2) : "memory"); else if(sizeof(U) == 4) - asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4)); + asm volatile("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4) : "memory"); else - asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8)); + asm volatile("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8) : "memory"); } struct DataLoader { @@ -194,6 +196,8 @@ class Primitives: else { #pragma unroll for(int i=0; i < EltPerLine; i++) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] if(i==0 || i < eltN) elt[i] = load(src + i); } @@ -218,6 +222,8 @@ class Primitives: u8 = val; #pragma unroll for(int i=0; i < EltPerLine; i++) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] if (i==0 || i < eltN) //store(dst+i, elt[i]); dst[i] = elt[i]; @@ -261,6 +267,8 @@ class Primitives: if (RECV) { data = !SRC ? peerData : applyReduce(redOp, peerData, data); #pragma unroll MaxRecv + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) { peerData = readLLFinish(offset, line, i); data = applyReduce(redOp, peerData, data); @@ -271,6 +279,8 @@ class Primitives: // Send : inter-node, then intra-node, then local if (SEND) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] for (int i=1; i < MaxSend && i < fan.nsend(); i++) storeLL(sendPtr(i)+offset, data, sendFlag(i)); storeLL(sendPtr(0)+offset, data, sendFlag(0)); @@ -288,6 +298,8 @@ class Primitives: postRecv(); } if (SEND) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] for (int i=1; i < MaxSend && i < fan.nsend(); i++) incSend(i, offset); incSend(0, offset); @@ -324,8 +336,8 @@ class Primitives: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, - bool userBufReg=false, int stepSize_=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr, + bool ipcReg = false, bool netReg = false, int stepSize_ = 0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), @@ -334,16 +346,23 @@ class Primitives: // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1 + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) { loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } + // coverity[dead_error_line] while (nsend < MaxSend && sendPeers[nsend] >= 0) { loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); + // Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually + // happen given the two "while" loops just above. 
+ // coverity[var_deref_model:FALSE] loadRecvSync(); + // coverity[var_deref_model:FALSE] loadSendSync(); setDataPtrs(inputBuf, outputBuf); } diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 9c71695..2cb10cc 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -234,6 +234,8 @@ class Primitives: } } + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] for (int i=1; i: /************************ Send **************************/ if (SEND) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] for (int i=1; ifan = Fan(nrecv, nsend); + // Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually + // happen given the two "while" loops just above. + // coverity[var_deref_model:FALSE] loadRecvSync(); + // coverity[var_deref_model:FALSE] loadSendSync(); setDataPtrs(inputBuf, outputBuf); } diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index c026570..945878b 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -7,6 +7,12 @@ #include "network/unpack/unpack.h" #include +enum primsMode { + primsModeDefault = 0, + primsModePatRs = 1, + primsModePatAg = 2 +}; + template class Primitives< @@ -14,21 +20,25 @@ class Primitives< > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; - static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use + static constexpr int RoleInput = 0x01, + RoleOutput = 0x02, + RoleWaitRecv = 0x04, RoleWaitSend = 0x08, RolePostSend = 0x10, RolePostRecv = 0x20, Aborted = 0x40, - UserBufferMode = 0x80, + NetRegMode = 0x80, ConnFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, - // 0x800 is free to use + PatMode = 0x800, NvlsMinPolling = 0x1000, NetDeviceUnpack = 0x2000, AnyNetDeviceUnpack = 0x4000, NvlsDirectRead = 0x8000, - NvlsDirectWrite = 0x10000; + NvlsDirectWrite = 0x10000, + IpcWrite = 0x20000, + IpcRead = 0x40000; const int tid, tidInBlock; const int nthreads; int nworkers; @@ -38,13 +48,15 @@ class Primitives< int flags; int group; uint64_t step; + struct ncclConnInfo* conn = NULL; struct ncclConnFifo* connFifo = NULL; T* connEltsFifo; - T* directBuff; + T* directBuff = NULL; uint64_t *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; + uint64_t accSize; // Accumulated size. Used by PAT operations // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -95,7 +107,7 @@ class Primitives< #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { uint64_t ans; - asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + asm volatile("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); return ans; } #endif @@ -107,8 +119,10 @@ class Primitives< template __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? 
(flags & RoleWaitSend) : Send; - const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input + const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead)); // no wait when directly reading from remote input const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || ((flags & (Send*RoleWaitSend)) && !noSendWait)) { int spins = 0; @@ -125,28 +139,30 @@ class Primitives< void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); - if (flags & UserBufferMode) { + if (flags & NetRegMode) { // Do nothing } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) { ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T); } else if (isSendNotRecv && DirectSend) { - if (flags & (DirectWrite | NvlsDirectWrite)) { + if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) { ptrs[index] = directBuff + dstIx + offset; - } else if (flags & DirectRead) { // empty send + } else if ((flags & DirectRead) || (flags & IpcRead)) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else if (!isSendNotRecv && DirectRecv) { - if (flags & (DirectRead | NvlsDirectRead)) { + if (flags & (DirectRead | NvlsDirectRead | IpcRead)) { ptrs[index] = directBuff + srcIx + offset; - } else if (flags & DirectWrite) { + } else if ((flags & DirectWrite) || (flags & IpcWrite)) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } if (flags & NetDeviceUnpack) { @@ -182,7 +198,7 @@ class Primitives< int slice = 0; int offset = 0; - if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) { + if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) { // Worker-only loop for non-empty slices. Non-workers and empty slices are // processed in the loop following this if block. The benefit of splitting // the loop like this is we pull two branches out of the critical path. @@ -234,7 +250,7 @@ class Primitives< if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0] /* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch", * so we need to check whether MultimemSrcs and MultimemDsts are 0. */ - && MultimemSrcs == 0 && MultimemDsts == 0) { + && MultimemSrcs == 0 && MultimemDsts == 0 && !Src) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { reduceCopy @@ -250,7 +266,7 @@ class Primitives< Recv, ncclShmem.groups[group].srcs, Dst, ncclShmem.groups[group].dsts, workSize); - } else { + } else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) { constexpr int PreOpSrcs = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; reduceCopy(0 < sliceSize); offset += sliceSize; slice += 1; + // Yes, for some template arguments this code will be unreachable. That's fine. 
+ // coverity[dead_error_line] } while (slice < SlicePerChunk && offset < nelem); } @@ -310,12 +328,13 @@ public: } template - __device__ __forceinline__ void process(Fn &&fn) { + __device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) { #pragma unroll 1 for (int slice=0; slice < SlicePerChunk; slice++) { if (tid < nworkers) { + int nsend, nrecv; if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { - bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; + const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); @@ -326,19 +345,53 @@ public: if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) { int offset = loadInt(&connFifo[step%NCCL_STEPS].offset); ptrs[index] = connEltsFifo + offset/sizeof(T); + } else if (Direct && fn.work->regUsed) { + if (isSendNotRecv) { + if (flags & (DirectWrite | IpcWrite)) { + ptrs[index] = directBuff; + } else if (flags & (DirectRead | IpcRead)) { // empty send + ptrs[index] = nullptr; + } else { + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + } + } else { + if (flags & (DirectRead | IpcRead)) { + ptrs[index] = directBuff; + } else if (flags & (DirectWrite | IpcWrite)) { + if (Send) + ptrs[index] = directBuff; // send to next from my output buffer + else + ptrs[index] = nullptr; + } else { + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + } + } } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } subBarrier(); - fn.template operator() - (tid, nworkers, slice, stepSize*StepPerSlice, - fan.nrecv(), ncclShmem.groups[group].srcs, - fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes); + if (Recv == 0 || ncclShmem.groups[group].srcs[0] == nullptr) { + nrecv = 0; + } else { + nrecv = fan.nrecv(); + } + + if (Send == 0 || ncclShmem.groups[group].dsts[0] == nullptr) { + nsend = 0; + } else { + nsend = fan.nsend(); + } + fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend > + (tid, nworkers, slice, stepSize * StepPerSlice, + nrecv, ncclShmem.groups[group].srcs, + nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag); } barrier(); int32_t dstSize = 0; if (flags & Send*RolePostSend) { + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_begin] dstSize = ncclShmem.groups[group].dstSizes[index]; ncclShmem.groups[group].dstSizes[index] = 0; if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T); @@ -421,99 +474,97 @@ private: } } - __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { - if (flags & (RoleWaitRecv|RolePostRecv)) { - auto *conn = &peer->recv[connIndex]; - if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { - // handle must be a device ptr - netDeviceHandle = conn->netDeviceHandle.handle; - // Cache the handle - ncclNetDeviceUnpackSetup(netDeviceHandle, group, index); - flags |= NetDeviceUnpack; - } - step = conn->step; - step = roundUp(step, SlicePerChunk*StepPerSlice); - if (flags & RolePostRecv) { - connStepPtr = conn->head; - *connStepPtr = step; // Return credits in case we rounded up. 
- } - if (flags & RoleWaitRecv) { - ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() - flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; - connStepPtr = conn->tail; - connStepCache = loadStepValue(connStepPtr); - connStepSize = conn->stepSize/sizeof(T); - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; - if (conn->connFifo != nullptr) { - flags |= ConnFifoEnabled; - connFifo = conn->connFifo; - } else if (Direct) { - // User buffers have been registered - if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { - if (connIndex == 1 && P2p == 0) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : - (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; - } - } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { - if (connIndex == 1 && P2p == 0) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - // direct read not allowed in non-register case - // otherwise, in one-to-multi send, we could mix empty send and intermediate send - flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0; - } - } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) { - /* NVLS direct */ - flags |= NvlsDirectRead; + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) { + conn = &peer->recv[connIndex]; + if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { + // handle must be a device ptr + netDeviceHandle = conn->netDeviceHandle.handle; + // Cache the handle + ncclNetDeviceUnpackSetup(netDeviceHandle, group, index); + flags |= NetDeviceUnpack; + } + step = conn->step; + step = roundUp(step, SlicePerChunk*StepPerSlice); + if (flags & RolePostRecv) { + connStepPtr = conn->head; + *connStepPtr = step; // Return credits in case we rounded up. + } + if (flags & RoleWaitRecv) { + if ((flags & PatMode) == 0) ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() + flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; + connStepPtr = conn->tail; + connStepCache = loadStepValue(connStepPtr); + connStepSize = conn->stepSize/sizeof(T); + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + if (conn->connFifo != nullptr) { + flags |= ConnFifoEnabled; + connFifo = conn->connFifo; + } else if (Direct && regFlag) { + // User buffers have been registered + if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) { + if (P2p) { + flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead; + } else if (connIndex == 1 && direct) { + flags |= IpcRead; + } else { + flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite; } + } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { + if (P2p) { + flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead; + } else if (connIndex == 1 && direct) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + flags |= direct & NCCL_DIRECT_READ ? 
DirectRead : DirectWrite; + } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { + /* NVLS direct */ + flags |= NvlsDirectRead; } } } } - __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { - if (flags & (RoleWaitSend|RolePostSend)) { - auto *conn = &peer->send[connIndex]; - step = conn->step; - step = roundUp(step, SlicePerChunk*StepPerSlice); + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) { + conn = &peer->send[connIndex]; + step = conn->step; + step = roundUp(step, SlicePerChunk*StepPerSlice); - connFifo = conn->connFifo; - if (connFifo != nullptr) flags |= ConnFifoEnabled; + connFifo = conn->connFifo; + if (connFifo != nullptr) flags |= ConnFifoEnabled; - if (flags & RolePostSend) { - connStepPtr = conn->tail; - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; - } - if (flags & RoleWaitSend) { - ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() - flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; - connStepPtr = conn->head; - connStepCache = loadStepValue(connStepPtr); - connStepSize = conn->stepSize/sizeof(T); - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; - if (connFifo == nullptr && Direct) { - // User buffers have been registered - if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { - if (connIndex == 1 && P2p == 0) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : - (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; - } - } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { - if (connIndex == 1 && P2p == 0) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - // direct read not allowed in non-register case - // otherwise, in one-to-multi send, we could mix empty send and intermediate send - flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0; - } - } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) { - /* NVLS direct */ - flags |= NvlsDirectWrite; + if (flags & RolePostSend) { + connStepPtr = conn->tail; + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + } + if (flags & RoleWaitSend) { + if ((flags & PatMode) == 0) ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() + flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; + connStepPtr = conn->head; + connStepCache = loadStepValue(connStepPtr); + connStepSize = conn->stepSize/sizeof(T); + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + if (connFifo == nullptr && Direct && regFlag) { + // User buffers have been registered + if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) { + if (P2p) { + flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead; + } else if (connIndex == 1 && direct) { + flags |= IpcRead; + } else { + flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite; } + } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { + if (P2p) { + flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead; + } else if (connIndex == 1 && direct) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + flags |= direct & NCCL_DIRECT_READ ? 
DirectRead : DirectWrite; + } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { + /* NVLS direct */ + flags |= NvlsDirectWrite; } } } @@ -523,7 +574,8 @@ private: __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0 + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr, + bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { @@ -531,33 +583,71 @@ private: // For send operations, we need an extra warp to overlap the threadfence and the copy this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); - int nrecv=0, nsend=0; - while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; - while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++; - this->fan = Fan(nrecv, nsend); - - constexpr int ThreadPerSync = - MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups - MaxSend >= 8 || MaxRecv >= 8 ? 16 : - 8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp - static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers"); - - index = -1; + int peer = -1; flags = 0; - assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role. - if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; } - else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; } - else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); } - else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); } + index = -1; + if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers + int nrecv=0, nsend=0; + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_line] + while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; + // coverity[dead_error_line] + while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++; + this->fan = Fan(nrecv, nsend); - int peer = 0; - if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; - if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + constexpr int ThreadPerSync = + MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups + MaxSend >= 8 || MaxRecv >= 8 ? 16 : + 8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp + static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers"); - loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); - loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); + assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role. + // Coverity assumes that index will equal tid based on the line below, but it doesn't consider the setting + // of flags. This results in multiple false positive overruns being reported here and in all_reduce.h. 
+ // Unfortunately, we've been unsuccessful in trying to silence them with a single directive here so + // instead it's being done at the callers. + // coverity[assignment:FALSE] + if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; } + // Yes, for some template arguments this code will be unreachable. That's fine. + // coverity[dead_error_begin] + else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; } + else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); } + else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); } - if (userBufReg) flags |= UserBufferMode; + if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; + if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n + flags |= PatMode; + accSize = 0; + int nranks = ncclShmem.comm.nRanks; + int rank = ncclShmem.comm.rank; + // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bit integer. + index = tid % 32; + uint32_t delta = 1 << index; + const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv}; + int block = tid / 32; + if (block < 4 && delta < nranks) { + int role = roles[block]; + if (mode == primsModePatRs) { + if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks; + if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks; + } else if (mode == primsModePatAg) { + if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks; + if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks; + } + flags |= role; + } else if (tid == 128) { + flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation + } + } + + // Coverity thinks that index could be -1 here but that's not actually the case. + // coverity[negative_returns:FALSE] + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg); + // coverity[negative_returns:FALSE] + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg); + + if (netReg) flags |= NetRegMode; if (barrierAny(flags & NetDeviceUnpack)) { flags |= AnyNetDeviceUnpack; @@ -569,18 +659,14 @@ private: } } - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e); + // coverity[negative_returns:FALSE] + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer); } __device__ ~Primitives() { - // Ensure ncclShmem.groups[].send/recvConns are available - barrier(); // Save steps for the next operation - if (flags & (RolePostSend|RolePostRecv)) { - auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; - conns[index]->step = step; - } - if ((flags & UserBufferMode) && (flags & RoleWaitSend)) { + if (flags & (RolePostSend|RolePostRecv)) conn->step = step; + if ((flags & NetRegMode) && (flags & RoleWaitSend)) { // Make sure we wait until the proxy has sent data before we return. // We don't want the next CUDA kernel to overwrite the send buffer which // was accessed directly.
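Note on the PAT-mode constructor in the hunk above: each 32-thread block of the group takes one of the four roles, and lane i of the block handles the peer at distance 2^i from the local rank, so 32 lanes are enough for any 32-bit rank count. A minimal host-side sketch of that mapping is shown below; this is an illustration only, not NCCL code, and it shows the ReduceScatter (primsModePatRs) direction, with AllGather swapping send and receive.

    #include <cstdio>

    // Hypothetical standalone helper mirroring the primsModePatRs peer assignment above.
    static void patPeersForRank(int rank, int nranks) {
      for (int index = 0; index < 32; index++) {
        unsigned delta = 1u << index;
        if (delta >= (unsigned)nranks) break;                   // same cutoff as "delta < nranks" above
        int recvPeer = (rank - (int)delta + nranks) % nranks;   // RoleWaitRecv / RolePostRecv lanes
        int sendPeer = (rank + (int)delta) % nranks;            // RoleWaitSend / RolePostSend lanes
        printf("lane %d: delta %u, recv from %d, send to %d\n", index, delta, recvPeer, sendPeer);
      }
    }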
@@ -599,97 +685,111 @@ private: barrier(); } - __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) { + __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) { if (tid==0) { ncclShmem.groups[group].userInput = (void*)inputBuf; ncclShmem.groups[group].userOutput = (void*)outputBuf; ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } - bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); - bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite)); - bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) - bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer - int regUsed = e != nullptr ? e->coll.regUsed : 0; - if (Direct && recvProvider) { - int spins = 0; - void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; - // Wait for consumer to consume previous value before trampling it. - if (slot) { - while (*slot != nullptr && !checkAbort(spins)); - directBuff = (T*)outputBuf; - // Encode pointer by XOR'ing against some address they definitely wouldn't send - // since we want to allow them sending us nullptr while not colliding with - // the empty slot value. - *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); - } - } - if (Direct && sendAcceptor) { - int spins = 0; - void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; - void *ptr; - while (slot) { - ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; - } - - if (slot) { - directBuff = regUsed ? (T*)(e->dnOutputs[index]) : - reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); - *slot = nullptr; - } else { - /* slot is NULL, it must be regUsed == 1 */ - directBuff = (T*)e->dnOutputs[index]; - } - } - if (Direct && sendProvider) { - int spins = 0; - void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; - volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; - volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1; - // Wait for consumer to consume previous value before trampling it. - if (slot && argSlot0 && argSlot1) { - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); - // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) - // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) - directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; - // Exchange pre-scalers for use in direct pull - *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; - *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); - // Encode pointer by XOR'ing against some address they definitely wouldn't send - // since we want to allow them sending us nullptr while not colliding with - // the empty slot value. 
- *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); - } - } - if (Direct && recvAcceptor) { - int spins = 0; - void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; - volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; - volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1; - void *ptr; - while (slot) { - ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; - } - - if (slot && argSlot0 && argSlot1) { - directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) : - reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); - if (MaxSend != 0) { // reduce group rather than gather group - // Store scalers for remote inputs - uint64_t arg0, arg1; - while (true) { - arg0 = *argSlot0; - arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + if (Direct && ipcReg) { + bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite); + bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite); + bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched) + bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer + if (recvProvider) { + int spins = 0; + void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; + // Wait for consumer to consume previous value before trampling it. + if (slot) { + T* exchgPtr; + directBuff = (T*)outputBuf; + while (*slot != nullptr && !checkAbort(spins)); + if (P2p) { + exchgPtr = (T*)outputBuf; + } else { + int localPeer = ncclShmem.comm.rankToLocalRank[peer]; + exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]); } - ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); + *slot = reinterpret_cast(exchgPtr); + } + } + if (sendAcceptor) { + int spins = 0; + void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; + void* ptr; + while (slot) { + ptr = *slot; + if (ptr != nullptr || checkAbort(spins)) break; + } + + if (slot) { + directBuff = reinterpret_cast(ptr); + *slot = nullptr; + } else { + directBuff = (T*)work->dnOutputs[index]; + } + } + if (sendProvider) { + int spins = 0; + void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; + volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; + volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange + 1; + // Wait for consumer to consume previous value before trampling it. + if (slot && argSlot0 && argSlot1) { + T* exchgPtr; + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins)); + // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) + // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) + directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; + if (P2p) { + exchgPtr = MaxRecv == 0 ? 
(T*)inputBuf : (T*)outputBuf; + } else { + int localPeer = ncclShmem.comm.rankToLocalRank[peer]; + if (MaxRecv == 0) + exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]); + else + exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]); + } + + // Exchange pre-scalers for use in direct pull + *argSlot0 = (uint64_t(1) << 32) | (uint32_t)redOpArg; + *argSlot1 = (uint64_t(1) << 32) | (uint32_t)(redOpArg >> 32); + *slot = reinterpret_cast(exchgPtr); + } + } + if (recvAcceptor) { + int spins = 0; + void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; + volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; + volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange + 1; + void* ptr; + while (slot) { + ptr = *slot; + if (ptr != nullptr || checkAbort(spins)) break; + } + + if (slot && argSlot0 && argSlot1) { + directBuff = reinterpret_cast(ptr); + if (MaxSend != 0) { // reduce group rather than gather group + // Store scalers for remote inputs + uint64_t arg0, arg1; + while (true) { + arg0 = *argSlot0; + arg1 = *argSlot1; + if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + } + ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); + } + *argSlot0 = 0; *argSlot1 = 0; + *slot = nullptr; + } else { + // Coverity complains about work being possibly NULL below. However, slot + // being NULL means that the NVLS buffer is registered (regUsed == 1) + // so work can't be NULL in this code path. + // coverity[var_deref_op] + directBuff = (T*)work->dnInputs[index]; } - *argSlot0 = 0; *argSlot1 = 0; - *slot = nullptr; - } else { - directBuff = (T*)e->dnInputs[index]; } } } @@ -717,8 +817,8 @@ private: __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) { - genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false); + __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); @@ -737,8 +837,8 @@ private: __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) { - genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false); + __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false); @@ -750,6 +850,9 @@ private: __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } + __device__ __forceinline__ void directRecvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp); + } __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, 
bool postOp=false) { genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); @@ -757,14 +860,20 @@ private: __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } + __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { + genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp); + } __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + __device__ __forceinline__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } + __device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { + genericOp<1, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); + } __device__ __forceinline__ void scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { @@ -783,4 +892,126 @@ private: directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } + + __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) { + nelem = nelem < 0 ? 0 : nelem; + T* userInput = (T*)ncclShmem.groups[group].userInput; + T* userOutput = (T*)ncclShmem.groups[group].userOutput; + + if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset; + int spins = 0; + while (connStepCache < step + StepPerSlice) { + connStepCache = loadStepValue(connStepPtr); + if (checkAbort(spins)) break; + } + if (postRecv) step += StepPerSlice; + } + if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { + int spins = 0; + while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { + connStepCache = loadStepValue(connStepPtr); + if (checkAbort(spins)) break; + } + ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset; + if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { + // New data, add our own data to it. + ncclShmem.groups[group].srcs[1] = userInput + inpIx; + accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize; + if (flags & ConnFifoEnabled) + connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); + } else { + // There is already data in there, accumulate instead of writing to it. + ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; + } + if (postSend) step += StepPerSlice; + } + if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer + ncclShmem.groups[group].dsts[0] = userOutput + outIx; + if (accSize < outIx + nelem) { + // New data, add our own data to it. 
+ ncclShmem.groups[group].srcs[1] = userInput + inpIx; + accSize = outIx + nelem; + } else { + // There is already data in there, accumulate instead of writing to it. + ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; + } + } + barrier(); + int nSrcs = 2; + void** srcs = ncclShmem.groups[group].srcs; + if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source + + int workSize = ncclShmem.aborted ? 0 : nelem; + + reduceCopy + (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, + nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize); + + barrier(); + if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); + if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + } + + __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) { + nelem = nelem < 0 ? 0 : nelem; + T* userInput = (T*)ncclShmem.groups[group].userInput; + T* userOutput = (T*)ncclShmem.groups[group].userOutput; + + if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset; + int spins = 0; + while (connStepCache < step + recvStepOffset + StepPerSlice) { + connStepCache = loadStepValue(connStepPtr); + if (checkAbort(spins)) break; + } + if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { + // New data, copy to our output buffer. + ncclShmem.groups[group].dsts[1] = userOutput + outIx; + accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize; + } else { + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done + } + if (postRecv) step += StepPerSlice; + } + if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { + int spins = 0; + while (connStepCache + NCCL_STEPS < step + StepPerSlice) { + connStepCache = loadStepValue(connStepPtr); + if (checkAbort(spins)) break; + } + ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset; + if (postSend) { + if (flags & ConnFifoEnabled) + connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); + step += StepPerSlice; + } + } + if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer + ncclShmem.groups[group].srcs[0] = userInput + inpIx; + if (accSize < inpIx + nelem) { + // New data, copy to our output buffer. + ncclShmem.groups[group].dsts[1] = userOutput + outIx; + accSize = inpIx + nelem; + } else { + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done + } + } + barrier(); + int nDsts = 2; + void** dsts = ncclShmem.groups[group].dsts; + if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest + if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done. + + int workSize = ncclShmem.aborted ? 
0 : nelem; + + reduceCopy + (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, + 1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize); + + barrier(); + if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); + if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + } + }; diff --git a/src/device/reduce.h b/src/device/reduce.h index 91cdaeb..f8597a6 100644 --- a/src/device/reduce.h +++ b/src/device/reduce.h @@ -23,6 +23,9 @@ namespace { size_t offset; int nelem; + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index 9e78da9..b069c07 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -234,10 +234,10 @@ struct Apply_Reduce, /*EltPerPack=*/4> { uint32_t a = apack.native; uint32_t b = bpack.native; uint32_t ab0 = (a*b) & 0xffu; - asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u)); + asm volatile("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u)); uint32_t ab1; - asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000)); - asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u)); + asm volatile("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000)); + asm volatile("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u)); apack.native = __byte_perm(ab0, ab1, 0x6420); return apack; } @@ -260,8 +260,12 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y)) + // Coverity recommends the use of std::move here but, given that half is a scalar, + // a plain copy will be just as efficient. + // coverity[copy_constructor_call] SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y)) + // coverity[copy_constructor_call] SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y))) @@ -270,6 +274,7 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f #if __CUDA_ARCH__ >= 800 SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y)) + // coverity[copy_constructor_call] SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y)))) @@ -278,10 +283,13 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? 
fmin(x, y) : f #if defined(__CUDA_BF16_TYPES_EXIST__) #if __CUDA_ARCH__ >= 800 SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y)) + // coverity[copy_constructor_call] SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y)) + // coverity[copy_constructor_call] SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y)) SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y)) + // coverity[copy_constructor_call] SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y))) @@ -402,6 +410,9 @@ struct FuncPreMulSum { }; template<> +// Coverity recommends the users of this type to use std::move in certain cases but, +// given that half is a scalar, a plain copy will be just as efficient. +// coverity[moveable_type] struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 @@ -424,6 +435,9 @@ struct FuncPreMulSum { #if defined(__CUDA_BF16_TYPES_EXIST__) template<> + // Coverity recommends the users of this type to use std::move in certain cases but, + // given that __nv_bfloat16 is a scalar, a plain copy will be just as efficient. + // coverity[moveable_type] struct FuncPreMulSum<__nv_bfloat16> { using EltType = __nv_bfloat16; #if __CUDA_ARCH__ >= 800 @@ -584,9 +598,9 @@ struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack ans; \ - asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ - : "l"(addr)); \ + : "l"(addr) : "memory"); \ return ans; \ } \ }; @@ -597,13 +611,13 @@ struct Apply_PostOp, /*EltPerPack=*/1> { __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack ans; \ if (fn.isMinNotMax) { \ - asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ - : "l"(addr)); \ + : "l"(addr) : "memory"); \ } else { \ - asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ - : "l"(addr)); \ + : "l"(addr) : "memory"); \ } \ return ans; \ } \ @@ -615,12 +629,12 @@ struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack ans; \ - asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." 
#ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ - : "l"(addr)); \ + : "l"(addr) : "memory"); \ return ans; \ } \ }; @@ -631,19 +645,19 @@ struct Apply_PostOp, /*EltPerPack=*/1> { __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack ans; \ if (fn.isMinNotMax) { \ - asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ - : "l"(addr)); \ + : "l"(addr) : "memory"); \ } else { \ - asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ - : "l"(addr)); \ + : "l"(addr) : "memory"); \ } \ return ans; \ } \ @@ -655,9 +669,9 @@ struct Apply_PostOp, /*EltPerPack=*/1> { struct Apply_LoadMultimem, sizeof(T)> { \ __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack<2*sizeof(T)> tmp; \ - asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T)))); \ + : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ return tmp.half[(addr/sizeof(T))%2]; \ } \ }; @@ -668,13 +682,13 @@ struct Apply_PostOp, /*EltPerPack=*/1> { __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack<2*sizeof(T)> tmp; \ if (fn.isMinNotMax) { \ - asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T)))); \ + : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ } else { \ - asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T)))); \ + : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ } \ return tmp.half[(addr/sizeof(T))%2]; \ } \ diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index cf068ff..f7b3c25 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -24,6 +24,9 @@ namespace { uint32_t nelem; int rankDest; + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
+ // coverity[callee_ptr_arith:FALSE] Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); @@ -74,6 +77,32 @@ struct RunWorkColl +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + using Proto = ProtoSimple<1, 1>; + const int nranks = ncclShmem.comm.nRanks; + const int rank = ncclShmem.comm.rank; + size_t count, channelOffset, channelCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); + + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + Primitives, 0, Proto, 0> prims + (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs); + + PatRSAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int last = 0; + while (!last) { + int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; + size_t inpIx, outIx; + patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); + prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend); + } + } +}; + + template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { @@ -88,7 +117,7 @@ struct RunWorkCollregUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); @@ -143,6 +172,9 @@ struct RunWorkColl __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, - int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes + int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag ) { static_assert(SlicePerChunk==1, "require: SlicePerChunk==1"); static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1"); @@ -199,19 +231,23 @@ struct RunWorkColl (tid, tn, work->redOpArg, &work->redOpArg, false, /*nSrcs=*/1+nSrcs, [=]__device__(int s) { return s==0 ? (T*)inbuf + userOneBeg + : work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) + ? 
(T*)srcPtrs[s-1] + userOneBeg : (T*)srcPtrs[s-1] + railAllOffset; }, /*nDsts=*/1, [=]__device__(int d/*==0*/) { return (T*)dstPtrs[dst] + railAllOffset; }, delta); + } railAllOffset += delta; node += 1; } @@ -245,15 +281,15 @@ struct RunWorkColl, /*Direct=*/0, Proto, 0> - prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr, - work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr, + work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat); + prims.template process(scat, NCCL_DIRECT_READ, 0); } return; } @@ -269,15 +305,15 @@ struct RunWorkColl send to network - Primitives, /*Direct=*/0, Proto, 0> + Primitives, /*Direct=*/1, Proto, 0> prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr, - work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat); + prims.template process(scat, 0, NCCL_DIRECT_READ); } } return; diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index 7774202..9b039a4 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -15,11 +15,11 @@ struct RunWorkBatch __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->sendBytes; - int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8); + int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8); Primitives, 1, Proto, 1> prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, - /*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); + /*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); @@ -31,15 +31,15 @@ struct RunWorkBatch __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->recvBytes; - int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8); + int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8); Primitives, 1, Proto, 1> prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, - /*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); + /*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); - prims.directRecv(cursor, n); + prims.directRecv(cursor, cursor, n); cursor += n; } while (cursor < bytes && work->recvRegistered == 0); } @@ -80,6 +80,9 @@ struct RunWorkBatchsendBytes : work->recvBytes) = partEnd - partBeg; } } + // Coverity reports a possible thread divergence due to not all threads participating in the collective. + // However, the code ensures that the participation is on a per-warp basis. 
+ // coverity[device_thread_diverged:FALSE] uint32_t mask = __ballot_sync(~0u, hasWork); if (lane == 0) { shared->workSendMask = mask>>16; diff --git a/src/enqueue.cc b/src/enqueue.cc index 0e07e3f..4edb42d 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -11,6 +11,7 @@ #include "bootstrap.h" #include "channel.h" #include "cudawrap.h" +#include "profiler.h" #include "transport.h" #include // std::memcpy @@ -121,6 +122,10 @@ static void addWorkBatchToPlan( if (newBatch || extendBatch) { if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch. struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc(&comm->memScoped); + // Coverity thinks that ncclIntruQueueEnqueue will access chan->workBatchQueue->tail, which might + // be NULL. But that code is guarded by chan->workBatchQueue->head not being NULL, in which + // case tail won't be NULL either. + // coverity[var_deref_model:FALSE] ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode); batch = &batchNode->batch; batch->nextExtends = 0; @@ -239,7 +244,29 @@ static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* c return ncclSuccess; } -static ncclResult_t registerIntraNodeBuffers( +static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { + if (conn->connected) { + if (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) { + *needReg = true; + } else { + // network connection + *needReg = false; + } + } else { + struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer]; + struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank]; + int canConnect = 0; + NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo)); + if (canConnect) { + *needReg = true; + } else { + *needReg = false; + } + } + return ncclSuccess; +} + +static ncclResult_t registerCollBuffers( struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], @@ -250,8 +277,10 @@ static ncclResult_t registerIntraNodeBuffers( info->regBufType = NCCL_REGULAR_BUFFER; *regNeedConnect = true; + if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; #if CUDART_VERSION >= 11030 - if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) { + if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; bool regBufUsed = false; const void *sendbuff = info->sendbuff; void *recvbuff = info->recvbuff; @@ -284,60 +313,6 @@ static ncclResult_t registerIntraNodeBuffers( } info->regBufType = NCCL_NVLS_REG_BUFFER; } - } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now - comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other - comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers - comm->planner.persistent && 0) { - /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. 
*/ - int localRank = comm->localRank; - cudaPointerAttributes sattr, rattr; - - CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff)); - CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff)); - if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess; - - if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; - - struct HandlePair { - cudaIpcMemHandle_t ipc[2]; // {send, recv} - size_t offset[2]; // {send, recv} - }; - struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; - - CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); - CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); - - void *baseSend, *baseRecv; - size_t size; - CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); - handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; - CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); - handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; - - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); - - // Open handles locally - for (int i=0; i < comm->localRanks; i++) { - if (i == localRank) { // Skip self - outRegBufSend[i] = nullptr; - outRegBufRecv[i] = nullptr; - } else { - for (int sr=0; sr < 2; sr++) { - // Get base address of mapping - void* base; - CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); - // Get real buffer address by adding offset in the mapping - (sr == 0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; - // Enqueue reminder to close memory handle - struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback)); - cb->base.fn = cleanupIpc; - cb->ptr = base; - ncclIntruQueueEnqueue(cleanupQueue, &cb->base); - info->nCleanupQueueElts += 1; - } - } - } - info->regBufType = NCCL_IPC_REG_BUFFER; } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) { size_t elementSize = ncclTypeSize(info->datatype); size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); @@ -356,27 +331,200 @@ static ncclResult_t registerIntraNodeBuffers( } if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) { - ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); - info->sendMhandle = sendHandle; - if (sendRegBufFlag) { + if (!sendRegBufFlag) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + info->sendMhandle = sendHandle; + } + if (sendRegBufFlag && !recvRegBufFlag) { ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); info->recvMhandle = recvHandle; } } if (sendRegBufFlag && recvRegBufFlag) { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1)); + info->nMaxChannels = 1; info->regBufType = NCCL_COLLNET_REG_BUFFER; if (sendRegBufFlag == 1 && 
recvRegBufFlag == 1) { INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize); } } + } else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) { + // IPC buffer registration + if (info->func == ncclFuncReduceScatter) goto exit; + if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; + if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; + if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; + + int peerRanks[NCCL_MAX_LOCAL_RANKS]; + int nPeers = 0; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + int regBufFlag = 0; + memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS); + + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + struct ncclChannel* channel = comm->channels; + for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { + for (int updown = 0; updown < 2; ++updown) { + int peer; + if (updown == 0) + peer = channel->collnetDirect.up[r]; + else + peer = channel->collnetDirect.down[r]; + if (peer != -1) { + struct ncclConnector* peerConn = &channel->peers[peer]->recv[0]; + bool needReg = false; + + NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg)); + if (needReg) { + bool found = false; + for (int p = 0; p < nPeers; ++p) { + if (peerRanks[p] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + } + + if (nPeers > 0) { + if (ncclParamLocalRegister()) + ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs); + if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (regBufFlag) { + if (ncclParamLocalRegister()) + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + } else if (info->algorithm == NCCL_ALGO_RING) { + struct ncclReg* recvRegRecord; + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); + if (recvRegRecord == NULL) goto exit; + for (int c = 0; c < comm->nChannels; ++c) { + struct ncclChannel* channel = comm->channels + c; + for (int r = 0; r < 2; ++r) { + bool needReg = false; + int peer; + struct ncclConnector* peerConn; + // P2P transport + if (r == 0) + peer = channel->ring.prev; + else + peer = channel->ring.next; + peerConn = 
&channel->peers[peer]->recv[0]; + NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg)); + + if (needReg) { + bool found = false; + for (int p = 0; p < nPeers; ++p) { + if (peerRanks[p] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + if (nPeers > 0) { + if (ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + struct ncclReg* recvRegRecord; + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); + if (recvRegRecord == NULL) goto exit; + for (int c = 0; c < comm->nChannels; ++c) { + struct ncclChannel* channel = comm->channels + c; + struct ncclTree* tree = NULL; + int peers[NCCL_MAX_TREE_ARITY + 1]; + + if (info->algorithm == NCCL_ALGO_TREE) + tree = &channel->tree; + else + tree = &channel->collnetChain; + for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p]; + peers[NCCL_MAX_TREE_ARITY] = tree->up; + for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) { + int peer = peers[p]; + bool peerNeedReg = false; + struct ncclConnector* recvConn = NULL; + // P2P transport + if (peer == -1 || peer == comm->nRanks) continue; + recvConn = &channel->peers[peer]->recv[0]; + NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg)); + + if (peerNeedReg) { + bool found = false; + for (int pindex = 0; pindex < nPeers; ++pindex) { + if (peerRanks[pindex] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + if (nPeers > 0) { + if (ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + } + + if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) { + info->nMaxChannels = 16; + } } -fallback: +exit: #endif return result; } +static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { + ncclResult_t ret = ncclSuccess; + uintptr_t offset = 0; + uintptr_t* peerRmtAddrs = NULL; + + *regFlag = 0; + if (ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); + } + if (*regFlag == 0 && comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, 
reinterpret_cast(cleanupQueue), NULL); + } + + if (*regFlag) + *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); + return ret; +} + static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* task, @@ -500,7 +648,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; bool regNeedConnect = true; - registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) { if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) { @@ -517,6 +665,10 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool struct ncclDevWorkColl devWork = {}; devWork.sendbuff = (void*)task->sendbuff; devWork.recvbuff = (void*)task->recvbuff; + devWork.sendbuffOffset = task->sendbuffOffset; + devWork.recvbuffOffset = task->recvbuffOffset; + devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; + devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; devWork.root = task->root; devWork.nWarps = task->nWarps; devWork.redOpArg = task->opDev.scalarArg; @@ -527,35 +679,13 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool struct ncclWorkList* workNode; switch (task->regBufType) { case NCCL_REGULAR_BUFFER: + case NCCL_IPC_REG_BUFFER: case NCCL_COLLNET_REG_BUFFER: { workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeColl; workNode->size = sizeof(struct ncclDevWorkColl); memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); } break; - case NCCL_IPC_REG_BUFFER: - { struct ncclDevWorkCollReg workReg = {}; - workReg.coll = devWork; - struct ncclChannel *channel0 = &comm->channels[0]; - for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel0->collnetDirect.down[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; // Get intra-node slot - workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer - workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer - } - for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel0->collnetDirect.up[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; - // Output buffer of root peer - workReg.upOutputs[i] = regBufRecv[j]; - } - workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); - workNode->workType = ncclDevWorkTypeCollReg; - workNode->size = sizeof(struct ncclDevWorkCollReg); - memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); - } break; case NCCL_NVLS_REG_BUFFER: { struct ncclDevWorkCollReg workReg = {}; workReg.coll = devWork; // C++ struct assignment @@ -590,6 +720,7 @@ static ncclResult_t scheduleCollTasksToPlan( int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls] comm->nChannels, comm->nvlsChannels}; + constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal do { size_t workBytes = 0; struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); @@ -601,7 +732,7 @@ static ncclResult_t scheduleCollTasksToPlan( nPlanColls += 1; workBytes += workNode->size; int 
kind = 2*task->isCollnet + task->isNvls; - trafficBytes[kind] += task->trafficBytes; + trafficBytes[kind] += std::max(MinTrafficPerChannel, task->trafficBytes); nChannels[kind] += task->nMaxChannels; nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]); task = task->next; @@ -611,7 +742,6 @@ static ncclResult_t scheduleCollTasksToPlan( } while (0); int kindPrev = -1; - constexpr size_t MinTrafficPerChannel = 512; size_t trafficPerChannel = 0; int channelId = 0; size_t currentTraffic = 0; @@ -650,14 +780,16 @@ static ncclResult_t scheduleCollTasksToPlan( for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { proxyOp.channelId = c; proxyOp.opCount = proxyOpId; + proxyOp.task.coll = task; + proxyOp.rank = comm->rank; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet - constexpr size_t cellSize = 16; + int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); - int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); size_t trafficPerElement = elementSize*trafficPerByte; size_t trafficPerCell = cellSize*trafficPerByte; size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell)); @@ -665,7 +797,7 @@ static ncclResult_t scheduleCollTasksToPlan( if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo" cellsLo = cells; } else { - cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell); + cellsLo = std::min(cells, divUp((trafficPerChannel-currentTraffic),trafficPerCell)); } int nMidChannels = (cells-cellsLo)/cellsPerChannel; size_t cellsHi = (cells-cellsLo)%cellsPerChannel; @@ -725,12 +857,12 @@ static ncclResult_t scheduleCollTasksToPlan( // Update the current channel and vacant traffic budget. if (countHi != 0) { channelId += nChannels-1; - currentTraffic = countHi*trafficPerElement; + currentTraffic = cellsHi*elementsPerCell*trafficPerElement; } else if (nMidChannels != 0) { channelId += nChannels; currentTraffic = 0; } else { - currentTraffic += countLo*trafficPerElement; + currentTraffic += cellsLo*elementsPerCell*trafficPerElement; } if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) { @@ -750,7 +882,12 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->channelId = c; proxyOp->opCount = proxyOpId; + proxyOp->task.coll = task; + proxyOp->rank = comm->rank; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to + // determine if that's actually true but it's also not clear if that would be an issue. 
+ // coverity[uninit_use_in_call:FALSE] NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); } } @@ -790,6 +927,7 @@ static ncclResult_t scheduleCollTasksToPlan( ncclIntruQueueDequeue(&planner->collWorkQueue); nPlanColls -= 1; planner->nTasksColl -= 1; + ncclIntruQueueEnqueue(&plan->collTaskQueue, task); ncclIntruQueueEnqueue(&plan->workQueue, workNode); plan->workBytes += workNode->size; } @@ -807,7 +945,8 @@ static ncclResult_t addP2pToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int nChannelsMin, int nChannelsMax, int p2pRound, int sendRank, void* sendAddr, ssize_t sendBytes, - int recvRank, void* recvAddr, ssize_t recvBytes + int recvRank, void* recvAddr, ssize_t recvBytes, + struct ncclTaskP2p** p2pTasks ) { constexpr int connIndex = 1; bool selfSend = (sendRank == comm->rank); @@ -842,7 +981,8 @@ static ncclResult_t addP2pToPlan( int chunkSize[2]; int chunkDataSize[2]; int chunkDataSize_u32fp8[2]; - bool registered[2]; + bool registered[2] = {false, false}; + bool ipcRegistered[2] = {false, false}; for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL; @@ -866,11 +1006,29 @@ static ncclResult_t addP2pToPlan( chunkSize[dir] = chunkDataSize[dir]; if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; - registered[dir] = false; - if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { - struct ncclReg* regRecord; - NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); - registered[dir] = (regRecord && regRecord->nDevs); + if (network[dir]) { + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { + struct ncclReg* regRecord; + NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); + registered[dir] = regRecord && regRecord->nDevs; + } + } else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) { + int peerRank = dir ? sendRank : recvRank; + int regFlag = 0; + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, 0); + struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; + struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] + : &channelPeers[peerRank]->recv[connIndex]; + void* regAddr = NULL; + if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { + // We require users registering buffers on both sides + NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], ®Flag, ®Addr, &plan->cleanupQueue)); + if (regFlag) { + if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr; + else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr; + } + } + ipcRegistered[dir] = regFlag ? true : false; } if (bytes[dir] == -1) nChannels[dir] = 0; @@ -900,6 +1058,7 @@ static ncclResult_t addP2pToPlan( work->nSendChannels = nChannels[1]; work->sendProtoLL = protoLL[1]; work->sendRegistered = registered[1]; + work->sendIpcReg = ipcRegistered[1]; work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1]; work->sendRank = sendRank; work->sendAddr = sendAddr; @@ -907,6 +1066,7 @@ static ncclResult_t addP2pToPlan( work->nRecvChannels = nChannels[0]; work->recvProtoLL = protoLL[0]; work->recvRegistered = registered[0]; + work->recvIpcReg = ipcRegistered[0]; work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0]; work->recvRank = recvRank; work->recvAddr = recvAddr; @@ -925,6 +1085,9 @@ static ncclResult_t addP2pToPlan( op->pattern = dir ? 
ncclPatternSend : ncclPatternRecv; op->chunkSize = chunkSize[dir]; op->reg = registered[dir]; + op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; + op->task.p2p = p2pTasks[dir]; + op->rank = comm->rank; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } @@ -1041,13 +1204,16 @@ static ncclResult_t scheduleP2pTasksToPlan( if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { return ncclSuccess; } - NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes)); + struct ncclTaskP2p* p2pTasks[2] = { recv, send }; + NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, p2pTasks)); if (send != nullptr) { ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send); comm->planner.nTasksP2p -= 1; } if (recv != nullptr) { ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv); comm->planner.nTasksP2p -= 1; } } @@ -1100,29 +1266,44 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce } } +namespace { + struct uploadWork_cleanup_t { + struct ncclCommEventCallback base; + void *hostBuf; + }; + ncclResult_t uploadWork_cleanup_fn( + struct ncclComm* comm, struct ncclCommEventCallback* cb + ) { + struct uploadWork_cleanup_t* me = (struct uploadWork_cleanup_t*)cb; + free(me->hostBuf); + CUDACHECK(cudaEventDestroy(me->base.event)); + return ncclSuccess; + } +} + static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); - void* fifoBuf; + void* fifoBufHost; uint32_t fifoCursor, fifoMask; switch (plan->workStorageType) { case ncclDevWorkStorageTypeArgs: plan->kernelArgs->workBuf = nullptr; - fifoBuf = (void*)plan->kernelArgs; + fifoBufHost = (void*)plan->kernelArgs; fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes; fifoMask = ~0u; break; case ncclDevWorkStorageTypeFifo: - fifoBuf = comm->workFifoBuf; + fifoBufHost = comm->workFifoBuf; fifoCursor = comm->workFifoProduced; fifoMask = comm->workFifoBytes-1; waitWorkFifoAvailable(comm, fifoCursor + workBytes); plan->kernelArgs->workBuf = comm->workFifoBufDev; break; case ncclDevWorkStorageTypePersistent: - ncclMemoryStackPush(&comm->memScoped); - fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16); + static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment."); + fifoBufHost = malloc(workBytes); fifoCursor = 0; fifoMask = ~0u; break; @@ -1144,7 +1325,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla // Write the channel-shared work structs. 
struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue); while (workNode != nullptr) { - char* dst = (char*)fifoBuf; + char* dst = (char*)fifoBufHost; char* src = (char*)(workNode+1); for (int n = workNode->size; n != 0; n -= 16) { memcpy( @@ -1164,11 +1345,39 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence(); break; case ncclDevWorkStorageTypePersistent: - NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes)); - plan->kernelArgs->workBuf = plan->workBufPersistent; - NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes)); - ncclMemoryStackPop(&comm->memScoped); - break; + { ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + void* fifoBufDev = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the + // user's graph will be launched later, and it also acquires the deviceStream, + // it will observe this upload. + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope); + + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + plan->workBufPersistent = fifoBufDev; + plan->kernelArgs->workBuf = fifoBufDev; + + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + cudaEvent_t memcpyDone; + CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + + struct uploadWork_cleanup_t* cleanup; + NCCLCHECK(ncclCalloc(&cleanup, 1)); + cleanup->base.fn = uploadWork_cleanup_fn; + cleanup->base.event = memcpyDone; + cleanup->hostBuf = fifoBufHost; + ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base); + + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope); + + finish_scope: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (result != ncclSuccess) return result; + } break; default: break; } return ncclSuccess; @@ -1182,6 +1391,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue); while (op != nullptr) { + op->profilerContext = comm->profilerContext; + op->eActivationMask = op->coll <= ncclFuncAllReduce ? op->task.coll->eActivationMask : op->task.p2p->eActivationMask; + op->taskEventHandle = op->coll <= ncclFuncAllReduce ? op->task.coll->eventHandle : op->task.p2p->eventHandle; + ncclProfilerAddPidToProxyOp(op); + uint64_t oldId = op->opCount; // Ignoring the bottom tag bit, opCount's are zero-based within plan so // translate them to the tip of the comm's history. 
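
The persistent-plan hunk above replaces a synchronous allocate-and-copy with stream-ordered cudaMallocAsync/cudaMemcpyAsync plus an event callback that frees the host staging buffer only after the copy has finished. A minimal standalone sketch of that deferred-cleanup pattern follows; it is not NCCL code (uploadAsync, pollCleanups and PendingFree are hypothetical names) and it assumes CUDA 11.2+ for stream-ordered allocation:

#include <cuda_runtime.h>
#include <cstdlib>
#include <vector>

struct PendingFree {            // one outstanding host buffer waiting on its copy
  cudaEvent_t done;
  void* hostBuf;
};
static std::vector<PendingFree> g_pending;

cudaError_t uploadAsync(void** devPtr, void* hostBuf, size_t bytes, cudaStream_t stream) {
  cudaError_t err;
  // Stream-ordered allocation: later work queued on `stream` observes the buffer.
  if ((err = cudaMallocAsync(devPtr, bytes, stream)) != cudaSuccess) return err;
  if ((err = cudaMemcpyAsync(*devPtr, hostBuf, bytes, cudaMemcpyHostToDevice, stream)) != cudaSuccess) return err;
  cudaEvent_t done;
  if ((err = cudaEventCreateWithFlags(&done, cudaEventDisableTiming)) != cudaSuccess) return err;
  if ((err = cudaEventRecord(done, stream)) != cudaSuccess) return err;
  g_pending.push_back({done, hostBuf});   // defer free(hostBuf) until the copy completes
  return cudaSuccess;
}

void pollCleanups() {
  // Reclaim host buffers whose copies have completed; keep the rest pending.
  std::vector<PendingFree> still;
  for (PendingFree& p : g_pending) {
    if (cudaEventQuery(p.done) == cudaSuccess) {
      free(p.hostBuf);
      cudaEventDestroy(p.done);
    } else {
      still.push_back(p);
    }
  }
  g_pending.swap(still);
}

Because nothing blocks on the copy, the upload can proceed even while a graph is being captured; the communicator later polls its callback queue (as ncclCommPollEventCallbacks does above) to release the host memory.
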
@@ -1216,8 +1430,12 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* } static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { + NCCLCHECK(ncclProfilerStartGroupEvent(plan)); + NCCLCHECK(ncclProfilerStartTaskEvents(plan)); NCCLCHECK(uploadProxyOps(comm, plan)); NCCLCHECK(ncclProxyStart(comm)); + NCCLCHECK(ncclProfilerStopTaskEvents(plan)); + NCCLCHECK(ncclProfilerStopGroupEvent(plan)); if (!plan->persistent) { // Notify main thread of our reclaiming. This will reclaim plan concurrently. ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); @@ -1238,13 +1456,30 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; - NCCLCHECK(ncclCudaFree(plan->workBufPersistent)); + if (plan->workStorageType == ncclDevWorkStorageTypePersistent) { + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECK(cudaFree(plan->workBufPersistent)); + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + } struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); while (q != nullptr) { struct ncclProxyOp* q1 = q->enqNext; ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); q = q1; } + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct != nullptr) { + struct ncclTaskColl* ct1 = ct->next; + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct); + ct = ct1; + } + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt != nullptr) { + struct ncclTaskP2p* pt1 = pt->next; + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt); + pt = pt1; + } ncclResult_t result = ncclSuccess; while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); @@ -1286,7 +1521,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { plan->comm = comm; plan->reclaimer.fn = reclaimPlan; plan->persistent = persistent; - // uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit. + // finishPlan() promotes ncclDevWorkStorageType[Fifo|Persistent]->Args if the work can fit. plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent : ncclDevWorkStorageTypeFifo; @@ -1554,10 +1789,15 @@ static ncclResult_t updateCollCostTable( for (int a=0; amaxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* now we only support single-node NVLS allgather and reducescatter */ if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + /* Tree reduceScatter doesn't support scaling yet */ + if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter + && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; for (int p=0; pprotocol = protocol; float time = minTime; + // Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case. 
+ // coverity[check_after_sink] if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { WARN("Error : no algorithm/protocol available"); @@ -1610,7 +1852,7 @@ static ncclResult_t topoGetAlgoInfo( info->protocol = backupProto; time = backupTime; } - if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); + if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time); if (simInfo) simInfo->estimatedTime = time; TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); @@ -1653,6 +1895,7 @@ static ncclResult_t topoGetAlgoInfo( } nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always. + if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS; info->nMaxChannels = nc; info->nWarps = nt/WARP_SIZE; return ncclSuccess; @@ -1704,8 +1947,15 @@ static ncclResult_t calcCollChunking( pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; case ncclFuncReduceScatter: + pattern = + info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatUp : + info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + ncclPatternRing; + break; case ncclFuncAllGather: pattern = + info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatDown : info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : ncclPatternRing; @@ -1729,6 +1979,8 @@ static ncclResult_t calcCollChunking( case ncclPatternTreeUp: case ncclPatternTreeDown: case ncclPatternTreeUpDown: + case ncclPatternPatUp: + case ncclPatternPatDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: case ncclPatternCollnetChain: @@ -1776,13 +2028,17 @@ static ncclResult_t calcCollChunking( int maxChunkSize = comm->nvlsChunkSize; if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; - // Use uint64_t so that concurrentOps*chunkSize*X does not overflow + // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. + // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. + // coverity[overflow_before_widen] uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { - // Use uint64_t so that concurrentOps*chunkSize*X does not overflow + // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. + // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. 
+ // coverity[overflow_before_widen] uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; chunkSize = comm->nvlsChunkSize; int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize(); @@ -1796,14 +2052,21 @@ static ncclResult_t calcCollChunking( int nNodes = comm->nNodes; float ppn = comm->nRanks / (float)nNodes; float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn; + // Yes, we are OK with the division on the left side of the < operand being integer. + // coverity[integer_division] while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; + // coverity[integer_division] while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; + } else if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) { + while (chunkSize*nChannels*32 > nBytes && chunkSize > 65536) chunkSize /= 2; + } else if (info->func == ncclFuncReduceScatter && info->algorithm == NCCL_ALGO_PAT) { + while (chunkSize*nChannels*16 > nBytes && chunkSize > 65536) chunkSize /= 2; } // Compute directFlags of work struct. if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Set direct direction for broadcast-gather (read or write) - *outDirectFlags = (nBytes/nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; + *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { *outDirectFlags = 0; } @@ -1852,6 +2115,10 @@ static ncclResult_t calcCollChunking( } } + if (pattern == ncclPatternPatUp || pattern == ncclPatternPatDown) { + proxyOp->nbytes = DIVUP(nBytes, nChannels); + } + *outChunkSize = chunkSize; return ncclSuccess; } @@ -1874,6 +2141,7 @@ static ncclResult_t hostToDevRedOp( opFull->proxyOp = op; int nbits = 8*ncclTypeSize(datatype); + if (nbits <= 0) return ncclInvalidArgument; uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); @@ -1947,8 +2215,12 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. ncclGroupCommJoin(info->comm); - struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); + struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); + p2p->func = info->coll; p2p->buff = (void*)info->recvbuff; + p2p->count = info->count; + p2p->datatype = info->datatype; + p2p->root = info->root; p2p->bytes = nBytes; ncclIntruQueueEnqueue( isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, @@ -1996,7 +2268,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. ncclGroupCommJoin(info->comm); - struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); t->func = info->coll; t->sendbuff = info->sendbuff; t->recvbuff = info->recvbuff; @@ -2026,7 +2298,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { while (true) { if (l == nullptr) { // Got to the end, this must be a new stream. 
struct ncclCudaGraph graph; - NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) + NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)); if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); return ncclInvalidUsage; @@ -2075,7 +2347,7 @@ exit: NCCLCHECK(ncclGroupEndInternal()); /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change * so we have to check state here. */ - if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; + if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)); } return ret; fail: if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret); @@ -2093,7 +2365,8 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp int cap = 2*comm->userRedOpCapacity; if (cap < 4) cap = 4; ncclUserRedOp *ops = new ncclUserRedOp[cap]; - std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp)); + if (comm->userRedOpCapacity > 0) + std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp)); for(int ix=comm->userRedOpCapacity; ix < cap; ix++) ops[ix].freeNext = ix + 1; delete[] comm->userRedOps; @@ -2109,8 +2382,10 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp user->datatype = datatype; user->opFull.op = ncclDevPreMulSum; if (residence == ncclScalarHostImmediate) { + int size = ncclTypeSize(datatype); + if (size < 1) return ncclInternalError; user->opFull.scalarArgIsPtr = false; - std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype)); + std::memcpy(&user->opFull.scalarArg, scalar, size); } else { user->opFull.scalarArgIsPtr = true; user->opFull.scalarArg = reinterpret_cast(scalar); @@ -2127,6 +2402,10 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { WARN("ncclRedOpDestroy : operator is a NCCL builtin."); return ncclInvalidArgument; } + // int(ncclMaxRedOp) < int(op) will always be false due to the sizes of + // the datatypes involved, and that's by design. We keep the check though + // just as a reminder. + // coverity[result_independent_of_operands] if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) { WARN("ncclRedOpDestroy : operator is garbage."); return ncclInvalidArgument; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index b1b99d4..3f639a0 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -226,6 +226,8 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* } } channel->collnetDirect.nHeads = nHeads; + // nHeads should always be greater than 0. + // coverity[divide_by_zero] channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 
1 : 2; sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); @@ -374,20 +376,21 @@ NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) { // Gather data from all ranks - int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; + ncclResult_t ret = ncclSuccess; + int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL; int nranks = comm->nRanks; int nNodes = comm->nNodes; int nChannels = comm->nChannels; int minHeadNum = INT_MAX; int shared = parent && parent->nvlsSupport && parent->config.splitShare; NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS)); + NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { @@ -433,8 +436,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } // Connect rings and trees. This should also duplicate the channels. 
- NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); - NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns)); + NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail); + NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail); // Duplicate ringPrev/ringNext for ncclBuildRing memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); @@ -459,7 +462,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } - NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT])); + NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN @@ -493,7 +496,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa if (comm->nChannels < comm->nvlsChannels) { nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext); } - NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum)); + NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail); #endif if (shared && comm->nChannels > parent->sharedRes->tpNChannels) { nChannels = comm->nChannels = parent->sharedRes->tpNChannels; @@ -501,16 +504,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } // Create rings array and check all is fine - NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); + NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail); - free(ringRecv); - free(ringSend); - free(ringPrev); - free(ringNext); - free(treeToParent); - free(treeToChild0); - free(treeToChild1); - free(nvlsHeads); - - return ncclSuccess; +exit: + if (ringRecv) free(ringRecv); + if (ringSend) free(ringSend); + if (ringPrev) free(ringPrev); + if (ringNext) free(ringNext); + if (treeToParent) free(treeToParent); + if (treeToChild0) free(treeToChild0); + if (treeToChild1) free(treeToChild1); + if (nvlsHeads) free(nvlsHeads); + return ret; +fail: + goto exit; } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 1380d24..999312a 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -36,13 +36,13 @@ NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0); static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { if (baseNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); + for (int i=0; inodes[baseNode->type].count; i++) baseNode->paths[baseNode->type][i].type = PATH_DIS; } // breadth-first search to set all paths to that node in the system struct ncclTopoNodeList nodeList; - struct ncclTopoNodeList nextNodeList; + struct ncclTopoNodeList nextNodeList = { { 0 }, 0 }; nodeList.count = 1; nodeList.list[0] = baseNode; - nextNodeList.count = 0; struct ncclTopoLinkList* basePath; NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; @@ -116,9 +116,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n const int linesize = 1024; char line[linesize]; #ifdef ENABLE_TRACE - INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id); + 
INFO(NCCL_GRAPH, "Paths from %s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id)); #else - snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id); + snprintf(line, linesize, "%s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id)); int offset = strlen(line); #endif for (int t=0; tnodes[GPU].nodes[gpu].paths[CPU]; for (int c=0; cnodes[CPU].count; c++) { int hops = paths[c].count; - if (minHops == 0 || hops < minHops) { + if (hops > 0 && (minHops == 0 || hops < minHops)) { localCpu = c; minHops = hops; } @@ -193,20 +193,15 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, return ncclSuccess; } -// Remove/free paths for a given type -static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) { - for (int t=0; tnodes[t].count; n++) { - struct ncclTopoNode* node = system->nodes[t].nodes+n; - free(node->paths[nodeType]); - node->paths[nodeType] = NULL; - } - // Remove links _from_ the given type - for (int n=0; nnodes[nodeType].count; n++) { - struct ncclTopoNode* node = system->nodes[nodeType].nodes+n; - free(node->paths[t]); - node->paths[t] = NULL; +// Remove/free all paths +static void ncclTopoRemovePaths(struct ncclTopoSystem* system) { + for (int t1=0; t1nodes[t1].count; n++) { + struct ncclTopoNode* node = system->nodes[t1].nodes+n; + for (int t2=0; t2paths[t2]) free(node->paths[t2]); + node->paths[t2] = NULL; + } } } } @@ -220,6 +215,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE if (str) { int disable = strtol(str, NULL, 0); if (disable == 1) l = 0; + if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable); } } if (l == -1) { @@ -241,9 +237,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE if (oldLevel > maxOldLevel) oldLevel = maxOldLevel; l = levelsOldToNew[oldLevel]; } + if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]); } } - if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]); *level = l >= 0 ? l : -2; } return ncclSuccess; @@ -252,16 +248,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); int ncclTopoUserP2pLevel = -1; -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) { +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) { *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; // Get GPUs from topology int g1, g2; - NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1)); + NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1)); struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; - if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) { + if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) { // GPU not found, we can't use p2p. return ncclSuccess; } @@ -277,8 +273,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ } } - // In general, use P2P whenever we can. 
- int p2pLevel = PATH_SYS; + // By default don't use P2P across CPU Host Bridges and further apart + int p2pLevel = PATH_PXB; + + int arch, vendor, model; + NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); + // Allow P2P between pairs of GPUs on AMD systems + if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS; // User override if (ncclTopoUserP2pLevel == -1) @@ -288,16 +289,6 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ goto compare; } - // Don't use P2P through ARM CPUs - int arch, vendor, model; - NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); - if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; - if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - p2pLevel = PATH_PXB; - } - if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - p2pLevel = PATH_PXB; - } compare: // Compute the PCI distance and compare with the p2pLevel. @@ -438,7 +429,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); // Check whether going through the network would be faster than going through P2P/SHM. -ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) { +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net) { if (ncclParamNetDisableIntra() == 1) { *net = 0; return ncclSuccess; @@ -446,8 +437,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_ *net = 1; // First check the current GPU-to-GPU speed. int g1, g2; - if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess || - ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) { + if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess || + ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) { return ncclSuccess; } @@ -545,7 +536,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // Precompute paths between GPUs/NICs. // Remove everything in case we're re-computing - for (int t=0; tnodes[CPU].count; c++) { @@ -571,11 +562,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm for (int g=0; gnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { int p2p; - NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL)); + NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; - NCCLCHECK(getLocalCpu(system, g, &cpu)); + NCCLCHECK(ncclGetLocalCpu(system, g, &cpu)); NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g)); } } @@ -587,10 +578,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm if (p == g) continue; struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; int p2p; - NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, comm, NULL, srcInfo, dstInfo)); if (p2p == 0) { int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, comm, NULL, srcInfo, dstInfo)); if (shm == 0) { // Mark this peer as inaccessible. 
We'll trim it later. system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET; @@ -631,7 +622,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; - NCCLCHECK(getLocalCpu(system, g, &localCpu)); + NCCLCHECK(ncclGetLocalCpu(system, g, &localCpu)); NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g)); NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n)); } @@ -642,11 +633,13 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; int *domains; - int64_t *ids; - NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); - NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); + int64_t *ids = NULL; int myDomain = 0; + int ngpus = system->nodes[GPU].count; + NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); + NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail); for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; domains[g] = g; @@ -659,7 +652,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } - int ngpus = system->nodes[GPU].count; for (int i=0; inodes[GPU].count == comm->nRanks) { for (int n=system->nodes[NET].count-1; n>=0; n--) - NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); + NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail); } +exit: free(domains); - free(ids); - return ncclSuccess; + if (ids) free(ids); + return ret; +fail: + goto exit; } void ncclTopoFree(struct ncclTopoSystem* system) { - for (int t=0; t= line_length) { + // Sprintf wanted to write more than would fit in the buffer. Assume + // line_length is at least 4 and replace the end with "..." to + // indicate that it was truncated. + snprintf(line+line_length-4, 4, "..."); + } + INFO(NCCL_INIT, "%s", line); } ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { @@ -32,7 +38,7 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p rings[r*nranks+i] = current; current = next[r*nranks+current]; } - sprintf(prefix, "Channel %02d/%02d : ", r, nrings); + snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings); if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); if (current != rank) { WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); diff --git a/src/graph/search.cc b/src/graph/search.cc index 7f16cb7..ad6f580 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -104,6 +104,9 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink)); revBw += fwBw; } + // Coverity thinks that revLink could be NULL below. However, we access it only if revBw is non-0, and the + // logic of the code is that revBw can become non-0 only if revLink is non-NULL (see the "if" statement right above). + // coverity[var_deref_op] if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; } SUB_ROUND(link->bw, fwBw); if (revBw) SUB_ROUND(revLink->bw, revBw); @@ -444,6 +447,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // 2. 
add other NETs satisfying typeInter but not already in the list. ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { + ncclResult_t ret = ncclSuccess; int netCount = 0; int localNetCount; int* localNets; @@ -456,8 +460,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; for (int c = 0; cgpu.rank, c, &netId, NULL)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); + NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail); + NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail); if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; localNetCount++; } @@ -491,12 +495,15 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in } *netCountRet = netCount; +exit: free(localNets); - - return ncclSuccess; + return ret; +fail: + goto exit; } ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) { + ncclResult_t ret = ncclSuccess; if ((*time) <= 0) return ncclSuccess; (*time)--; @@ -518,6 +525,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo } graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; + int* nets = NULL; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { @@ -525,15 +533,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; int netCount; - int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); + NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail); for (int i=0; inodes[NET].nodes+n; if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric - if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; - if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue; + if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { + if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue; + } else { + if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; + } // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; @@ -545,18 +555,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->bwInter /= 2; } - NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); + NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); + NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail); 
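
Several hunks in this area (ncclTopoPostset and ncclTopoSelectNets above, ncclTopoSearchRecGpu/ncclTopoSearchRecNet here, ncclTopoGetSystem further down) switch from early-returning NCCLCHECK calls to NCCLCHECKGOTO with shared exit/fail labels, so temporary arrays such as nets, localNets and the ring/tree buffers are freed on every path. Below is a minimal standalone sketch of that cleanup idiom; CHECK_GOTO and the surrounding function are hypothetical stand-ins, not NCCL's actual macro definitions:

#include <cstdlib>

typedef int result_t;
enum { OK = 0, ERR = 1 };

// Hypothetical stand-in for the NCCLCHECKGOTO(call, ret, label) usage seen above:
// run the call and, on failure, record the result and jump to the cleanup label.
#define CHECK_GOTO(call, res, label) do { \
    (res) = (call);                       \
    if ((res) != OK) goto label;          \
  } while (0)

static result_t fillBuffers(int* a, int* b, int n) {
  for (int i = 0; i < n; i++) { a[i] = i; b[i] = n - i; }
  return OK;
}

result_t buildTables(int n) {
  result_t ret = OK;
  int* a = NULL;   // initialize pointers so the cleanup path is always safe
  int* b = NULL;
  a = (int*)calloc(n, sizeof(int));
  if (a == NULL) return ERR;              // nothing else allocated yet
  b = (int*)calloc(n, sizeof(int));
  if (b == NULL) { ret = ERR; goto fail; }
  CHECK_GOTO(fillBuffers(a, b, n), ret, fail);
exit:
  free(a);                                // single cleanup path for success and failure
  free(b);
  return ret;
fail:
  goto exit;
}

Success falls through to exit; failures jump to fail, which forwards to the same cleanup, so adding another allocation only means adding one more free under exit.
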
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; - NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); + NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail); graph->bwInter = bwInterSave; } } - free(nets); } } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); @@ -592,23 +601,29 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } - return ncclSuccess; +exit: + if (nets) free(nets); + return ret; +fail: + goto exit; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { + ncclResult_t ret = ncclSuccess; const int bw = graph->bwInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netCount; int graphFound = 0; - NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); + NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue; + if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; if (graph->collNet && net->net.collSupport == 0) continue; if (net->net.bw < bw) continue; - if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue; + if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2 + && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue; graph->inter[graph->nChannels*2] = net->id; graph->latencyInter = net->net.latency; @@ -624,31 +639,34 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo // NVLS search only tries to find NIC:GPU combinations to compute the heads. 
if (graph->nChannels < netCount) { int gpu; - int duplicate = 0; - NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); - // check whether there is duplicate head when one GPU connects with multiple NICs - for (int gc = 0; gc < graph->nChannels; gc++) { - if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) { - duplicate = 1; - break; + NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail); + if (gpu != -1) { + int duplicate = 0; + // check whether there is duplicate head when one GPU connects with multiple NICs + for (int gc = 0; gc < graph->nChannels; gc++) { + if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) { + duplicate = 1; + break; + } + } + if (!duplicate) { + NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail); + graphFound = 1; } } - if (duplicate) continue; - if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); - graphFound = 1; } } else { if (graph->nChannels > 0) { // Try to replay the last channel int g; - NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); + NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail); + NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); + NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail); if (t == -1) *time = -1; } @@ -660,7 +678,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (paths[g].bw > maxBw) { maxBw = paths[g].bw; minHops = paths[g].count; - } else if (paths[g].bw == maxBw && paths[g].count < minHops) { + } else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) { minHops = paths[g].count; } } @@ -668,7 +686,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); + NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail); } } } @@ -682,8 +700,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } +exit: free(nets); - return ncclSuccess; + return ret; +fail: + goto exit; } /* Search Patterns @@ -1040,9 +1061,10 @@ search: } tmpGraph.typeInter = PATH_PIX; - if (crossNic == 2 && tmpGraph.crossNic == 0) { + if (crossNic == 2 && tmpGraph.crossNic == 0 + && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) { // Try again with crossNic if permitted - tmpGraph.crossNic = 1; + tmpGraph.crossNic = 2; goto search; } 
tmpGraph.crossNic = crossNic == 1 ? 1 : 0; @@ -1112,7 +1134,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr sprintf(line, "%2d :", c); int offset = strlen(line); if (system->nodes[NET].count > 0) { - sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]); + sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c])); offset = strlen(line); } for (int i=0; inodes[NET].count > 0) { - sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]); + sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c])); offset = strlen(line); } INFO(NCCL_GRAPH, "%s", line); @@ -1129,16 +1151,20 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr } ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) { + ncclResult_t ret = ncclSuccess; const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE"); + struct ncclXml* xml = NULL; if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str); - struct ncclXml* xml; NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES)); - NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml)); - NCCLCHECK(ncclTopoDumpXmlToFile(str, xml)); - free(xml); + NCCLCHECKGOTO(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml), ret, fail); + NCCLCHECKGOTO(ncclTopoDumpXmlToFile(str, xml), ret, fail); } - return ncclSuccess; +exit: + if (xml) free(xml); + return ret; +fail: + goto exit; } #include "comm.h" diff --git a/src/graph/topo.cc b/src/graph/topo.cc index d6af928..9771ae0 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -192,6 +192,7 @@ int getBcmGen(uint64_t id, int level) { return 0; } ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { + ncclResult_t ret = ncclSuccess; for (int s=0; snodes[PCI].count; s++) { struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s; int gen = getBcmGen(pciSwitch->pci.device, 0); @@ -217,7 +218,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { for (int s=0; snodes[PCI].nodes is changing every time we remove a node) int index; - NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index)); + NCCLCHECKGOTO(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index), ret, fail); struct ncclTopoNode* sub = system->nodes[PCI].nodes+index; // Connect all sub PCI devices to the parent switch for (int l=0; lnlinks; l++) { @@ -226,7 +227,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { // Add link from parent PCI switch -> PCI device if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) { WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS); - return ncclInternalError; + ret = ncclInternalError; + goto fail; } memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink)); pciSwitch->nlinks++; @@ -238,16 +240,20 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { } } } - NCCLCHECK(ncclTopoRemoveNode(system, PCI, index)); + NCCLCHECKGOTO(ncclTopoRemoveNode(system, PCI, index), ret, fail); } // Set subdevice to 0xffff to make sure we don't merge this switch again. pciSwitch->pci.device |= 0xffff; free(subSwIds); // Restart, as system->nodes[PCI].nodes has changed. 
s = 0; + continue; +fail: + free(subSwIds); + return ret; } } - return ncclSuccess; + return ret; } ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { @@ -281,7 +287,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_LOC) { - sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id); + sprintf(line+offset, "+ %s[%2.1f] - %s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id)); INFO(NCCL_GRAPH, "%s", line); } else if (link->type != LINK_PCI || link->remNode != prevNode) { sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); @@ -290,9 +296,9 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { - sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); + sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { - sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); + sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id)); } INFO(NCCL_GRAPH, "%s", line); } @@ -720,84 +726,87 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) { } ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { + ncclResult_t ret = ncclSuccess; struct ncclXml* xml; + char* mem = NULL; + int* localRanks = NULL; + int netDevCount = 0; + struct ncclXml* rankXml; + int localRank = -1, nLocalRanks = 0; NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES)); const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile); - NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1)); + NCCLCHECKGOTO(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1), ret, fail); } else { // Try default XML topology location - NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0)); + NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail); } if (xml->maxIndex == 0) { // Create top tag struct ncclXmlNode* top; - NCCLCHECK(xmlAddNode(xml, NULL, "system", &top)); - NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION)); + NCCLCHECKGOTO(xmlAddNode(xml, NULL, "system", &top), ret, fail); + NCCLCHECKGOTO(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION), ret, fail); } - NCCLCHECK(ncclTopoRefreshBcmP2pLinks()); + NCCLCHECKGOTO(ncclTopoRefreshBcmP2pLinks(), ret, fail); // Detect only the GPU managed by this process. We'll get any others through XML fusion. 
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId)); + NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail); struct ncclXmlNode* node; - NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); + NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail); if (node) { - NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); - NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank)); - NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport)); + NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail); + NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail); + NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail); } // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. - int netDevCount = 0; if (collNetSupport(comm)) { - NCCLCHECK(collNetDevices(comm, &netDevCount)); + NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail); for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); - NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); - NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1)); + NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail); + NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail); } } if (netDevCount == 0) { - NCCLCHECK(comm->ncclNet->devices(&netDevCount)); + NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail); } for (int n=0; nncclNet->getProperties(n, &props)); + NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail); comm->netDeviceType = props.netDeviceType; struct ncclXmlNode* netNode; - NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode)); - NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); - NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); - NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); - NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); - NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency)); - NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); - NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); + NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail); + NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail); + NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail); + NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail); + NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail); + NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail); + NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail); + NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail); bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); - NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); + NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail); } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) - NCCLCHECK(ncclTopoTrimXml(xml)); + NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail); // XML topo fusion. 
- int* localRanks; - int localRank = -1, nLocalRanks = 0; if (comm->MNNVL) { // MNNVL clique support nLocalRanks = comm->clique.size; @@ -805,7 +814,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy localRanks = comm->clique.ranks; } else { // Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations. - NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks)); + NCCLCHECKGOTO(ncclCalloc(&localRanks, comm->nRanks), ret, fail); for (int i = 0; i < comm->nRanks; i++) { if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) { if (i == comm->rank) @@ -814,37 +823,42 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } } } - char* mem; - NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); - struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank); + NCCLCHECKGOTO(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail); + rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank); memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)); - NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1)); - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); + NCCLCHECKGOTO(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1), ret, fail); + // nLocalRanks can't actually be 0, or we wouldn't be running at all... + // coverity[divide_by_zero] + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail); if (comm->MNNVL) { // Ensure that we have enough room when fusing topos from multiple nodes. free(xml); - NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES)); + xml = NULL; + NCCLCHECKGOTO(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES), ret, fail); } else { // In the intra-node case there's no need to enlarge the topo xml. 
xml->maxIndex = 0; - free(localRanks); } for (int i = 0; i < nLocalRanks; i++) { struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i); - NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0)); - NCCLCHECK(ncclTopoFuseXml(xml, peerXml)); + NCCLCHECKGOTO(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0), ret, fail); + NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail); } - free(mem); xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); - NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml)); + NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail); } - NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash)); + NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); +exit: + if (!comm->MNNVL && localRanks) free(localRanks); + if (mem) free(mem); free(xml); - return ncclSuccess; + return ret; +fail: + goto exit; } ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { @@ -853,6 +867,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index int count = 0; NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; + if (paths == NULL) { *localCount = 0; return ncclSuccess; } for (int i=0; inodes[resultType].count; i++) { if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) { maxBw = paths[i].bw; @@ -891,6 +906,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c } ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { + ncclResult_t ret = ncclSuccess; int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); int* localNets; @@ -898,39 +914,46 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); int* localGpus = NULL; int localGpuCount; - NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL)); - int net = system->nodes[GPU].nodes[gpu].gpu.dev; + int net; + NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail); + net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); net += channelId%(DIVUP(localNetCount,localGpuCount)); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; +exit: free(localNets); - free(localGpus); - return ncclSuccess; + if (localGpus) free(localGpus); + return ret; +fail: + goto exit; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { + ncclResult_t ret = ncclSuccess; int netIndex; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex)); int* localGpus = NULL; int localGpuCount; + int foundGpu = -1; NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].nodes+g; int64_t id; - NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); + NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, 
NULL), ret, fail); if (netId == id) { - *gpuIndex = g; - free(localGpus); - return ncclSuccess; + foundGpu = g; + goto exit; } } } +exit: + *gpuIndex = foundGpu; +fail: free(localGpus); - *gpuIndex = -1; - return ncclSuccess; + return ret; } /****************************/ @@ -948,25 +971,11 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) { struct ncclTopoNode* cpu = NULL, *gpu = NULL; - for (int g=0; gnodes[GPU].count; g++) { - if (system->nodes[GPU].nodes[g].gpu.rank == rank) { - gpu = system->nodes[GPU].nodes+g; - // Find closer CPU - int cpuIndex = -1, minHops = 0; - for (int c=0; cnodes[CPU].count; c++) { - int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; - if (cpuIndex == -1 || nHops < minHops) { - cpuIndex = c; - minHops = nHops; - } - } - cpu = system->nodes[CPU].nodes+cpuIndex; - } - } - if (cpu == NULL) { - WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank); - return ncclInternalError; - } + int gpuIndex, cpuIndex; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex)); + NCCLCHECK(ncclGetLocalCpu(system, gpuIndex, &cpuIndex)); + gpu = system->nodes[GPU].nodes+gpuIndex; + cpu = system->nodes[CPU].nodes+cpuIndex; // Query the CPU affinity set we were provided cpu_set_t mask; diff --git a/src/graph/topo.h b/src/graph/topo.h index 6613f32..0837fb4 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -30,7 +30,7 @@ // to GPU traffic consumes more PCI bandwidth. #define INTEL_P2P_OVERHEAD(bw) (bw*6/5) -#define NCCL_TOPO_NODE_TYPES 7 +#define NCCL_TOPO_NODE_TYPES 6 #define GPU 0 #define PCI 1 #define NVS 2 @@ -103,9 +103,10 @@ struct ncclTopoLinkList { #define NCCL_TOPO_UNDEF (-1) +#define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56) -#define NCCL_TOPO_ID_LOCAL_ID(id) (id & 0x00ffffffffffffff) -#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + localid) +#define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK) +#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK)) struct ncclTopoNode { int type; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index f9d814a..f0a6224 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { - { 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring + { 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree @@ -64,15 +64,15 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { #define NCCL_HW_NET 2 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, + { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, /* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, + { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 
}, /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, + { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 }, /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } } }; @@ -105,6 +105,15 @@ static const double perChMaxTreeBws[3][3] = { /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, }; +NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); +static int ncclPatEnable(struct ncclComm* comm) { + int patEnable = ncclParamPatEnable(); + if (patEnable != 2) return patEnable; + if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node + if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload + return 1; +} + // Network post overhead in ns (1000 = 1 us) NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2); @@ -146,7 +155,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; - float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount + float ppn = (float)nRanks / nNodes; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; @@ -156,18 +165,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 : nRanks; - int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) : - coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : - nNodes; for (int a=0; abwIntra : graphs[a]->bwInter; if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); @@ -176,11 +185,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Various model refinements if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); } - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw); - if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw); + if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; + if (a == NCCL_ALGO_PAT) busBw *= .85; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { @@ -208,7 +218,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } // Convert bus BW to algorithm BW - if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { + if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { float ratio = 1.0f; if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps; else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0; @@ -222,7 +232,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; - float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter; + // With ppn=1 latencies are fully exposed, use the Tree network latency + float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p]; + interLat += graphs[a]->latencyInter; // Also add the flush extra latency if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter; @@ -243,11 +255,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3; } intraLat = std::max(intraLat, netOverhead); + int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1; comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat; } } else if (a == NCCL_ALGO_TREE) { - comm->latencies[coll][a][p] += - 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); + if (coll == ncclFuncAllReduce) { + comm->latencies[coll][a][p] += + 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); + } } else if (a == NCCL_ALGO_COLLNET_DIRECT) { comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency @@ -258,6 +273,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (nNodes > 1) comm->latencies[coll][a][p] += interLat; } else if (a == NCCL_ALGO_NVLS_TREE) { comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat; + } else if (a == NCCL_ALGO_PAT) { + if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) { + comm->latencies[coll][a][p] = 8 // Base time + + log2i(nNodes) * (interLat/3.5) // Log latency + + nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point. + } } } } @@ -266,7 +287,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. 
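For illustration only, plugging assumed numbers into the PAT latency model above (16 nodes, one GPU per node, and an assumed interLat of 14 us; none of these values come from a measured topology):

    latency = 8 + log2i(16) * (14 / 3.5) + 16 * 2.8
            = 8 + 4 * 4.0 + 44.8
            = 68.8 us

The logarithmic term stays small as the job scales out, while the nRanks * 2.8 term is the linear part the in-line comment hopes to remove eventually.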
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 }; const char *protoStr = ncclGetEnv("NCCL_PROTO"); if (protoStr) { @@ -336,23 +357,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (comm->rank == 0) { char line[1024]; - for (int block=0; block<2; block++) { + for (int block=0; block= NCCL_NUM_ALGORITHMS) continue; sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], ""); } INFO(NCCL_TUNING, "%s", line); sprintf(line, " Protocol |"); - for (int ba=0; ba= NCCL_NUM_ALGORITHMS) continue; for (int p=0; pmaxThreads[a][p]); } @@ -360,8 +383,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom INFO(NCCL_TUNING, "%s", line); for (int c=0; c= NCCL_NUM_ALGORITHMS) continue; for (int p=0; platencies[c][a][p], comm->bandwidths[c][a][p]); } @@ -431,7 +455,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, *time = -1.0; return ncclSuccess; } int logSize = log2i(nBytes>>6); - if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; + if (algorithm == NCCL_ALGO_TREE && coll == ncclFuncAllReduce && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1 && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) { lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring diff --git a/src/graph/xml.cc b/src/graph/xml.cc index c2c6a1c..bb123b7 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -468,8 +468,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* return ncclInternalError; } // Set affinity - char cpumaskPath[] = "/sys/devices/system/node/node0000"; - sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId); + char cpumaskPath[] = "/sys/devices/system/node/node000000"; + snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId); NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity")); } @@ -690,6 +690,9 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* } pciNode->parent = parent; // Keep PCI sub devices ordered by PCI Bus ID (Issue #820) + // Coverity complains about dereferenced parent being NULL + // but this can never happen. 
+ // coverity[var_deref_op] int subIndex = parent->nSubs; const char* newBusId; NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId)); diff --git a/src/group.cc b/src/group.cc index 7158b45..3d3ecb8 100644 --- a/src/group.cc +++ b/src/group.cc @@ -57,7 +57,12 @@ ncclResult_t ncclAsyncLaunch( WARN("Blocking and nonblocking communicators are not allowed in the same group."); ret = ncclInvalidArgument; } - ncclIntruQueueEnqueue(&ncclAsyncJobs, job); + if (ret == ncclSuccess) { + ncclIntruQueueEnqueue(&ncclAsyncJobs, job); + } else { + // no need to undo, the job hasn't run + if (destructor) destructor(job); + } } return ret; @@ -75,7 +80,7 @@ void* ncclAsyncJobMain(void* arg) { ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { ncclResult_t ret; - SYSCHECK(pthread_join(job->thread, NULL), "pthread_join"); + PTHREADCHECK(pthread_join(job->thread, NULL), "pthread_join"); if (job->result != ncclSuccess) { WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result); } @@ -165,6 +170,12 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); break; } + case NCCL_ALGO_PAT: { + NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail); + break; + } + // Yes, it's a dead code. That's fine... + // coverity[dead_error_begin] default: { ret = ncclInternalError; goto fail; @@ -301,7 +312,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g ncclKernelPlanner::Peer* tmp = comm->planner.peers; memset(&comm->planner, 0, sizeof(comm->planner)); comm->planner.peers = tmp; - memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); + if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); } if (!comm->config.blocking) @@ -329,7 +340,7 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuethread, nullptr, ncclAsyncJobMain, job), ret, fail); + PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail); job = job->next; } while (job != nullptr); @@ -341,8 +352,9 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuethread, nullptr) != 0) { - WARN("Error waiting for pthread_join : %s", strerror(errno)); + int err; + if ((err = pthread_join(job->thread, nullptr)) != 0) { + WARN("Error waiting for pthread_join: %s", strerror(err)); ret = ncclSystemError; } job->state = ncclGroupJobJoined; @@ -373,13 +385,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuedestroyFlag && job->comm && !job->comm->config.blocking) - (void) ncclCommSetAsyncError(job->comm, ret); - if (job->destructor) job->destructor((void*)job); - } - exit: return ret; fail: @@ -393,6 +398,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; + bool *groupAbortFlag = gjob->abortFlagPtr; CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); @@ -409,7 +415,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf job->base.abortFlag = comm->abortFlag; job->base.abortFlagDev = comm->abortFlagDev; job->comm = comm; - ncclIntruQueueEnqueue(asyncJobsMain, &job->base); + ncclIntruQueueEnqueue(asyncJobsMain, (struct ncclAsyncJob*)job); struct ncclComm* next = comm->preconnectNext; comm->preconnectNext = reinterpret_cast(0x1); @@ -422,12 
+428,14 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf /* Connect channels at runtime if cumem is supported */ if (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; - + struct ncclIntruQueue asyncCollJobs; + ncclIntruQueueConstruct(&asyncCollJobs); do { bool needConnect = false; bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); if (comm->cuMemSupport && needConnect) { @@ -438,21 +446,33 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf job->base.destructor = free; job->base.state = ncclGroupJobRunning; job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; job->comm = comm; NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); - ncclIntruQueueEnqueue(asyncJobsMain, &job->base); + ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); } comm = comm->groupNext; } while (comm); - NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); + while (!ncclIntruQueueEmpty(&asyncCollJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); + if (job->destructor) job->destructor((void*)job); + } } if ((!simInfo) && (groupCommHeadMain != nullptr)) { NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); } + while (!ncclIntruQueueEmpty(asyncJobsMain)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); + if (!job->destroyFlag && job->comm && !job->comm->config.blocking) + (void) ncclCommSetAsyncError(job->comm, ret); + if (job->destructor) job->destructor((void*)job); + } + while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; @@ -517,7 +537,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ncclGroupJobMainPtr = &ncclGroupJobMain; /* make sure ncclGroupBlocking has been set. 
*/ assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); - if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) { + if (ncclGroupBlocking == 0) { /* nonblocking group */ if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); @@ -539,7 +559,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { } ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking; - SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); + PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail); ret = ncclInProgress; } else { /* blocking group */ diff --git a/src/include/alloc.h b/src/include/alloc.h index 71d0777..7744119 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -17,6 +17,11 @@ #include #include +#if CUDART_VERSION >= 11030 +#include +#include "cudawrap.h" +#endif + uint64_t clockNano(); // from utils.h with which we have a circular dependency template @@ -24,6 +29,81 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); } template<> constexpr size_t ncclSizeOfT() { return 1; } +#if CUDART_VERSION >= 12020 + +static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) { + ncclResult_t result = ncclSuccess; + size_t granularity = 0; + CUdevice currentDev; + CUmemAllocationProp prop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle; + int cudaDev; + int cpuNumaNodeId = -1; + CUmemAllocationHandleType type = ncclCuMemHandleType; + + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev)); + if (cpuNumaNodeId < 0) cpuNumaNodeId = 0; + prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.requestedHandleTypes = type; // So it can be exported + prop.location.id = cpuNumaNodeId; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + ALIGN_SIZE(size, granularity); + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &prop, 0)); + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, granularity, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory for local GPU */ + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + + /* Now allow RW access to the newly mapped memory from the CPU */ + accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; + accessDesc.location.id = cpuNumaNodeId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + + if (handlep) *handlep = handle; + INFO(NCCL_ALLOC, "CUMEM Host Alloc Size %zi pointer %p handle %llx numa %d dev %d granularity %ld", size, *ptr, handle, cpuNumaNodeId, cudaDev, granularity); + return result; +} + +static inline ncclResult_t ncclCuMemHostFree(void* ptr) { + if (ptr == NULL) return ncclSuccess; + ncclResult_t result = ncclSuccess; + 
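A minimal usage sketch for the cuMem host allocation helper defined above; it is not part of the patch. The wrapper function name, the 1 MiB size, and the assumption that the CUDA 12.2+ driver entry points are already loaded are all illustrative.

    // Sketch only: allocate pinned host memory that is mapped read/write for both
    // the local GPU and the host NUMA node, keep the handle so the allocation can
    // be exported later, then release everything with ncclCuMemHostFree.
    static ncclResult_t exampleHostBuffer(void** bufOut, CUmemGenericAllocationHandle* handleOut) {
      size_t bytes = 1 << 20;   // illustrative; rounded up to the allocation granularity internally
      NCCLCHECK(ncclCuMemHostAlloc(bufOut, handleOut, bytes));
      return ncclSuccess;
    }
    // ... when the buffer is no longer needed: NCCLCHECK(ncclCuMemHostFree(buf));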
CUmemGenericAllocationHandle handle; + size_t size = 0; + CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + TRACE(NCCL_ALLOC, "CUMEM Host Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); + CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + return result; +} + +#else /* CUDART_VERSION >= 12020 */ + +static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep, size_t size) { + WARN("CUMEM Host is not supported prior to CUDA 12.2"); + return ncclInternalError; +} + +static inline ncclResult_t ncclCuMemHostFree(void* ptr) { + WARN("CUMEM Host is not supported prior to CUDA 12.2"); + return ncclInternalError; +} + +#endif /* CUDART_VERSION >= 12020 */ + template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; @@ -40,24 +120,25 @@ finish: INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } -#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) -inline ncclResult_t ncclCudaHostFree(void* ptr) { +static inline ncclResult_t ncclCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return ncclSuccess; } +#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + template ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { if (nelem > 0) { - void* p = malloc(nelem*ncclSizeOfT()); + T* p = (T*)malloc(nelem*ncclSizeOfT()); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); return ncclSystemError; } //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), p); memset(p, 0, nelem*ncclSizeOfT()); - *ptr = (T*)p; + *ptr = p; } else { *ptr = NULL; } @@ -67,17 +148,17 @@ ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int li template ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { - if (nelem < oldNelem) return ncclInternalError; + T* oldp = *ptr; + if (nelem < oldNelem || (oldp == NULL && oldNelem > 0)) return ncclInternalError; if (nelem == oldNelem) return ncclSuccess; - T* oldp = *ptr; T* p = (T*)malloc(nelem*ncclSizeOfT()); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); return ncclSystemError; } - memcpy(p, oldp, oldNelem*ncclSizeOfT()); - free(oldp); + if (oldp && oldNelem) memcpy(p, oldp, oldNelem * ncclSizeOfT()); + if (oldp) free(oldp); memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT()); *ptr = (T*)p; INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT(), nelem*ncclSizeOfT(), *ptr); @@ -89,6 +170,40 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { #include #include "cudawrap.h" +// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer +static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) { + ncclResult_t result = ncclSuccess; + size_t granularity = 0; + CUmemAllocationProp prop = {}; + CUmemAccessDesc accessDesc = {}; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuMemGetAllocationPropertiesFromHandle(&prop, *handleIn)); + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + ALIGN_SIZE(size, granularity); + 
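A short hedged example of the tightened ncclRealloc contract above; the function name and counts are illustrative, not code from the patch.

    // Sketch: ncclRealloc is grow-only. Shrinking, or passing *ptr == NULL while
    // oldNelem > 0, now returns ncclInternalError; on success the old data is
    // copied, the old buffer freed, and the new tail zero-initialized.
    static ncclResult_t growPeerList(int** peers, int oldCount, int newCount) {
      NCCLCHECK(ncclRealloc(peers, oldCount, newCount));
      // entries [oldCount, newCount) are now zeroed and ready to fill in
      return ncclSuccess;
    }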
/* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, *handleIn, 0)); + /* Now allow RW access to the newly mapped memory */ + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn); + return result; +} + +static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { + if (ptr == NULL) return ncclSuccess; + ncclResult_t result = ncclSuccess; + size_t size = 0; + CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + return result; +} + static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; @@ -106,7 +221,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand prop.requestedHandleTypes = type; prop.location.id = currentDev; // Query device to see if RDMA support is available - CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); ALIGN_SIZE(size, granularity); @@ -154,6 +269,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { return ncclInternalError; } +static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} + +static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} #endif template @@ -274,7 +398,8 @@ finish: // and if they are shared, that could cause a crash in a child process inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { if (size > 0) { - size_t page_size = sysconf(_SC_PAGESIZE); + long page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) return ncclSystemError; void* p; int size_aligned = ROUNDUP(size, page_size); int ret = posix_memalign(&p, page_size, size_aligned); diff --git a/src/include/bitops.h b/src/include/bitops.h index 95620cb..a650aa7 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -185,6 +185,8 @@ inline __host__ __device__ Int pow2Up(Int x) { template inline __host__ __device__ Int pow2Down(Int x) { + // True, log2Down can return -1, but we don't normally pass 0 as an argument... 
+ // coverity[negative_shift] return Int(1)< // Check system calls -#define SYSCHECK(call, name) do { \ +#define SYSCHECK(statement, name) do { \ int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ + SYSCHECKSYNC((statement), name, retval); \ if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ + WARN("Call to " name " failed: %s", strerror(errno)); \ return ncclSystemError; \ } \ } while (false) -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ +#define SYSCHECKSYNC(statement, name, retval) do { \ + retval = (statement); \ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ } else { \ @@ -60,14 +56,33 @@ } \ } while(true) -#define SYSCHECKGOTO(statement, RES, label) do { \ - if ((statement) == -1) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ +#define SYSCHECKGOTO(statement, name, RES, label) do { \ + int retval; \ + SYSCHECKSYNC((statement), name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed: %s", strerror(errno)); \ + RES = ncclSystemError; \ goto label; \ } \ -} while (0); +} while (0) + +// Pthread calls don't set errno and never return EINTR. +#define PTHREADCHECK(statement, name) do { \ + int retval = (statement); \ + if (retval != 0) { \ + WARN("Call to " name " failed: %s", strerror(retval)); \ + return ncclSystemError; \ + } \ +} while (0) + +#define PTHREADCHECKGOTO(statement, name, RES, label) do { \ + int retval = (statement); \ + if (retval != 0) { \ + WARN("Call to " name " failed: %s", strerror(retval)); \ + RES = ncclSystemError; \ + goto label; \ + } \ +} while (0) #define NEQCHECK(statement, value) do { \ if ((statement) != value) { \ @@ -75,7 +90,7 @@ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ -} while (0); +} while (0) #define NEQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) != value) { \ @@ -84,7 +99,7 @@ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ -} while (0); +} while (0) #define EQCHECK(statement, value) do { \ if ((statement) == value) { \ @@ -92,7 +107,7 @@ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ -} while (0); +} while (0) #define EQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) == value) { \ @@ -101,7 +116,7 @@ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ -} while (0); +} while (0) // Propagate errors up #define NCCLCHECK(call) do { \ @@ -111,7 +126,7 @@ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return RES; \ } \ -} while (0); +} while (0) #define NCCLCHECKGOTO(call, RES, label) do { \ RES = call; \ @@ -120,7 +135,7 @@ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ -} while (0); +} while (0) #define NCCLWAIT(call, cond, abortFlagPtr) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ @@ -130,7 +145,7 @@ return ncclInternalError; \ } \ if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \ -} while (!(cond)); +} while (!(cond)) #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, 
label) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ @@ -140,7 +155,7 @@ goto label; \ } \ if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ -} while (!(cond)); +} while (!(cond)) #define NCCLCHECKTHREAD(a, args) do { \ if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ diff --git a/src/include/collectives.h b/src/include/collectives.h index fb7af3b..e45d78f 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -64,4 +64,490 @@ struct ncclConnFifo { ssize_t size; void* ptr; }; + +#include + +template +class PatRSAlgorithm{ + size_t offset; + size_t end; + size_t count; + int chunkCount; + int nelem; + int rank; + int nranks; + int nrPow2; + int postFreq; + int lastA; + + int aggFactor; + int as; // aggregated steps + int a; // step inside aggregated step + int sendSkipped; // number of skipped steps during aggregation + int recvSkipped; // number of skipped steps during aggregation + int phase2recv; // receive offset for phase 2 + int aggDelta; + int scale; + int phase; + + __device__ __host__ int min(int a, int b) { + return (a>=1) { + if ((i&mask) == 0) ret += imask; + } + return ret; + } + + __device__ __host__ int firstBitSet(int i, int max) { + int ffs = +#ifdef __CUDA_ARCH__ + __ffs(i); +#else + __builtin_ffs(i); +#endif + return ffs ? ffs-1 : max; + } + + __device__ __host__ void resetA() { + a = 0; + sendSkipped = recvSkipped = 0; + lastA = aggFactor; + if (phase >= 2) lastA /= 2*scale; + } + + __device__ __host__ void reset() { + nelem = getNelem(); + phase = 0; + scale = 1; + phase2recv = 0; + as = aggDelta - 1; + resetA(); + } + + __device__ __host__ int nBitsSet(int i) { + int nbits = +#ifdef __CUDA_ARCH__ + __popc(i); +#else + __builtin_popcount(i); +#endif + return nbits; + } + + // Return 1 when only upper bits are set. For example, if nrpow2==16 we'll return 1 for 8, 12, 14, 15. + // A number being in the form of 1111000 implies that the complementary is 0000111 meaning it's a power of 2 minus 1. + __device__ __host__ int newPeer(int i, int pow2) { + //printf("New peer %d/%d -> %d\n", i, pow2, nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0); + return nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 
1 : 0; + } + +public: + __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): + offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) { + aggDelta = nrPow2 = (1<= 2 && aggFactor < nranks/2) { + aggFactor *= 2; + aggDelta /= 2; + } + postFreq = aggFactor; + int d = stepDepth; + while (d > 1 && aggFactor < nranks/2) { + d /= 2; + aggFactor *= 2; + aggDelta /= 2; + } + + reset(); + } + + __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { +restart: + last = 0; + nelemOut = nelem; + outIx = offset; + int skip = 0; + //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); + if (phase == 0) { + int s = mirrorInvert(a, lastA)*aggDelta + as; + if (s >= nranks) skip = 1; + int sendDataRank = (rank + s) % nranks; + inpIx = sendDataRank * count + offset; + recvDim = -1; + sendDim = 0; + outIx = 0; + recvOffset = -1; + sendOffset = ((a - sendSkipped)%postFreq) * nelem; + sendStepOffset = 0; + if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + postSend = 1; + } else { + postSend = 0; + } + postRecv = 0; + if (skip) sendSkipped++; + if (++a == lastA) { + phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2 + resetA(); + } + if (skip == 0) return; + } else if (phase == 1) { + int s = mirrorInvert(a, lastA)*aggDelta + as; + if (s >= nranks) skip = 1; + recvDim = firstBitSet(s, nrPow2); + sendOffset = ((a - sendSkipped)%postFreq)*nelem; + recvOffset = ((a - recvSkipped)%postFreq)*nelem; + postSend = 0; + if (recvDim == 0) { + if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1; + sendStepOffset = 0; + } else { + sendStepOffset = (a - sendSkipped)/postFreq; + } + if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + postRecv = 1; + } else { + postRecv = 0; + } + s -= (1<= nranks) skip = 1; + recvDim = 0; + postSend = a == lastA-1 ? 1 : 0; + s -= 1; + if (s < nranks && skip) { + recvDim = -1; + recvOffset = -1; + skip = 0; + } else if (!skip) { + int foffset = phase2recv; + phase2recv++; + postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + recvOffset = (foffset%postFreq) * nelem; + } + int recvDataRank = (rank + nranks + s) % nranks; + inpIx = recvDataRank * count + offset; + sendDim = s ? firstBitSet(s, nrPow2) : -1; + int foffset = a - sendSkipped; + postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + sendStepOffset = 0; + sendOffset = (foffset%postFreq) * nelem; + if (skip || sendDim == -1) sendSkipped++; + if (++a == lastA) { + phase = 3; + resetA(); + } + if (skip == 0) return; + } else if (phase == 3) { + int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta; + postRecv = a == lastA-1 ? 1 : 0; + if (s >= nranks) skip = 1; + recvDim = firstBitSet(s, nrPow2); + postSend = 0; + s -= (1<= 0 ? (foffset%postFreq) * nelem : -1; + if (skip || recvDim == -1) recvSkipped++; + if (skip) sendSkipped++; + if (++a == lastA) { + scale *= 2; + phase = scale < aggFactor ? 
2 : 4; + resetA(); + } + if (skip == 0) return; + } else if (phase == 4) { + recvDim = 0; + sendDim = -1; + inpIx = rank * count + offset; + recvOffset = (phase2recv%postFreq) * nelem; + sendStepOffset = 0; + sendOffset = -1; + postRecv = 1; + postSend = 0; + offset += chunkCount; + if (offset >= end) { + last = 1; + } else { + reset(); + } + return; + } + goto restart; + } +}; + +template +class PatAGAlgorithm{ + size_t offset; + size_t end; + size_t count; + int chunkCount; + int nelem; + int rank; + int nranks; + int nrPow2; + int postFreq; + int lastA; + + int aggFactor; + int as; // aggregated steps + int a; // step inside aggregated step + int aggDelta; + + int scale; + + int phase; + + // AS computation + int asDim; + int v; + int bitCount[32]; + int bitZeroStep[32]; + + __device__ __host__ int min(int a, int b) { + return (a>=1) { + if ((i&mask)) ret += imask; + } + return ret; + } + + __device__ __host__ int firstBitSet(int i, int max) { + int ffs = +#ifdef __CUDA_ARCH__ + __ffs(i); +#else + __builtin_ffs(i); +#endif + return ffs ? ffs-1 : max; + } + + __device__ __host__ void resetA() { + a = 0; + lastA = aggFactor; + if (phase >= 2) lastA /= 2*scale; + } + + __device__ __host__ void reset() { + nelem = getNelem(); + scale = aggFactor/2; + phase = scale ? 2 : 1; + v = 0; + for (int i = 0; i= 2 && aggFactor < nranks/2) { + aggFactor *= 2; + aggDelta /= 2; + } + postFreq = aggFactor; + int d = stepDepth; + while (d > 1 && aggFactor < nranks/2) { + d /= 2; + aggFactor *= 2; + aggDelta /= 2; + } + //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta); + + asDim = log2Up(aggDelta); + reset(); + } + + __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { +restart: + //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); + last = 0; + nelemOut = nelem; + inpIx = offset; + int skip = 0; + if (phase == 0) { + int s = a*aggDelta + as; + if (s >= nranks) skip = 1; + int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0; + int recvDataRank = (rank + s) % nranks; + outIx = recvDataRank * count + offset; + sendDim = -1; + recvDim = 0; + inpIx = 0; + sendOffset = -1; + recvOffset = (a % postFreq) * nelem; + recvStepOffset = 0; + postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + postSend = 0; + a++; + if (nextSkip) { + as = nextAs(); + if (as == aggDelta/2) { + offset += chunkCount; + if (offset >= end) { + last = 1; + } else { + reset(); + } + return; + } + phase = 1; + resetA(); + } + if (skip == 0) return; + } else if (phase == 1) { + int s = a*aggDelta + as; + if (s >= nranks) skip = 1; + sendDim = firstBitSet(s, nrPow2); + s -= (1<= nranks) ? 1 : 0; + postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; + recvStepOffset = (sendDim == 0) ? 0 : a/postFreq; + if (recvDim == -1) { + recvOffset = -1; + postRecv = 0; + } else if (as - (1<> (recvDim+1); + recvOffset = (foffset%postFreq)*nelem; + postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<= nranks) ? 1 : 0; + recvStepOffset = (sendDim == 0) ? 
0 : foffset/postFreq; + } + if (s < nranks && sendDim == 0 && skip) { + // Don't forget to receive at least once even if we don't send afterwards + sendDim = -1; + sendOffset = -1; + postSend = 0; + skip = 0; + } + if (++a == lastA) { + if (as % 2 == 1) { + phase = 0; + } else { + as = nextAs(); + } + resetA(); + } + if (skip == 0) return; + } else if (phase == 2) { + int s = (2*a+1)*scale*aggDelta; + postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; + postRecv = 0; + if (s >= nranks) skip = 1; + sendDim = firstBitSet(s, nrPow2); + s -= (1<> (recvDim+1); + recvOffset = (foffset%postFreq)*nelem; + recvStepOffset = foffset / postFreq; + } + if (++a == lastA) { + scale /= 2; + phase = scale ? 2 : 1; + resetA(); + } + if (skip == 0) return; + } + goto restart; + } +}; #endif diff --git a/src/include/comm.h b/src/include/comm.h index 0cc0a89..9d102df 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -16,6 +16,7 @@ #include "nccl_net.h" #include "register.h" #include "graph.h" +#include "profiler.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -104,6 +105,11 @@ struct ncclCommCallback { struct ncclCommCallback* next; ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); }; +struct ncclCommEventCallback { + struct ncclCommEventCallback* next; + cudaEvent_t event; + ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommEventCallback* cb); +}; struct ncclSharedResources { int refCount; @@ -173,6 +179,54 @@ struct ncclCollnetHandleList { struct ncclProxyConnector* proxyconn; }; +struct ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclRedOp_t opHost; + struct ncclDevRedOpFull opDev; + int chunkSteps, sliceSteps; + // Computed later: + size_t trafficBytes; + int32_t nMaxChannels:8; + int32_t nWarps:8; + int32_t algorithm:8, protocol:8; + uint32_t isCollnet:1, isNvls:1; + uint32_t devFuncId:30; + enum ncclRegBufferType regBufType; + // number of elements in planner->ipcMemQueue associated with this collective + int nCleanupQueueElts; + + void* sendMhandle; + void* recvMhandle; + // index for IPC record lookup + uintptr_t sendbuffOffset; + uintptr_t recvbuffOffset; + uintptr_t* sendbuffRmtAddrs; + uintptr_t* recvbuffRmtAddrs; + + // Profiler plugin + int eActivationMask; + void* eventHandle; +}; +struct ncclTaskP2p { + struct ncclTaskP2p* next; + ncclFunc_t func; + void* buff; + size_t count; + ncclDataType_t datatype; + int root; + size_t bytes; + + // Profiler plugin + int eActivationMask; + void* eventHandle; +}; + struct ncclKernelPlan { // A kernel plan is also a callback that reclaims itself. Hence this must // be the first member. 
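A host-side sketch of how the PAT step enumerators above appear intended to be driven; the element type, the stepSize/stepDepth inputs, and the way the outputs are consumed are assumptions made for illustration only.

    // Sketch: walk the PAT ReduceScatter schedule until the enumerator reports the
    // last chunk. Each call returns which dimension (if any; -1 means none) to
    // receive and send on, input/output indices, buffer offsets, and whether the
    // receive/send should be posted on this step.
    PatRSAlgorithm<float> algo(stepSize, stepDepth, /*offset*/0, end, count, chunkCount, rank, nranks);
    int last = 0;
    while (!last) {
      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend;
      size_t inpIx, outIx;
      algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset,
                     sendStepOffset, nelem, postRecv, postSend, last);
      // ... issue the corresponding receive/send work for this step ...
    }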
@@ -198,40 +252,12 @@ struct ncclKernelPlan { struct ncclIntruQueue cleanupQueue; void* workBufPersistent; + struct ncclIntruQueue p2pTaskQueue; + struct ncclIntruQueue collTaskQueue; struct ncclIntruQueue proxyOpQueue; -}; -//////////////////////////////////////////////////////////////////////////////// - -struct ncclTaskColl { - struct ncclTaskColl* next; - ncclFunc_t func; - void const* sendbuff; - void* recvbuff; - size_t count; - int root; - ncclDataType_t datatype; - ncclRedOp_t opHost; - struct ncclDevRedOpFull opDev; - int chunkSteps, sliceSteps; - // Computed later: - size_t trafficBytes; - int32_t nMaxChannels:8; - int32_t nWarps:8; - int32_t algorithm:8, protocol:8; - uint32_t isCollnet:1, isNvls:1; - uint32_t devFuncId:30; - enum ncclRegBufferType regBufType; - // number of elements in planner->ipcMemQueue associated with this collective - int nCleanupQueueElts; - - void* sendMhandle; - void* recvMhandle; -}; -struct ncclTaskP2p { - struct ncclTaskP2p* next; - void* buff; - size_t bytes; + // Profiler plugin + void* groupEventHandle; }; //////////////////////////////////////////////////////////////////////////////// @@ -383,6 +409,8 @@ struct ncclComm { struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; + struct ncclProxyConnector* gproxyConn; + struct ncclIntruQueue legacyRegCleanupQueue; int netPluginLoaded; ncclNet_t* ncclNet; @@ -395,10 +423,12 @@ struct ncclComm { struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported + bool directMode; int cuMemSupport; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. + const char* commName; uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator @@ -504,7 +534,7 @@ struct ncclComm { int collNetSupport; bool collNetRegSupport; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; - int intraHighestTransportType; + bool intraNodeP2pSupport; int* collNetHeads; int collNetHeadsNum; int* collNetDenseToUserRank; @@ -519,6 +549,8 @@ struct ncclComm { struct ncclNvlsSharedRes* nvlsResources; // pools backed by comm->memPermanent + struct ncclMemoryPool memPool_ncclTaskColl; + struct ncclMemoryPool memPool_ncclTaskP2p; struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; @@ -532,6 +564,13 @@ struct ncclComm { struct ncclKernelPlanner planner; + cudaMemPool_t memPool; + // Queue of events and associated callbacks for cleaning up asynchronous work. + // Using this is preferable to using CUDA host callbacks because host callbacks + // won't allow the work following the callback to run until the callback completes, + // which comes at expense to perf. 
+ struct ncclIntruQueue eventCallbackQueue; + // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; ncclUserRedOp *userRedOps; @@ -553,6 +592,11 @@ struct ncclComm { int tunerPluginLoaded; ncclTuner_t* tuner; void *tunerContext; + + // Profiler plugin + void* profilerContext; + uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; + // buffer registration cache struct ncclRegCache regCache; uint64_t endMagic; @@ -583,6 +627,27 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) return ncclSuccess; } +inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + while (true) { + struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue); + if (cb == nullptr) break; + cudaError_t ok = cudaEventSynchronize(cb->event); + if (ok == cudaErrorNotReady) break; + ncclIntruQueueDequeue(&comm->eventCallbackQueue); + if (ok == cudaSuccess) { + NCCLCHECKGOTO(cb->fn(comm, cb), result, finish); + } else { + CUDACHECKGOTO(ok, result, finish); + } + } +finish: + cudaThreadExchangeStreamCaptureMode(&mode); + return ncclSuccess; +} + inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { int phase = comm->intraBarrierPhase; if (comm->intraRanks == 1) { diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index fd7b031..bf61326 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -13,6 +13,7 @@ // Is cuMem API usage enabled extern int ncclCuMemEnable(); +extern int ncclCuMemHostEnable(); #if CUDART_VERSION >= 11030 #include @@ -96,6 +97,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemRelease); DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle); DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess); DECLARE_CUDA_PFN_EXTERN(cuMemUnmap); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle); #if CUDA_VERSION >= 11070 DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support #endif diff --git a/src/include/device.h b/src/include/device.h index 76a909f..153b5ae 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -128,6 +128,8 @@ struct ncclConnInfo { }; struct ncclProxyConnector { + bool initialized; + int rank; int tpRank; int tpLocalRank; int sameProcess; @@ -141,6 +143,8 @@ struct ncclConnector { struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; + int sendMemSameProcess; + int recvMemSameProcess; }; struct ncclRing { @@ -225,6 +229,7 @@ struct alignas(16) ncclDevWorkP2p { uint8_t sendProtoLL:1, recvProtoLL:1; uint8_t sendRegistered:1, recvRegistered:1; + uint8_t sendIpcReg:1, recvIpcReg:1; }; // Compute the subset of the data transfer corresponding to the given part index. @@ -266,6 +271,10 @@ struct alignas(16) ncclDevWorkColl { uint32_t root; void* recvbuff; void* sendbuff; + uintptr_t sendbuffOffset; + uintptr_t recvbuffOffset; + uintptr_t* sendbuffRmtAddrs; + uintptr_t* recvbuffRmtAddrs; union { // Continuous-byte-distribution scheduling. The lo and hi channels are of // different size than the channels in the middle. 
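A hedged sketch of how the event callback queue above can defer cleanup until asynchronous GPU work finishes; the wrapper struct, the callback body, and the enqueue site are illustrative assumptions, not taken from the patch.

    // Sketch: wrap ncclCommEventCallback, record a CUDA event behind the async work,
    // and let ncclCommPollEventCallbacks() run the callback once the event completes.
    struct bufCleanupCb {
      struct ncclCommEventCallback base;   // kept as the first member so the cast below is valid
      void* buf;
    };
    static ncclResult_t freeWhenDone(struct ncclComm* comm, struct ncclCommEventCallback* cb0) {
      struct bufCleanupCb* cb = (struct bufCleanupCb*)cb0;
      CUDACHECK(cudaEventDestroy(cb0->event));
      free(cb->buf);
      free(cb);
      return ncclSuccess;
    }
    // After launching the asynchronous work on `stream` (illustrative enqueue site):
    //   CUDACHECK(cudaEventCreateWithFlags(&cb->base.event, cudaEventDisableTiming));
    //   CUDACHECK(cudaEventRecord(cb->base.event, stream));
    //   cb->base.fn = freeWhenDone;
    //   ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cb->base);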
@@ -384,6 +393,7 @@ struct ncclDevComm { int nNodes; int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; + int isNvlink; // Work fifo return credits uint32_t* workConsumed/*[MAXCHANNELS]*/; @@ -395,6 +405,7 @@ struct ncclDevComm { // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; + int* rankToLocalRank; }; struct alignas(16) ncclDevCommAndChannels { @@ -539,11 +550,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) if (coll == ncclFuncSendRecv) break; row += 1; - int nAlgos = 3; + int nAlgos = 4; if (coll == ncclFuncAllGather) { int algo1 = algo == NCCL_ALGO_RING ? 0 : algo == NCCL_ALGO_COLLNET_DIRECT ? 1 : - /*algo == NCCL_ALGO_NVLS*/ 2; + algo == NCCL_ALGO_NVLS ? 2 : + /*algo == NCCL_ALGO_PAT*/ 3; row += algo1*NCCL_NUM_PROTOCOLS + proto; break; } @@ -556,7 +568,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) } row += nAlgos*NCCL_NUM_PROTOCOLS; - nAlgos = NCCL_NUM_ALGORITHMS; + nAlgos = 6; if (coll == ncclFuncAllReduce) { row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto; break; @@ -570,11 +582,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) } row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS; - nAlgos = 3; + nAlgos = 4; if (coll == ncclFuncReduceScatter) { int algo1 = algo == NCCL_ALGO_RING ? 0 : algo == NCCL_ALGO_COLLNET_DIRECT ? 1 : - /*algo == NCCL_ALGO_NVLS*/ 2; + algo == NCCL_ALGO_NVLS ? 2 : + /*algo == NCCL_ALGO_PAT*/ 3; row += ((devRedOp*NumTypes + type)*nAlgos + algo1)*NCCL_NUM_PROTOCOLS + proto; break; } diff --git a/src/include/graph.h b/src/include/graph.h index 0271b52..b6d86b3 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -33,13 +33,14 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr); ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); -ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); +ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu); // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); @@ -76,7 +77,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); #define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct struct ncclTopoGraph { // Input / output - int id; // ring : 0, tree : 1, collnet : 2 + int id; // ring : 0, tree : 1, collnet : 2, nvls : 3, collnetDirect : 4 int pattern; int crossNic; int collNet; diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 
a0fb3a5..26851b1 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -50,7 +50,7 @@ typedef enum { ncclNumFuncs = 8 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 @@ -58,6 +58,7 @@ typedef enum { #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 3bdfec5..14b317f 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -16,20 +16,23 @@ #endif // Define all NCCL-provided static schema IDs here (avoid duplicates). -#define NVTX_SID_CommInitRank 0 -#define NVTX_SID_CommInitAll 1 -#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank -#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank -#define NVTX_SID_AllGather 4 -#define NVTX_SID_AllReduce 5 -#define NVTX_SID_Broadcast 6 -#define NVTX_SID_ReduceScatter 7 -#define NVTX_SID_Reduce 8 -#define NVTX_SID_Send 9 -#define NVTX_SID_Recv 10 +#define NVTX_SID_CommInitRank 0 +#define NVTX_SID_CommInitAll 1 +#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_AllGather 4 +#define NVTX_SID_AllReduce 5 +#define NVTX_SID_Broadcast 6 +#define NVTX_SID_ReduceScatter 7 +#define NVTX_SID_Reduce 8 +#define NVTX_SID_Send 9 +#define NVTX_SID_Recv 10 +#define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_CommSplit 13 // Define static schema ID for the reduction operation. 
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/p2p.h b/src/include/p2p.h index 5c73a6c..e49c45d 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -34,11 +34,36 @@ typedef union { // Legacy CUDA IPC cudaIpcMemHandle_t devIpc; // cuMem API support - ncclCuDesc cuDesc; + struct { + ncclCuDesc cuDesc; + CUmemGenericAllocationHandle memHandle; + }; } ncclIpcDesc; -ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); +enum ncclIpcRegType { + NCCL_IPC_SENDRECV = 0, + NCCL_IPC_COLLECTIVE = 1 +}; + +struct ncclIpcImpInfo { + void* rmtRegAddr; + bool legacyIpcCap; + uintptr_t offset; +}; + +struct ncclIpcRegInfo { + int peerRank; + void* baseAddr; + struct ncclProxyConnector* ipcProxyconn; + struct ncclIpcImpInfo impInfo; +}; + +ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr); ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); -ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); +ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); +ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut); +ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts); + +ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo); #endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 103af99..36774dc 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -4,34 +4,52 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ +#ifndef PROFILER_H_ +#define PROFILER_H_ -#include "proxy.h" +#include +#include "nccl_profiler.h" -enum ncclProxyProfileState { - ncclProxyProfileBegin = 0, +struct ncclProxyArgs; +struct ncclKernelPlan; +struct ncclTaskColl; +struct ncclTaskP2p; +struct ncclInfo; +struct ncclComm; +struct ncclProxyOp; - ncclProxyProfileSendGPUWait = 1, - ncclProxyProfileSendWait = 2, +// Plugin Init/Finalize Wrappers +ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); +ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm); - ncclProxyProfileRecvWait = 1, - ncclProxyProfileRecvFlushWait = 2, - ncclProxyProfileRecvGPUWait = 3, +// Profiler Start/Stop Group Wrappers +ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan); +ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan); - ncclProxyProfileEnd = 4, +// Profiler Start/Stop Task Events Wrappers +ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan); +ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan); - ncclProxyProfileSleep = 8, - ncclProxyProfileWakeup = 9, +// Proxy Op Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args); 
+ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args); +ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); - ncclProxyProfileIdle = 16, - ncclProxyProfileActive = 17, +// Proxy Step Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); +ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); +ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); - ncclProxyProfileAppend = 24, - ncclProxyProfileAppendEnd = 25 -}; +// Proxy Control Start/Stop Events Wrappers +ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); +ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); -ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); -void ncclProfilingDump(); +// Record Event Wrappers +ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); +ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState); +ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); + +// Profiler utility functions +ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index eab6930..a1c44d6 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -13,7 +13,7 @@ #include "ipcsocket.h" #include "nccl_net.h" #include -#include "shm.h" +#include "shmutils.h" #include "p2p.h" typedef enum : uint8_t { @@ -28,6 +28,8 @@ typedef enum : uint8_t { ncclPatternCollnetDirect, ncclPatternNvls, ncclPatternNvlsTree, + ncclPatternPatUp, + ncclPatternPatDown, ncclPatternSend, ncclPatternRecv } ncclPattern_t; @@ -72,6 +74,19 @@ struct ncclProxyOp { union ncclProxyOpSpecifics specifics; + // Profiler plugin + union { + struct ncclTaskColl* coll; + struct ncclTaskP2p* p2p; + } task; + + int eActivationMask; + void* taskEventHandle; + int rank; + int peer; + pid_t pid; + void* profilerContext; + struct ncclProxyOp *enqNext; }; @@ -100,7 +115,15 @@ struct ncclProxySubArgs { uint64_t done; uint64_t end; void* requests[NCCL_STEPS]; - void* profilingEvents[NCCL_STEPS]; + + // Profiler plugin + int eActivationMask; + int rank; + void* taskEventHandle; + void* opEventHandle; + void* stepEventHandles[NCCL_STEPS]; + size_t transSize; + void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; }; @@ -129,6 +152,10 @@ struct ncclProxyArgs { int idle; + // Profiler plugin + pid_t pid; + void* profilerContext; + // Element linking struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; @@ -261,6 +288,7 @@ struct ncclProxyState { ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; uint32_t* abortFlag; + bool directMode; // Service threads pthread_t thread; pthread_t threadUDS; @@ -281,6 +309,9 @@ struct ncclProxyState { // Progress thread struct ncclProxyProgressState progressState; + // Profiler plugin + void* profilerContext; + // Queue of expected responses from the proxy struct ncclExpectedProxyResponse* expectedResponses; }; @@ -332,8 +363,9 @@ enum ncclProxyMsgType { ncclProxyMsgAbort = 7, ncclProxyMsgStop = 8, ncclProxyMsgGetFd = 9, // cuMem API support (UDS) - 
ncclProxyMsgRegister = 10, - ncclProxyMsgDeregister = 11 + ncclProxyMsgQueryFd = 10, + ncclProxyMsgRegister = 11, + ncclProxyMsgDeregister = 12 }; // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types @@ -347,6 +379,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec // UDS support ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd); +ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd); ncclResult_t ncclProxyStop(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); diff --git a/src/include/register.h b/src/include/register.h index 9f7c83f..7c60535 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -11,7 +11,13 @@ enum { NVLS_REG_COMPLETE = 0x02, NVLS_REG_POSSIBLE = 0x04, NVLS_REG_NO_SUPPORT = 0x08, - COLLNET_REG_COMPLETE = 0x10 + COLLNET_REG_COMPLETE = 0x10, + IPC_REG_COMPLETE = 0x20 +}; + +struct ncclPeerRegIpcAddr { + uintptr_t* devPeerRmtAddrs; + uintptr_t* hostPeerRmtAddrs; }; struct ncclReg { @@ -34,7 +40,10 @@ struct ncclReg { uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ // collnet reg void* collnetHandle; - struct ncclProxyConnector* proxyconn; + struct ncclProxyConnector* collnetProxyconn; + // general ipc reg + struct ncclPeerRegIpcAddr regIpcAddrs; + struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; }; struct ncclRegCache { diff --git a/src/include/shm.h b/src/include/shm.h index 1db1666..b519e5d 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,26 +1,37 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ -#include "nccl.h" +#include "comm.h" -typedef void* ncclShmHandle_t; -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); -ncclResult_t ncclShmClose(ncclShmHandle_t handle); -ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); - -struct ncclShmemCollBuff { - volatile size_t *cnt[2]; - volatile void *ptr[2]; - int round; - size_t maxTypeSize; +struct shmLegacyIpc { + char shmSuffix[7]; + ncclShmHandle_t handle; + size_t shmSize; }; -ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); +struct shmCuIpc { + union { + CUmemFabricHandle handle; + CUmemGenericAllocationHandle data; + }; + int tpProxyRank; + void *ptr; + size_t size; +}; + +struct shmIpcDesc { + union + { + struct shmLegacyIpc shmli; + struct shmCuIpc shmci; + }; + bool legacy; +}; + +typedef struct shmIpcDesc ncclShmIpcDesc_t; + +ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); +ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc); #endif diff --git a/src/include/timer.h b/src/include/timer.h index 284fec6..e8b0ba3 100644 --- a/src/include/timer.h +++ b/src/include/timer.h @@ -33,15 +33,15 @@ static double startTimes[8]; #define TIME_START(index) do { \ counts[index]++; \ startTimes[index] = gettime(); \ -} while (0); +} while (0) #define TIME_STOP(index) do { \ times[index] += gettime() - startTimes[index]; \ -} while (0); +} while (0) #define TIME_CANCEL(index) do { \ counts[index]--; \ -} while (0); +} while (0) #define TIME_PRINT(name) do { \ printf("%s stats", name); \ @@ -50,11 +50,11 @@ static double startTimes[8]; counts[i] = 0; \ } \ printf("\n"); \ -} while (0); +} while (0) #else -#define TIME_START(index) while(0); -#define TIME_STOP(index) while(0); -#define TIME_CANCEL(index) while(0); +#define TIME_START(index) do {} while(0) +#define TIME_STOP(index) do {} while(0) +#define TIME_CANCEL(index) do {} while(0) #define TIME_PRINT(name) #endif #endif diff --git a/src/include/transport.h b/src/include/transport.h index 07fbb3e..cbeb613 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -48,9 +48,10 @@ struct ncclPeerInfo { // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; int cuMemSupport; + int version; }; -#define CONNECT_SIZE 128 +#define CONNECT_SIZE 256 struct ncclConnect { char data[CONNECT_SIZE]; }; @@ -91,7 +92,6 @@ struct ncclCollNetSharedRes { void* resources; int nChannels; size_t buffSize; - int intraHighestTransportType; }; struct ncclTransportComm { @@ -109,13 +109,14 @@ struct ncclTransportComm { struct ncclTransport { const char name[8]; - ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); + ncclResult_t (*canConnect)(int*, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); struct ncclTransportComm send; struct ncclTransportComm recv; }; ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t 
ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); +ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); @@ -127,7 +128,7 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi ncclResult_t ncclNvlsFree(struct ncclComm* comm); enum { collNetRecv=0, collNetSend=1 }; -int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect); +bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect); ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle); @@ -136,6 +137,7 @@ ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConne ncclResult_t ncclTransportRingConnect(struct ncclComm* comm); ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm); +ncclResult_t ncclTransportPatConnect(struct ncclComm* comm); ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]); ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm); diff --git a/src/include/utils.h b/src/include/utils.h index abecf22..5a1b749 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -27,7 +27,6 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id); ncclResult_t getBusId(int cudaDev, int64_t *busId); ncclResult_t getHostName(char* hostname, int maxlen, const char delim); -uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); ncclResult_t getRandomData(void* buffer, size_t bytes); diff --git a/src/init.cc b/src/init.cc index 16e02d4..94c2fb1 100644 --- a/src/init.cc +++ b/src/init.cc @@ -37,7 +37,7 @@ #endif const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; -const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" }; +const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); @@ -101,9 +101,15 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { NCCLCHECK(ncclInit()); NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); - ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out); + struct ncclBootstrapHandle handle; + NCCLCHECK(bootstrapGetUniqueId(&handle)); + // ncclUniqueId and bootstrapHandle don't have the same size and alignment + // reset to 0 to avoid undefined data + memset(out, 0, sizeof(*out)); + // copy to avoid alignment mismatch + memcpy(out, &handle, sizeof(handle)); TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); - return res; + return ncclSuccess; } 
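The zero-then-copy sequence above is the general idiom for packing a smaller, differently aligned handle into the opaque ncclUniqueId; in isolation it looks like the sketch below (illustrative names, not code from this patch):

  #include <string.h>
  // dst is larger than src: zero it first so no trailing bytes are undefined,
  // then memcpy rather than cast so no alignment is assumed.
  static void packOpaqueId(void* dst, size_t dstSize, const void* src, size_t srcSize) {
    memset(dst, 0, dstSize);
    memcpy(dst, src, srcSize);   // caller guarantees srcSize <= dstSize
  }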
// Prevent compiler from optimizing out these operations @@ -147,7 +153,7 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) { } static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) { - CUDACHECK(cudaFreeHost(dtor->obj)); + NCCLCHECK(ncclCudaHostFree(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) { @@ -180,13 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) { * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). */ if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) { - pthread_join(comm->proxyState->thread, nullptr); + PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join"); if (comm->proxyState->threadUDS) { // UDS support - pthread_join(comm->proxyState->threadUDS, nullptr);; + PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join"); } } + CUDACHECK(cudaMemPoolDestroy(comm->memPool)); + delete[] comm->userRedOps; free(comm->connectSend); @@ -244,12 +252,14 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->topParentRanks); free(comm->topParentLocalRanks); + free(comm->gproxyConn); NCCLCHECK(ncclRegCleanup(comm)); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. + NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); free(comm); @@ -328,6 +338,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); + NCCLCHECK(ncclProfilerPluginInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); if (parent && parent->config.splitShare) { @@ -393,8 +404,28 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in } ncclIntruQueueMpscConstruct(&comm->callbackQueue); + ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue); comm->regCache.pageSize = sysconf(_SC_PAGESIZE); + + do { + cudaMemPoolProps props = {}; + props.allocType = cudaMemAllocationTypePinned; + props.handleTypes = cudaMemHandleTypeNone; + props.location.type = cudaMemLocationTypeDevice; + props.location.id = comm->cudaDev; + CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props)); + uint64_t releaseThreshold = ~uint64_t(0); + CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold)); + } while (0); + + ncclIntruQueueConstruct(&comm->eventCallbackQueue); + + // setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator + comm->intraComm0 = comm; + comm->intraRank = 0; + comm->intraRanks = 1; + return ncclSuccess; } @@ -408,12 +439,16 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, 
comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; tmpCommAndChans.comm.node = comm->node; tmpCommAndChans.comm.nNodes = comm->nNodes; tmpCommAndChans.comm.abortFlag = comm->abortFlagDev; + tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo); for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; } @@ -498,10 +533,13 @@ static void showVersion() { } } +NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); + static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { info->rank = comm->rank; info->cudaDev = comm->cudaDev; info->nvmlDev = comm->nvmlDev; + NCCLCHECK(ncclGetVersion(&info->version)); info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; info->cuMemSupport = ncclCuMemEnable(); @@ -534,6 +572,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } + if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); } return ncclSuccess; @@ -677,7 +716,8 @@ static int checkMNNVL(struct ncclComm* comm) { #define TIMER_INIT_TOPO 4 #define TIMER_INIT_GRAPHS 5 #define TIMER_INIT_CONNECT 6 -#define TIMERS_INIT_COUNT 7 +#define TIMER_INIT_ALLOC 7 +#define TIMERS_INIT_COUNT 8 static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) { // We use 2 AllGathers @@ -693,7 +733,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN]; struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT]; struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS]; - struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph }; + struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph }; struct graphInfo { int pattern; @@ -722,7 +762,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; int *topParentLocalRanks = NULL; - int tpProxyRank; timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin @@ -732,6 +771,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->cuMemSupport = 1; for (int i = 0; i < nranks; i++) { + if (comm->peerInfo[i].version != comm->peerInfo[rank].version) { + WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d", + i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version); + ret = ncclInvalidUsage; + goto fail; + } if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++; if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0; if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { @@ -869,7 +914,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p collNetChainGraph->maxChannels = ringGraph->nChannels; memset(collNetDirectGraph, 0, sizeof(struct 
ncclTopoGraph)); - collNetDirectGraph->id = 2; + collNetDirectGraph->id = 4; collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT; collNetDirectGraph->collNet = 1; collNetDirectGraph->minChannels = 1; @@ -1031,18 +1076,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); comm->collNetSupport = 0; } - comm->collNetRegSupport = true; - for (int n=0; nnNodes; n++) { - if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) { - WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1); - comm->collNetSupport = 0; - break; - } - if (comm->nodeRanks[n].localRanks > 1) { - // As long as there is more than 1 rank on any node, we need to disable collnet reg - comm->collNetRegSupport = false; - } - } + // As long as there is more than 1 rank on any node, we need to disable collnet reg + comm->collNetRegSupport = (comm->maxLocalRanks == 1); } NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); @@ -1085,6 +1120,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->topParentLocalRanks = topParentLocalRanks; + NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail); // Launch proxy service thread, after this, the proxy calls can be used. if (parent && parent->config.splitShare) { comm->proxyState = parent->sharedRes->proxyState; @@ -1092,7 +1128,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } else { NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); } - + NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail); + timers[TIMER_INIT_CONNECT] = clockNano(); do { // Build p2p schedule int node = comm->node; @@ -1168,6 +1205,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Connect Trees NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); + // Connect PAT only for communicators with 1 GPU per node + if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail); + // Setup NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); @@ -1179,12 +1219,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p if (comm->collNetSupport > 0) { ncclCollNetSetup(comm, parent, graphs); NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); - NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) { + NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + } } // Connect to local net proxy - tpProxyRank = comm->topParentRanks[comm->rank]; - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); // Then to remote ones when using PXN @@ -1192,8 +1233,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int nranks; NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); for (int r=0; rtopParentRanks[pxnPeers[r]]; - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, 
fail); + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } @@ -1286,17 +1326,20 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT); #define NCCL_MAX_CGA_CLUSTER_SIZE 8 +#define NCCL_COMMINIT_FUNCNAME_LEN 128 struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; struct ncclComm* comm; struct ncclComm** newcomm; int cudaDev; // For ncclCommInitRank - int nranks, myrank; - ncclUniqueId commId; + int nranks, myrank, nId; + ncclUniqueId* commId; // for ncclCommSplit struct ncclComm* parent; int color, key; + // name of the function calling + char funcName[NCCL_COMMINIT_FUNCNAME_LEN]; }; struct ncclCommFinalizeAsyncJob { @@ -1306,30 +1349,31 @@ struct ncclCommFinalizeAsyncJob { NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); +typedef struct{ + int key; + int color; +} commSplitInfo; static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { - int* colors = NULL; - int* keys = NULL; int nRanks = 0, myRank = 0; ncclResult_t ret = ncclSuccess; - NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail); + commSplitInfo* info = NULL; + NCCLCHECKGOTO(ncclCalloc(&info, parent->nRanks), ret, fail); // Compute nRanks, my rank and the ranks (of the original comm) before and after me - colors[parent->rank] = color; - keys[parent->rank] = key; - NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail); - NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail); + info[parent->rank].color = color; + info[parent->rank].key = key; + NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, info, sizeof(commSplitInfo)), ret, fail); // Negative color does not create a new comm. Return now. 
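Packing color and key into a single commSplitInfo entry halves the number of bootstrap allgathers on the split path. The pattern in isolation, as a sketch that assumes bootstrapAllGather keeps its (state, buffer, size) signature:

  typedef struct { int key; int color; } commSplitInfo;
  // Each rank fills its own slot; one allgather then publishes both fields at once.
  static ncclResult_t gatherSplitInfo(void* bootstrap, int rank, int color, int key,
                                      commSplitInfo* all /* nRanks entries */) {
    all[rank].color = color;
    all[rank].key = key;
    return bootstrapAllGather(bootstrap, all, sizeof(commSplitInfo));
  }

As the comment above notes, a negative color means the caller opts out of the new communicator: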
if (color == NCCL_SPLIT_NOCOLOR) goto exit; memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks); for (int i = 0; i < parent->nRanks; i++) { - if (colors[i] != color) continue; + if (info[i].color != color) continue; // Find where to insert this rank int insert = 0; - while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++; + while (insert < nRanks && info[parentRanksRet[insert]].key <= info[i].key) insert++; // Shift ranks by one after insert for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1]; // Insert our rank @@ -1345,8 +1389,7 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par *myRankRet = myRank; exit: - free(colors); - free(keys); + free(info); return ret; fail: goto exit; @@ -1361,7 +1404,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; - uint64_t timers[TIMERS_INIT_COUNT]; + double sum_timers = 0; + uint64_t timers[TIMERS_INIT_COUNT] = {0}; + unsigned long long commIdHash; timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); @@ -1379,34 +1424,42 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { } timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS]; - timers[TIMER_INIT_BOOTSTRAP] = clockNano(); if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; - snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color); + timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); - NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail); + timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; + // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color + ncclUniqueId tmpId; + memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits + snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color); + comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key); + timers[TIMER_INIT_BOOTSTRAP] = clockNano(); + NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail); + timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; + // debug info, no commId was used + commIdHash = 0; } else { + timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); - NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); + timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; + // obtain a unique hash using the first commId + comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); + commIdHash = 
hashUniqueId(job->commId[0]); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); + timers[TIMER_INIT_BOOTSTRAP] = clockNano(); + NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail); + timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; } - timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; - comm->cudaArch = cudaArch; - comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); - - if (job->parent) { - INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START", - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId)); - } else { - INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); - } NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail); if (comm->tuner) { NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext)); @@ -1420,23 +1473,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { if (job->parent) { /* unlink child abort flag. */ __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE); - TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", - job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); + TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key); } else { - TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", - comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev); + // the name for the replay tool is ncclCommInitRank for all the variations + TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); } - - if (job->parent) { - INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE", - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId)); - } else { - INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); - } - INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, 
comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9, - timers[TIMER_INIT_KERNELS]/1e9, timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9, - (timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9); + sum_timers = 0.0; + for (int it = 1; it < TIMERS_INIT_COUNT; ++it) + sum_timers += (timers[it] / 1e9); + INFO(NCCL_INIT | NCCL_PROFILE, + "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, " + "connections %.2f, rest %.2f)", + job->funcName, comm->rank, comm->nRanks, + timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9, + timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9, + timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers); exit: if (job->newcomm) { /* assign it to user pointer. */ @@ -1621,17 +1676,24 @@ fail: goto exit; } -static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) { - ncclResult_t res = ncclSuccess; - ncclComm_t comm = NULL; - struct ncclCommInitRankAsyncJob *job = NULL; - const char* env = ncclGetEnv("NCCL_COMM_ID"); - if (env && myrank == 0) { - INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); - NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail); - } +static void ncclCommInitJobFree(void* _job) { + struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)_job; + free(job->commId); + free(_job); +} +static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId, ncclUniqueId* commId, int myrank, int cudaDev, ncclConfig_t *config, const char funcName[]) { + if (nId <= 0 || nId > nranks) { + WARN("improper usage of ncclCommInitRank: nId = %d, nranks=%d", nId, nranks); + return ncclInvalidArgument; + } + ncclResult_t res = ncclSuccess; + const char* commIdEnv = NULL; + ncclComm_t comm = NULL; + struct ncclCommInitRankAsyncJob* job = NULL; + // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); + if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, showVersion); @@ -1659,19 +1721,37 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni *newcomm = comm; NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); + job->nId = nId; job->comm = comm; job->nranks = nranks; - job->commId = commId; // C++ struct assignment job->myrank = myrank; job->cudaDev = cudaDev; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); + snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", funcName); + // need to copy the commIds to allow async commInit and to avoid alignement issues when casting from ncclUNiqueId and ncclBootstrapHandle + // ncclUniqueIds and ncclBootstrapHandle don't have the same alignment requirements. 
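The ID array copied here is what feeds the new ncclCommInitRankScalable entry point. A caller-side sketch of the intended usage, with myRank, nRanks, and the out-of-band exchange all assumed rather than taken from this patch (error handling elided):

  ncclComm_t comm;
  int nRoots = 4;                                  // e.g. a handful of bootstrap roots
  ncclUniqueId ids[4];
  if (myRank < nRoots) ncclGetUniqueId(&ids[myRank]);
  // ... exchange ids[0..nRoots-1] (e.g. via an MPI allgather) so every rank
  //     holds the same ordered array ...
  ncclCommInitRankScalable(&comm, nRanks, myRank, nRoots, ids, /*config=*/NULL);

On the user side these IDs are only guaranteed to be aligned as ncclUniqueId, not as the bootstrap handle NCCL casts them to.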
+ // Therefore the array of Ids coming from the user might not be properly aligned to be cast into a ncclBootstrapHandle + // copying into allocated memory guarantees that the memory is properly aligned for any objects, removing that issue + NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail); + memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES); + + commIdEnv = ncclGetEnv("NCCL_COMM_ID"); + if (commIdEnv && myrank == 0) { + INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commIdEnv); + if (nId > 1) { + INFO(NCCL_INIT | NCCL_ENV, "NCCL_COMM_ID cannot be used with more than one ncclUniqueId"); + job->nId = 1; + } + // start the bootstrap root before bootstrapping, use only the first handle + NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail); + } + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: if (comm) { free(comm->abortFlag); - if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev); + if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); free(comm->abortFlagRefCount); free(comm); } @@ -1703,7 +1783,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config)); + NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__)); return ncclSuccess; } @@ -1713,6 +1793,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { int totalnDev; int *gpuFlags = NULL; ncclConfig_t config = NCCL_CONFIG_INITIALIZER; + int oldDev = 0; constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} @@ -1722,6 +1803,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); + CUDACHECK(cudaGetDevice(&oldDev)); NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); @@ -1735,7 +1817,8 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { for (int i = 0; i < ndev; ++i) { /* invalid device check. 
*/ if (devlist[i] < 0 || devlist[i] >= totalnDev) { - ret = ncclUnhandledCudaError; + WARN("Invalid device %d (totalnDev=%d)", devlist[i], totalnDev); + ret = ncclInvalidArgument; goto fail; } @@ -1756,13 +1839,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { NCCLCHECKGOTO(ncclGroupStart(), ret, fail); for (int i=0; iconfig.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + return ret; +fail: + if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config); +ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) { + int cudaDev; + ncclResult_t ret = ncclSuccess; + ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; + ncclConfig_t *internalConfigPtr = NULL; + NCCLCHECK(ncclGroupStartInternal()); + + (void)ncclCudaLibraryInit(); + CUDACHECK(cudaGetDevice(&cudaDev)); + + NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; + NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload) + + if (config == NULL) + internalConfigPtr = &internalConfig; + else + internalConfigPtr = config; + NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, nId, commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail); exit: ncclGroupErrCheck(ret); @@ -1818,13 +1938,25 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); if (comm->initState == ncclSuccess) { - NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail); + if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->hostStream)) != ncclSuccess) { + WARN("commDestroySync: comm %p rank %d sync hostStream error %d\n", comm, comm->rank, ret); + } + if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)) != ncclSuccess) { + WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret); + } + + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. while (comm->persistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); - } + } + while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) { + struct ncclCommCallback* cb = ncclIntruQueueDequeue(&comm->legacyRegCleanupQueue); + if (cb->fn(comm, cb) != ncclSuccess) { + WARN("Legacy IPC cleanup callback failed comm %p (rank = %d) cb %p", comm, comm->rank, cb); + } + } } if ((ret = ncclProxyStop(comm)) != ncclSuccess) { @@ -1886,14 +2018,15 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { /* launch async thread to finalize comm. 
*/ NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->comm = comm; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commDestroySync, NULL, free, comm), ret, fail); exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; + if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); } return ret; fail: + free(job); if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -1940,13 +2073,15 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) { nextIntraComm = nextIntraComm->intraNext; if ((ret = commCleanup(curIntraComm)) != ncclSuccess) { + // We pass a freed pointer, but we don't dereference; we merely print its value, so it's OK. + // coverity[pass_freed_arg] WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret); } } } } - return ret; + return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); @@ -1975,12 +2110,11 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECK(ncclCommEnsureReady(comm)); NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = comm; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: return res; fail: - free(job); goto exit; } @@ -1991,15 +2125,6 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { return ncclSuccess; } - int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; - struct ncclCommFinalizeAsyncJob *job = NULL; - ncclResult_t res = ncclSuccess; - - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) - - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); - // Ask anything that might still be running on the device to quit if (comm->childAbortFlag != nullptr) { __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); @@ -2010,30 +2135,61 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. */ - ncclCommEnsureReady(comm); + (void)ncclCommEnsureReady(comm); + + // once the comm is ready, we can access ranks etc + int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + struct ncclCommFinalizeAsyncJob *job = NULL; + ncclResult_t res = ncclSuccess; + + NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; + NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) + + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = comm; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: return ncclSuccess; fail: - free(job); goto exit; } +struct NvtxParamsCommSplit { + int rank; + int nranks; + int cudaDev; + int color; + int key; +}; +constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = { + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)}, +}; + NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; + NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key}; + NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload) + + int oldDev; + CUDACHECK(cudaGetDevice(&oldDev)); + NCCLCHECK(ncclGroupStartInternal()); NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail); /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */ *newcomm = NCCL_COMM_NULL; if (color == NCCL_SPLIT_NOCOLOR) { @@ -2073,10 +2229,12 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc job->color = color; job->key = key; job->cudaDev = comm->cudaDev; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); + snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__); + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: - ncclGroupErrCheck(res); + cudaSetDevice(oldDev); + (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); return res; fail: @@ -2179,7 +2337,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; - int flag = 0; + int flag; int dcnt; int mcSupport = 0; @@ -2193,12 +2351,18 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); if (mcSupport) { + int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + // Query device to see if FABRIC handle support is available + flag = 0; + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; + if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - memprop.requestedHandleTypes = ncclCuMemHandleType; + memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; memprop.location.id = currentDev; // Query device to see if RDMA support is available - CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); + flag = 0; + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); @@ -2207,14 +2371,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { mcprop.size = size; /* device cnt is a dummy value right now, it might affect mc granularity in the future. 
*/ mcprop.numDevices = dcnt; - mcprop.handleTypes = ncclCuMemHandleType; + mcprop.handleTypes = requestedHandleTypes; mcprop.flags = 0; CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); /* only size needs to be aligned to mcGran */ ALIGN_SIZE(size, mcGran); - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { + /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ + CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); + if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { + requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; + memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + } + } else { + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + } /* Reserve a virtual address range */ CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ @@ -2234,6 +2409,9 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { fallback: #endif + // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though: + // we want CUDA to return an error to the caller. + // coverity[var_deref_model] CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); exit: @@ -2272,7 +2450,7 @@ fallback: CUDACHECKGOTO(cudaFree(ptr), ret, fail); exit: - cudaSetDevice(saveDevice); + CUDACHECK(cudaSetDevice(saveDevice)); return ret; fail: goto exit; diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 6ed5db2..b190684 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -53,6 +53,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { return ncclInvalidArgument; } + // ncclMaxRedOp < info->op will always be false due to the sizes of + // the datatypes involved, and that's by design. We keep the check though + // just as a reminder. + // coverity[result_independent_of_operands] if (info->op < 0 || ncclMaxRedOp < info->op) { WARN("%s : invalid reduction operation %d", info->opName, info->op); return ncclInvalidArgument; diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index d44c063..03e3bde 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -11,7 +11,7 @@ // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); - +NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0); // Handle type used for cuMemCreate() CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; @@ -49,6 +49,14 @@ int ncclCuMemEnable() { return param >= 0 ? 
param : (param == -2 && ncclCuMemSupported); } +int ncclCuMemHostEnable() { +#if CUDART_VERSION < 12020 + return 0; +#else + return ncclParamCuMemHostEnable(); +#endif +} + #define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 @@ -81,6 +89,7 @@ DECLARE_CUDA_PFN(cuMemRelease); DECLARE_CUDA_PFN(cuMemRetainAllocationHandle); DECLARE_CUDA_PFN(cuMemSetAccess); DECLARE_CUDA_PFN(cuMemUnmap); +DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle); /* ncclMemAlloc/Free */ DECLARE_CUDA_PFN(cuPointerGetAttribute); #if CUDA_VERSION >= 11070 @@ -107,7 +116,7 @@ bool ncclCudaLaunchBlocking = false; #if CUDART_VERSION >= 12000 #define LOAD_SYM(symbol, ignore) do { \ - cudaDriverEntryPointQueryResult driverStatus; \ + cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \ res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \ if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \ if (!ignore) { \ @@ -157,6 +166,7 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuMemRetainAllocationHandle, 1); LOAD_SYM(cuMemSetAccess, 1); LOAD_SYM(cuMemUnmap, 1); + LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1); /* ncclMemAlloc/Free */ LOAD_SYM(cuPointerGetAttribute, 1); #if CUDA_VERSION >= 11070 @@ -208,6 +218,20 @@ static void initOnceFunc() { // Determine whether we support the cuMem APIs or not ncclCuMemSupported = ncclIsCuMemSupported(); +#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030 + /* To use cuMem* for host memory allocation, we need to create context on each + * visible device. This is workaround needed in CUDA 12.3 which is fixed in 12.4. */ + if (ncclCuMemSupported && ncclCuMemHostEnable()) { + int deviceCnt, saveDevice; + cudaGetDevice(&saveDevice); + cudaGetDeviceCount(&deviceCnt); + for (int i = 0; i < deviceCnt; ++i) { + cudaSetDevice(i); + cudaFree(NULL); + } + cudaSetDevice(saveDevice); + } +#endif initResult = ret; return; error: diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index db61b31..2d17f47 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -41,6 +41,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); if (len > (sizeof(cliaddr.sun_path) - 1)) { WARN("UDS: Cannot bind provided name to socket. 
Name too large"); + close(fd); return ncclInternalError; } #ifndef USE_ABSTRACT_SOCKET @@ -66,7 +67,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v // Mark socket as non-blocking if (handle->abortFlag) { int flags; - EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(flags = fcntl(fd, F_GETFL), "fcntl"); SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); } @@ -186,20 +187,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif - TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); + TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); - if (sendFd != -1) { - TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - cmptr = CMSG_FIRSTHDR(&msg); - cmptr->cmsg_len = CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; - memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); - } + cmptr = CMSG_FIRSTHDR(&msg); + cmptr->cmsg_len = CMSG_LEN(sizeof(int)); + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); msg.msg_name = (void *)&cliaddr; msg.msg_namelen = sizeof(struct sockaddr_un); diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index a2b0be0..f441af8 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -102,6 +102,10 @@ ncclResult_t ncclNvmlEnsureInitialized() { for(Symbol sym: symbols) { *sym.ppfn = dlsym(libhandle, sym.name); } + // Coverity complains that we never dlclose this object, but that's + // deliberate, since we want the loaded object to remain in memory until + // the process terminates, so that we can use its code. 
+ // coverity[leaked_storage] } #endif diff --git a/src/misc/param.cc b/src/misc/param.cc index 2248be9..eb50cfe 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -37,7 +37,7 @@ void setEnvFile(const char* fileName) { while (line[s] != '\0' && line[s] != '=') s++; if (line[s] == '\0') continue; strncpy(envVar, line, std::min(1023,s)); - envVar[s] = '\0'; + envVar[std::min(1023,s)] = '\0'; s++; strncpy(envValue, line+s, 1023); envValue[1023]='\0'; @@ -48,17 +48,28 @@ void setEnvFile(const char* fileName) { fclose(file); } -void initEnv() { +static void initEnvFunc() { char confFilePath[1024]; - const char * userDir = userHomeDir(); - if (userDir) { - sprintf(confFilePath, "%s/.nccl.conf", userDir); + const char* userFile = getenv("NCCL_CONF_FILE"); + if (userFile && strlen(userFile) > 0) { + snprintf(confFilePath, sizeof(confFilePath), "%s", userFile); setEnvFile(confFilePath); + } else { + const char* userDir = userHomeDir(); + if (userDir) { + snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir); + setEnvFile(confFilePath); + } } - sprintf(confFilePath, "/etc/nccl.conf"); + snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf"); setEnvFile(confFilePath); } +void initEnv() { + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, initEnvFunc); +} + void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&mutex); @@ -80,8 +91,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 pthread_mutex_unlock(&mutex); } -const char *ncclGetEnv(const char *name) { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, initEnv); +const char* ncclGetEnv(const char* name) { + initEnv(); return getenv(name); } diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc index 785d616..9a4adf5 100644 --- a/src/misc/profiler.cc +++ b/src/misc/profiler.cc @@ -1,115 +1,524 @@ /************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ +#include "param.h" +#include "checks.h" +#include "comm.h" +#include "enqueue.h" +#include "utils.h" +#include "proxy.h" #include "profiler.h" -//#define PROFILE_PROXY 1 -#ifdef PROFILE_PROXY -#include "timer.h" -#include "alloc.h" +static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER; +static int profilerPluginRefCount; +static void* profilerPluginLib; +static ncclProfiler_t* ncclProfiler; -static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; -static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; -static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; -struct ncclProxyProfileEvent { - double timestamp[6]; - uint64_t opCount; - int peer; - int step; - uint16_t channel; - uint8_t type; // send / recv - uint8_t opIndex; -}; +#define MAX_STR_LEN 256 +#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1" -struct ncclProxyProfileEvent* profilingEvents = NULL; -int profilingIndex = 0; -double profilingStart = 0; -#define MAX_EVENTS 200000 - -ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { - if (profilingEvents == NULL) { - NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); - profilingStart = gettime(); +static void* tryOpenLib(char* name, int *err, char* errStr) { + if (nullptr == name || strlen(name) == 0) { + return nullptr; } - struct ncclProxyProfileEvent* event = NULL; - if (state%8 == 0) { - if (profilingIndex == MAX_EVENTS) return ncclSuccess; - args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++; - if (state == ncclProxyProfileBegin) { - // Proxy operation information - event->opCount = args->opCount; - event->channel = args->subs[sub].channelId; - event->peer = args->subs[sub].peer; - event->type = args->pattern; - event->step = step; - event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; - } else event->peer = -state; + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + + void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); + if (nullptr == handle) { + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = 0; + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; + } + } + + return handle; +} + +static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { + if (openErr == ENOENT) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; + return nameList; + } + INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr); + return nameList; +} + +static void* openProfilerPluginLib(char* couldNotFindNames, int len) { + int openErr; + void *pluginLib; + char profilerPluginLibName[PATH_MAX]; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; + + const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN"); + if (envProfilerPluginName && strlen(envProfilerPluginName)) { + snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName); + pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); + if (pluginLib) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); + return pluginLib; + } + + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); 
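For context on what openProfilerPluginLib() resolves below: the loader dlopen()s the library named by NCCL_PROFILER_PLUGIN (falling back to libnccl-profiler.so) and dlsym()s the ncclProfiler_v1 symbol. A hedged skeleton of a plugin exporting that symbol follows; the field order and signatures are assumptions inferred from the init/startEvent/stopEvent/recordEventState/finalize calls made later in this file, and the authoritative definitions live in ext-profiler/example/nccl/profiler_v1.h:

    /* Hypothetical plugin skeleton, illustrative only. Layout and signatures are
     * assumed from the ncclProfiler-> calls in src/misc/profiler.cc; consult
     * ext-profiler/example/nccl/profiler_v1.h for the real interface. */
    #include <stdlib.h>

    typedef int myResult_t;   /* stand-in for ncclResult_t */

    static myResult_t myInit(void** context, int* eActivationMask) {
      *context = NULL;
      *eActivationMask = ~0;  /* ask NCCL to emit every event type */
      return 0;
    }
    static myResult_t myStartEvent(void* context, void** eHandle, void* eDescr) {
      (void)context; (void)eDescr;
      *eHandle = malloc(1);   /* opaque per-event cookie handed back to NCCL */
      return 0;
    }
    static myResult_t myStopEvent(void* eHandle) { free(eHandle); return 0; }
    static myResult_t myRecordEventState(void* eHandle, int eState, void* args) {
      (void)eHandle; (void)eState; (void)args; return 0;
    }
    static myResult_t myFinalize(void* context) { (void)context; return 0; }

    /* The loader dlsym()s this symbol by name ("ncclProfiler_v1"). */
    struct {
      const char* name;
      myResult_t (*init)(void**, int*);
      myResult_t (*startEvent)(void*, void**, void*);
      myResult_t (*stopEvent)(void*);
      myResult_t (*recordEventState)(void*, int, void*);
      myResult_t (*finalize)(void*);
    } ncclProfiler_v1 = { "example", myInit, myStartEvent, myStopEvent, myRecordEventState, myFinalize };

Built as a shared object under a hypothetical name such as libmy-profiler.so, setting NCCL_PROFILER_PLUGIN to that name makes the env branch above load it.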
+ pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); + if (pluginLib) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); + return pluginLib; + } + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); } else { - event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; - if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; - if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; + snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so"); + pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); + if (pluginLib) { + return pluginLib; + } + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); } - // Timestamp - event->timestamp[state%8] = gettime()-profilingStart; + + return nullptr; +} + +enum { + profilerPluginLoadFailed = -1, + profilerPluginLoadReady = 0, + profilerPluginLoadSuccess = 1, +}; +static int profilerPluginStatus = profilerPluginLoadReady; +static pid_t pid; + +#define MAX_PLUGIN_LOAD 2 + +static ncclResult_t ncclProfilerPluginLoad(void) { + if (profilerPluginLoadFailed == profilerPluginStatus) { + return ncclSuccess; + } + + char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; + pthread_mutex_lock(&profilerLock); + if (profilerPluginLoadSuccess == profilerPluginStatus) { + ++profilerPluginRefCount; + goto exit; + } + + profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); + if (profilerPluginLib == nullptr) { + if (strlen(couldNotFindNames)) { + INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames); + } + goto fail; + } + + ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL); + if (ncclProfiler == nullptr) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL "."); + goto fail; + } + + ++profilerPluginRefCount; + profilerPluginStatus = profilerPluginLoadSuccess; + + // Store the pid of the process loading the profiler. + // This is attached to the proxyOp event descriptor + // so the plugin can figure out if the parent event + // is in the same address space or not + pid = getpid(); + +exit: + pthread_mutex_unlock(&profilerLock); + return ncclSuccess; +fail: + if (profilerPluginLib) dlclose(profilerPluginLib); + profilerPluginStatus = profilerPluginLoadFailed; + goto exit; +} + +static ncclResult_t ncclProfilerPluginUnload(void) { + pthread_mutex_lock(&profilerLock); + if (0 == (--profilerPluginRefCount)) { + INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); + dlclose(profilerPluginLib); + profilerPluginLib = nullptr; + ncclProfiler = nullptr; + profilerPluginStatus = profilerPluginLoadReady; + } + pthread_mutex_unlock(&profilerLock); return ncclSuccess; } -void ncclProfilingDump() { - static int dumpDone = 0; - if (dumpDone) return; - dumpDone = 1; - const char* str = ncclGetEnv("NCCL_PROXY_PROFILE"); - if (!str) { free(profilingEvents); return; } - FILE* f = fopen(str, "w"); - fprintf(f, "[\n"); +#define ENABLE_TIMER 0 +#include "timer.h" - for (int i=0; ipeer >= 0; - const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? 
"Send" : "Recv") : - profilingEventStr[-(e->peer/8)]; +#if ENABLE_TIMER +static int64_t elapsedCount; +static int64_t initCount, finalizeCount; +static int64_t groupStartCount, groupStopCount; +static int64_t taskStartCount, taskStopCount; +static int64_t proxyOpStartCount, proxyOpStopCount; +static int64_t proxyStepStartCount, proxyStepStopCount; +static int64_t proxyCtrlStartCount, proxyCtrlStopCount; +static int64_t proxyOpRecordCount, proxyStepRecordCount, proxyCtrlRecordCount; + +static double elapsedTs[2]; +static double initTs[2], finalizeTs[2]; +static double groupStartTs[2], groupStopTs[2]; +static double taskStartTs[2], taskStopTs[2]; +static double proxyOpStartTs[2], proxyOpStopTs[2]; +static double proxyStepStartTs[2], proxyStepStopTs[2]; +static double proxyCtrlStartTs[2], proxyCtrlStopTs[2]; +static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2]; + +#define TIME_START_EVENT(event) do { \ + (event ## Count)++; \ + (event ## Ts)[0] = gettime(); \ +} while(0) + +#define TIME_STOP_EVENT(event) do { \ + double val = gettime() - (event ## Ts)[0]; \ + (event ## Ts)[1] += val; \ +} while(0) + +#define TIME_PRINT_EVENTS(name) do { \ + printf("%s ", name); \ + if (elapsedCount) printf("[elapsed] %g/%ld = %g ", elapsedTs[1], elapsedCount, elapsedTs[1]/elapsedCount); \ + if (initCount) printf("[init] %g/%ld = %g ", initTs[1], initCount, initTs[1]/initCount); \ + if (finalizeCount) printf("[finalize] %g/%ld = %g ", finalizeTs[1], finalizeCount, finalizeTs[1]/finalizeCount); \ + if (groupStartCount) printf("[groupStart] %g/%ld = %g ", groupStartTs[1], groupStartCount, groupStartTs[1]/groupStartCount); \ + if (groupStopCount) printf("[groupStop] %g/%ld = %g ", groupStopTs[1], groupStopCount, groupStopTs[1]/groupStopCount); \ + if (taskStartCount) printf("[taskStart] %g/%ld = %g ", taskStartTs[1], taskStartCount, taskStartTs[1]/taskStartCount); \ + if (taskStopCount) printf("[taskStop] %g/%ld = %g ", taskStopTs[1], taskStopCount, taskStopTs[1]/taskStopCount); \ + if (proxyOpStartCount) printf("[proxyOpStart] %g/%ld = %g ", proxyOpStartTs[1], proxyOpStartCount, proxyOpStartTs[1]/proxyOpStartCount); \ + if (proxyOpStopCount) printf("[proxyOpStop] %g/%ld = %g ", proxyOpStopTs[1], proxyOpStopCount, proxyOpStopTs[1]/proxyOpStopCount); \ + if (proxyStepStartCount) printf("[proxyStepStart] %g/%ld = %g ", proxyStepStartTs[1], proxyStepStartCount, proxyStepStartTs[1]/proxyStepStartCount); \ + if (proxyStepStopCount) printf("[proxyStepStop] %g/%ld = %g ", proxyStepStopTs[1], proxyStepStopCount, proxyStepStopTs[1]/proxyStepStopCount); \ + if (proxyCtrlStartCount) printf("[proxyCtrlStart] %g/%ld = %g ", proxyCtrlStartTs[1], proxyCtrlStartCount, proxyCtrlStartTs[1]/proxyCtrlStartCount); \ + if (proxyCtrlStopCount) printf("[proxyCtrlStop] %g/%ld = %g ", proxyCtrlStopTs[1], proxyCtrlStopCount, proxyCtrlStopTs[1]/proxyCtrlStopCount); \ + if (proxyOpRecordCount) printf("[proxyOpRecord] %g/%ld = %g ", proxyOpRecordTs[1], proxyOpRecordCount, proxyOpRecordTs[1]/proxyOpRecordCount); \ + if (proxyStepRecordCount) printf("[proxyStepRecord] %g/%ld = %g ", proxyStepRecordTs[1], proxyStepRecordCount, proxyStepRecordTs[1]/proxyStepRecordCount); \ + if (proxyCtrlRecordCount) printf("[proxyCtrlRecord] %g/%ld = %g", proxyCtrlRecordTs[1], proxyCtrlRecordCount, proxyCtrlRecordTs[1]/proxyCtrlRecordCount); \ + printf("\n"); \ +} while(0) +#else +#define TIME_START_EVENT(event) do {} while(0) +#define TIME_STOP_EVENT(event) do {} while(0) +#define TIME_PRINT_EVENTS(name) do {} while(0) +#endif - if 
(sendrecv) { - int state = ncclProxyProfileBegin; - const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr; - fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", - typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); +static int eActivationMask; // Set by profiler +static int eActivationMaskGroup; // Cached for current group - while (statetimestamp[state]) { - const char* name = stateStr[state]; - fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", - name, i, e->channel, e->timestamp[state]); - state++; - while (e->timestamp[state] == 0) state++; - fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", - name, i, e->channel, e->timestamp[state]); - } - } - - fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", - typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); - } else { - if (e->peer == -ncclProxyProfileAppend) { - fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n", - typeStr, i, e->timestamp[0], e->opCount); - } else { - fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", - typeStr, i, e->timestamp[0]); - } - fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", - typeStr, i, e->timestamp[1]); +ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { + TIME_START_EVENT(elapsed); + TIME_START_EVENT(init); + ncclProfilerPluginLoad(); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask); + if (err) { + WARN("Profiler init failed with error (%d). 
Continue without profiler.", err); + ncclProfiler = NULL; } } - fprintf(f, "{} ]\n"); - fclose(f); - free(profilingEvents); + TIME_STOP_EVENT(init); + return ncclSuccess; +} + +ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { + TIME_START_EVENT(finalize); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + ncclProfiler->finalize(comm->profilerContext); + } + ncclProfilerPluginUnload(); + TIME_STOP_EVENT(finalize); + TIME_STOP_EVENT(elapsed); + TIME_PRINT_EVENTS("Profiler"); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { + TIME_START_EVENT(groupStart); + eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) { + ncclProfilerEventDescr_v1_t eDescr = { 0 }; + eDescr.type = ncclProfileGroup; + ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr); + } + } + TIME_STOP_EVENT(groupStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) { + TIME_START_EVENT(groupStop); + if (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle) { + ncclProfiler->stopEvent(plan->groupEventHandle); + } + TIME_STOP_EVENT(groupStop); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { + TIME_START_EVENT(taskStart); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); + if (plan->groupEventHandle && enable) { + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileColl; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.coll.name = plan->comm->commName; + eDescr.coll.commHash = plan->comm->commHash; + eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++; + eDescr.coll.func = ct->func; + eDescr.coll.sendBuff = ct->sendbuff; + eDescr.coll.recvBuff = ct->recvbuff; + eDescr.coll.count = ct->count; + eDescr.coll.root = ct->root; + eDescr.coll.datatype = ct->datatype; + eDescr.coll.op = ct->opHost; + eDescr.coll.trafficBytes = ct->trafficBytes; + eDescr.coll.nMaxChannels = ct->nMaxChannels; + eDescr.coll.nWarps = ct->nWarps; + eDescr.coll.algo = ct->algorithm; + eDescr.coll.proto = ct->protocol; + eDescr.coll.isCollnet = ct->isCollnet; + eDescr.coll.isNvls = ct->isNvls; + ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); + + // update collective task with group event activation mask + ct->eActivationMask = eActivationMaskGroup; + ct = ct->next; + } + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2p; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.p2p.name = plan->comm->commName; + eDescr.p2p.commHash = plan->comm->commHash; + eDescr.p2p.func = pt->func; + eDescr.p2p.buff = pt->buff; + eDescr.p2p.count = pt->count; + eDescr.p2p.datatype = pt->datatype; + eDescr.p2p.peer = pt->root; + ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); + + // update collective task with group event activation mask + pt->eActivationMask = eActivationMaskGroup; + pt = pt->next; + } + } + } + 
TIME_STOP_EVENT(taskStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { + TIME_START_EVENT(taskStop); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); + if (plan->groupEventHandle && enable) { + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + ncclProfiler->stopEvent(ct->eventHandle); + ct = ct->next; + } + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt) { + ncclProfiler->stopEvent(pt->eventHandle); + pt = pt->next; + } + } + } + TIME_STOP_EVENT(taskStop); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) { + TIME_START_EVENT(proxyOpStart); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyOp; + eDescr.parentObj = sub->taskEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyOp.pid = args->pid; + eDescr.proxyOp.channelId = sub->channelId; + eDescr.proxyOp.peer = sub->peer; + eDescr.proxyOp.nSteps = sub->nsteps; + eDescr.proxyOp.chunkSize = args->chunkSize; + eDescr.proxyOp.isSend = 1; + ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr); + } + } + TIME_STOP_EVENT(proxyOpStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) { + TIME_START_EVENT(proxyOpStart); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyOp; + eDescr.parentObj = sub->taskEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyOp.pid = args->pid; + eDescr.proxyOp.channelId = sub->channelId; + eDescr.proxyOp.peer = sub->peer; + eDescr.proxyOp.nSteps = sub->nsteps; + eDescr.proxyOp.chunkSize = args->chunkSize; + eDescr.proxyOp.isSend = 0; + ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr); + } + } + TIME_STOP_EVENT(proxyOpStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) { + TIME_START_EVENT(proxyOpStop); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { + ncclProfiler->stopEvent(sub->opEventHandle); + sub->opEventHandle = NULL; + } + TIME_STOP_EVENT(proxyOpStop); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { + TIME_START_EVENT(proxyStepStart); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + for (uint64_t step = stepLo; step < stepHi; step++) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyStep; + eDescr.parentObj = sub->opEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyStep.step = step; + ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr); + } + } + } + TIME_STOP_EVENT(proxyStepStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct 
ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { + TIME_START_EVENT(proxyStepStart); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + for (uint64_t step = stepLo; step < stepHi; step++) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyStep; + eDescr.parentObj = sub->opEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyStep.step = step; + ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr); + } + } + } + TIME_STOP_EVENT(proxyStepStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { + TIME_START_EVENT(proxyStepStop); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + for (uint64_t step = stepLo; step < stepHi; step++) { + if (sub->stepEventHandles[step%NCCL_STEPS]) { + ncclProfiler->stopEvent(sub->stepEventHandles[step%NCCL_STEPS]); + sub->stepEventHandles[step%NCCL_STEPS] = NULL; + } + } + } + TIME_STOP_EVENT(proxyStepStop); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle) { + TIME_START_EVENT(proxyCtrlStart); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + // for proxy control events we allow profiling mode to change on a per event basis + int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); + if (eActivationMaskProxy & ncclProfileProxyCtrl) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyCtrl; + ncclProfiler->startEvent(profilerContext, eHandle, &eDescr); + TIME_STOP_EVENT(proxyCtrlStart); + return ncclSuccess; + } + } + *eHandle = NULL; + TIME_STOP_EVENT(proxyCtrlStart); + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) { + TIME_START_EVENT(proxyCtrlStop); + if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle) { + ncclProfiler->stopEvent(eHandle); + } + TIME_STOP_EVENT(proxyCtrlStop); + return ncclSuccess; +} + +ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) { + TIME_START_EVENT(proxyOpRecord); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { + ncclProfilerEventStateArgs_t a = { 0 }; + a.proxyOp.steps = steps; + a.proxyOp.transSize = transSize; + ncclProfiler->recordEventState(sub->opEventHandle, eState, &a); + } + TIME_STOP_EVENT(proxyOpRecord); + return ncclSuccess; +} + +ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) { + TIME_START_EVENT(proxyStepRecord); + struct ncclProxySubArgs* sub = &args->subs[s]; + if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { + for (uint64_t step = stepLo; step < stepHi; step++) { + if (sub->stepEventHandles[step%NCCL_STEPS]) { + ncclProfiler->recordEventState(sub->stepEventHandles[step%NCCL_STEPS], eState, 0); + } + } + } + TIME_STOP_EVENT(proxyStepRecord); + return ncclSuccess; +} + +ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) { + TIME_START_EVENT(proxyCtrlRecord); + if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, 
__ATOMIC_RELAXED) & ncclProfileProxyCtrl) { + ncclProfilerEventStateArgs_t args = { 0 }; + args.proxyCtrl.appendedProxyOps = appended; + ncclProfiler->recordEventState(eHandle, eState, &args); + } + TIME_STOP_EVENT(proxyCtrlRecord); + return ncclSuccess; +} + +ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) { + op->pid = pid; + return ncclSuccess; } -#else -ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } -void ncclProfilingDump() {} -#endif diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index a481643..daf3b33 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "shm.h" +#include "shmutils.h" #include "comm.h" #include "checks.h" #include @@ -75,7 +75,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de goto fail; } } else { - SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail); + SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail); } retry_fallocate: @@ -90,7 +90,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de } INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath); } else { - SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail); + SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail); } hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); @@ -114,7 +114,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de } if (devShmPtr) { - CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail); + CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail); CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail); } @@ -129,7 +129,7 @@ fail: shmPath, shmSize, strerror(errno), errno); if (tmphandle) { shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); - ncclShmClose((ncclShmHandle_t)tmphandle); + (void)ncclShmClose((ncclShmHandle_t)tmphandle); tmphandle = NULL; } hptr = NULL; @@ -182,7 +182,7 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) { ncclResult_t ret = ncclSuccess; - int curRound = shmem->round; + int curRound; size_t mycnt; if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) { @@ -190,6 +190,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff goto exit; } + curRound = shmem->round; memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize); /* sync among local ranks */ mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL); diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 9ade0e4..93e577e 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -284,6 +284,7 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char sin6.sin6_scope_id = 0; // should be global scope, set to 0 } else { WARN("Net : unsupported IP family"); + freeaddrinfo(p); return ncclInvalidArgument; } @@ -408,7 +409,7 @@ ncclResult_t ncclSocketGetAddr(struct ncclSocket* 
sock, union ncclSocketAddress* static ncclResult_t socketTryAccept(struct ncclSocket* sock) { socklen_t socklen = sizeof(union ncclSocketAddress); - sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen); + sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen); if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { @@ -501,8 +502,9 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { } else if (ret < 0) { WARN("socketPollConnect poll() failed with error %s", strerror(errno)); return ncclRemoteError; - } else { - EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0); + } else if (ret != 1 || (pfd.revents & POLLOUT) == 0) { + WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? "" : ", no POLLOUT events"); + return ncclSystemError; } /* check socket status */ @@ -734,13 +736,17 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad /* Set socket as non-blocking if async or if we need to be able to abort */ if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { int flags; - EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail); - SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail); + SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail); + SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail); } exit: return ret; fail: + if (sock->fd != -1) { + close(sock->fd); + sock->fd = -1; + } goto exit; } diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc index 608062b..f1a9756 100644 --- a/src/misc/tuner.cc +++ b/src/misc/tuner.cc @@ -77,6 +77,8 @@ static void* tryOpenLib(const char* name, int* err, char* errStr) { if (nullptr == handle) { strncpy(errStr, dlerror(), MAX_STR_LEN); errStr[MAX_STR_LEN] = '\0'; + // "handle" and "name" won't be NULL at the same time. + // coverity[var_deref_model] if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { *err = ENOENT; } diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 12504bc..bb59947 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -65,15 +65,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { return ncclSuccess; } -uint64_t getHash(const char* string, int n) { - // Based on DJB2a, result = result * 33 ^ char - uint64_t result = 5381; - for (int c = 0; c < n; c++) { - result = ((result << 5) + result) ^ string[c]; - } - return result; -} - +static uint64_t hostHashValue = 0; /* Generate a hash of the unique identifying string for this host * that will be unique for both bare-metal and container instances * Equivalent of a hash of; @@ -83,7 +75,7 @@ uint64_t getHash(const char* string, int n) { * This string can be overridden by using the NCCL_HOSTID env var. 
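Both the initEnv() rework in param.cc above and the getHostHash() change below move one-time setup behind pthread_once, so the work runs exactly once even with concurrent callers and later calls simply return the cached value. A minimal sketch of the idiom; cachedValue and computeValueOnce() are illustrative, not the real symbols:

    #include <pthread.h>
    #include <stdint.h>

    /* Once-only initialization sketch, mirroring the pattern used by
     * initEnv() and getHostHash(); names are illustrative. */
    static uint64_t cachedValue;
    static void computeValueOnce(void) { cachedValue = 42; /* stand-in for the real work */ }

    uint64_t getValue(void) {
      static pthread_once_t once = PTHREAD_ONCE_INIT;
      pthread_once(&once, computeValueOnce);  /* runs computeValueOnce() at most once */
      return cachedValue;
    }

This is also why ncclGetEnv() can now call initEnv() directly on every invocation: the pthread_once inside makes the call idempotent.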
*/ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" -uint64_t getHostHash(void) { +static void getHostHashOnce() { char hostHash[1024]; const char *hostId; @@ -103,8 +95,8 @@ uint64_t getHostHash(void) { strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); free(p); } + fclose(file); } - fclose(file); } // Make sure the string is terminated @@ -112,7 +104,12 @@ uint64_t getHostHash(void) { TRACE(NCCL_INIT,"unique hostname '%s'", hostHash); - return getHash(hostHash, strlen(hostHash)); + hostHashValue = getHash(hostHash, strlen(hostHash)); +} +uint64_t getHostHash(void) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, getHostHashOnce); + return hostHashValue; } /* Generate a hash of the unique identifying string for this process diff --git a/src/nccl.h.in b/src/nccl.h.in index 9efdf9f..431ecb5 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -168,6 +168,13 @@ ncclResult_t pncclCommAbort(ncclComm_t comm); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig. + * Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation. + * The number of ncclUniqueIds and their order must be the same for every rank. + */ +ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config); +ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config); + /* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); diff --git a/src/net.cc b/src/net.cc index 0f5d336..97a8c73 100644 --- a/src/net.cc +++ b/src/net.cc @@ -355,6 +355,8 @@ static void* tryOpenLib(char* name, int* err, char* errStr) { if (nullptr == handle) { strncpy(errStr, dlerror(), MAX_STR_LEN); errStr[MAX_STR_LEN] = '\0'; + // "handle" and "name" won't be NULL at the same time. 
+ // coverity[var_deref_model] if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { *err = ENOENT; } @@ -422,11 +424,10 @@ static int netPluginStatus = netPluginLoadReady; ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - if (netPluginLoadFailed == netPluginStatus) { - return ncclSuccess; - } - pthread_mutex_lock(&netPluginLock); + if (netPluginLoadFailed == netPluginStatus) { + goto exit; + } if (netPluginLoadSuccess == netPluginStatus) { ++netPluginRefCount; goto exit; diff --git a/src/proxy.cc b/src/proxy.cc index eef71a5..5e657c0 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -8,18 +8,21 @@ #include "info.h" #include "collectives.h" #include "socket.h" -#include "shm.h" +#include "shmutils.h" #include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" +#include "profiler.h" #include "transport.h" #include #include #include #include +#include enum { proxyRecv=0, proxySend=1 }; +void* ncclProxyServiceUDS(void* _args); static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; @@ -67,8 +70,10 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi return ncclInternalError; } - memcpy(elem->respBuff, respBuff, respSize); - free(respBuff); + if (respSize > 0) { + memcpy(elem->respBuff, respBuff, respSize); + free(respBuff); + } elem->done = true; elem->res = res; return ncclSuccess; @@ -360,12 +365,17 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->nsteps = op->nsteps; sub->nbytes = op->nbytes; sub->offset = 0; - sub->peer = op->root; + sub->peer = op->peer; sub->reg = op->reg; sub->sendMhandle = op->sendMhandle; sub->recvMhandle = op->recvMhandle; sub->sendbuff = op->sendbuff; sub->recvbuff = op->recvbuff; + sub->eActivationMask = op->eActivationMask; + sub->taskEventHandle = op->taskEventHandle; + sub->rank = op->rank; + args->pid = op->pid; + args->profilerContext = op->profilerContext; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || @@ -527,6 +537,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel if (justInquire) *justInquire = true; else { + op->peer = peer; NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op)); } return ncclSuccess; @@ -588,6 +599,64 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire)); } break; + case ncclPatternPatUp: { + // Run full algorithm to count the number of steps for each peer. 
+ int *nstepsSend, *nstepsRecv; + const int rank = comm->rank, nranks = comm->nRanks; + NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks))); + NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks))); + const ssize_t size = op->nbytes/comm->nRanks; + PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int last = 0; + while (last == 0) { + int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; + size_t inpIx, outIx; + algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); + if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; + if (sendDim != -1 && postSend) nstepsSend[sendDim]++; + } + for (int i=0; insteps = nstepsSend[i]; + NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire)); + } + if (nstepsRecv[i]) { + int recvPeer = (rank - (1<nsteps = nstepsRecv[i]; + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire)); + } + } + } break; + case ncclPatternPatDown: { + // Run full algorithm to count the number of steps for each peer. + int *nstepsSend, *nstepsRecv; + const int rank = comm->rank, nranks = comm->nRanks; + NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks))); + NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks))); + const ssize_t size = op->nbytes/comm->nRanks; + PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int last = 0; + while (last == 0) { + int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; + size_t inpIx, outIx; + algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); + if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; + if (sendDim != -1 && postSend) nstepsSend[sendDim]++; + } + for (int i=0; insteps = nstepsSend[i]; + NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire)); + } + if (nstepsRecv[i]) { + int recvPeer = (rank + (1<nsteps = nstepsRecv[i]; + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire)); + } + } + } break; case ncclPatternSend: case ncclPatternRecv: { if (op->root == comm->rank) return ncclSuccess; @@ -657,9 +726,9 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int if (state->opsPool == NULL) return ncclInternalError; struct ncclProxyOpsPool* pool = state->opsPool; - struct ncclProxyArgs profArgs; // Only used for profiling purposes if (state->nextOps != -1) goto process_nextops; + void* eHandle; // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock // to be available. Exit, continue progress, and come back later. 
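The ncclPatternPatUp/PatDown cases above dry-run the PAT schedule on the host via getNextOp() purely to count how many network steps each peer dimension needs, then call SaveProxy() once per active peer. Peers sit at power-of-two distances, which is why nstepsSend/nstepsRecv are sized log2Up(nranks). A small, self-contained illustration of the peer selection; the +/- direction per pattern is an assumption taken from the recvPeer computations visible above, so treat the signs as illustrative:

    /* Illustrative only: which peers a PAT-style schedule touches for a given
     * rank, with power-of-two strides up to log2Up(nranks). */
    #include <stdio.h>

    static int log2UpLocal(int n) { int l = 0; while ((1 << l) < n) l++; return l; }

    int main(void) {
      int nranks = 8, rank = 3;
      for (int i = 0; i < log2UpLocal(nranks); i++) {
        int up   = (rank + (1 << i)) % nranks;           /* peer at distance +2^i */
        int down = (rank - (1 << i) + nranks) % nranks;  /* peer at distance -2^i */
        printf("dim %d: rank %d up -> %d, down -> %d\n", i, rank, up, down);
      }
      return 0;
    }

For nranks = 8 this yields three dimensions (strides 1, 2, 4), matching the logarithmic number of peers the counting loop above iterates over.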
if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess; @@ -667,10 +736,11 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int if (state->active == NULL) { pthread_mutex_lock(&pool->mutex); while (pool->nextOps == -1 && !state->stop) { - struct ncclProxyArgs profArgs; // Only used for profiling purposes - ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep); + ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); + ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep); pthread_cond_wait(&pool->cond, &pool->mutex); - ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup); + ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup); + ncclProfilerStopProxyCtrlEvent(eHandle); } if (state->stop) { // We might have been woken up to stop. pthread_mutex_unlock(&pool->mutex); @@ -684,7 +754,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int if (state->nextOps == -1) return ncclInternalError; process_nextops: - ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend); + ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); + ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend); TIME_START(2); int freeOp[NCCL_MAX_LOCAL_RANKS]; int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; @@ -720,6 +791,10 @@ process_nextops: if (freeOp[i] == -1) continue; int newFree = freeOp[i]; int oldFree = pool->freeOps[i]; + // Coverity gets confused by the complex code structure here. The previous "for" loop ensures that freeOpEnd[i] + // is initialized so long as freeOp[i] is initialized (is not -1). In the current loop we filter out uninitialized + // freeOp[i], hence ensuring that freeOpEnd[i] is also initialized. + // coverity[uninit_use:FALSE] pool->ops[freeOpEnd[i]].next = oldFree; if (oldFree == -1) { // Nothing for the main thread to consume, we can set it. 
@@ -735,8 +810,8 @@ process_nextops: } } } - profArgs.opCount = *added; - ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd); + ncclProfilerRecordProxyCtrlEventState(eHandle, *added, ncclProfilerProxyCtrlAppendEnd); + ncclProfilerStopProxyCtrlEvent(eHandle); TIME_STOP(2); return ncclSuccess; } @@ -758,6 +833,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) { if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { WARN("Unable to create thread context due to old driver, disabling."); createThreadContext = 0; + goto exit; } } } @@ -767,15 +843,17 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) { NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) { WARN("Failed to create CUDA context on device %d", proxyState->cudaDev); createThreadContext = 0; + goto exit; } } else { if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) { WARN("Failed to set CUDA context on device %d", proxyState->cudaDev); - return 0; + goto exit; } - return 1; } + return 1; } +exit: #endif return 0; } @@ -787,12 +865,14 @@ NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8); void* ncclProxyProgress(void *proxyState_) { struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_; if (setProxyThreadContext(proxyState)) { - INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev); + INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev); } // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + struct ncclProxyProgressState* state = &proxyState->progressState; state->nextOps = -1; const int sig = ncclParamProxyDumpSignal(); @@ -809,9 +889,7 @@ void* ncclProxyProgress(void *proxyState_) { * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. 
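The proxyOpAppendCounter comment above describes a simple throttle: the costly ncclProxyGetPostedOps() call happens only when the progress loop is idle, or once every NCCL_PROGRESS_APPENDOP_FREQ busy iterations (default 8, per the NCCL_PARAM above). A self-contained sketch of that idiom, with hypothetical stand-ins for the real calls:

    #include <stdio.h>

    static int  doSomeProgress(void) { return 0; }      /* hypothetical: 1 = idle this pass */
    static void pollExpensive(void)  { puts("poll"); }  /* stand-in for the costly poll */

    /* Throttle sketch: poll when idle, or once every `freq` busy iterations. */
    static void progressLoop(int freq, int iterations) {
      int counter = 0;
      for (int it = 0; it < iterations; it++) {
        int idle = doSomeProgress();
        if (idle || ++counter == freq) {
          counter = 0;
          pollExpensive();
        }
      }
    }

    int main(void) { progressLoop(8, 32); return 0; }

With freq = 8 and a continuously busy loop, the poll runs once per eight iterations instead of on every pass, which is the reduction in ncclProxyGetPostedOps() frequency the comment refers to.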
*/ int proxyOpAppendCounter = 0; - struct ncclProxyArgs profArgs; // Only used for profiling purposes - while ((state->stop == 0 || (state->stop == 1 && state->active)) && - __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) { + while (state->stop == 0 || (state->stop == 1 && state->active)) { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { @@ -819,8 +897,11 @@ void* ncclProxyProgress(void *proxyState_) { INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); continue; } - if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle); - if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive); + void* eHandle; + ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); + if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle); + if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); + ncclProfilerStopProxyCtrlEvent(eHandle); if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; @@ -860,7 +941,7 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) { static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (!state->thread) { - pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState); + PTHREADCHECK(pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState), "pthread_create"); ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks); } return ncclSuccess; @@ -875,7 +956,7 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) { state->stop = 1; pthread_cond_signal(&state->opsPool->cond); pthread_mutex_unlock(&state->opsPool->mutex); - pthread_join(state->thread, NULL); + PTHREADCHECK(pthread_join(state->thread, NULL), "pthread_join"); } // Free off any memory allocated for the proxy arg pools @@ -885,7 +966,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) { state->pools = next; } - ncclProfilingDump(); TIME_PRINT("Proxy"); return ncclSuccess; } @@ -962,23 +1042,17 @@ struct ncclProxyInitResp { char devShmPath[6]; // "XXXXXX" - May or may not be set }; -ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) { +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn) { struct ncclSocket* sock; - int ready, proxyRank = -1; + int ready; struct ncclProxyState* sharedProxyState = comm->proxyState; + int tpProxyRank = comm->topParentRanks[proxyRank]; - // Keep one connection per local rank - for (int i = 0; i < comm->localRanks; ++i) { - /* find the proxy rank in comm. */ - if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) { - proxyRank = comm->localRankToRank[i]; - break; - } - } proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; + proxyConn->rank = proxyRank; if (sharedProxyState->peerSocks == NULL) { NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks)); NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks)); @@ -1020,68 +1094,93 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } + proxyConn->initialized = true; INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection); return ncclSuccess; } // UDS support -ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) { +ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int* reqFd, int *respFd) { ncclResult_t res = ncclSuccess; struct ncclIpcSocket ipcSock = { 0 }; void *opId; NCCLCHECK(getRandomData(&opId, sizeof(opId))); + int reqFdtmp = -1; int rank = comm->topParentLocalRanks[comm->localRank]; struct ncclProxyState* sharedProxyState = comm->proxyState; - uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank]; + uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p", - comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId); + comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, respFd, opId); // cuMem: Create a UDS socket to receive the response NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag)); + if (reqFd) { + reqFdtmp = *reqFd; + } else { + // give a dummy fd for the other side of UDS socket + NCCLCHECK(ncclIpcSocketGetFd(&ipcSock, &reqFdtmp)); + } + ncclIpcHdr hdr; hdr.type = type; hdr.rank = rank; hdr.reqSize = reqSize; hdr.respSize = respSize; hdr.opId = opId; + assert(reqSize <= sizeof(hdr.data)); memcpy(&hdr.data, reqBuff, reqSize); - NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error); + NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), reqFdtmp, proxyConn->tpRank, pidHash), res, error); NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error); NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error); INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE", - comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId); + comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, (respFd ? 
*respFd : -1), opId); return res; error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); - WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res); + WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", proxyConn->tpRank, pidHash, res); return res; } // cuMem API support // The request/response is sent out-of-band using ncclIpcSocket for this specific command -ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) { +ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int proxyRank, void *handle, int* convertedFd) { ncclResult_t ret = ncclSuccess; // Request the allocation of a UDS fd for the handle - NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error); + if (comm->gproxyConn[proxyRank].initialized == false) { + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, proxyRank, &comm->gproxyConn[proxyRank]), ret, error); + } + NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, NULL, convertedFd), ret, error); // We have now received the converted fd over UDS - INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd); + INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d sameProcess %d", *(uint64_t*)handle, comm->topParentRanks[proxyRank], *convertedFd, comm->gproxyConn[proxyRank].sameProcess); return ret; error: - WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret); + WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", comm->topParentRanks[proxyRank], *(uint64_t*)handle, ret); return ret; } +ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd) { + ncclResult_t ret = ncclSuccess; + NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, proxyConn, ncclProxyMsgQueryFd, NULL, 0, (void*)rmtFd, sizeof(int), &localFd, NULL), ret, fail); +exit: + // We have now received the converted fd over UDS + INFO(NCCL_PROXY, "UDS: ClientQueryFd localFd %d tpRank %d remote fd %d sameProcess %d", localFd, proxyConn->tpRank, *rmtFd, proxyConn->sameProcess); + return ret; +fail: + WARN("ncclProxyClientQueryFdBlocking call to tpRank %d localFd %d failed : %d", proxyConn->tpRank, localFd, ret); + goto exit; +} + const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; @@ -1091,7 +1190,6 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector if (sharedProxyState->peerSocks == NULL) return ncclInternalError; sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; - if (sock == NULL) return ncclInternalError; NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); @@ -1267,6 +1365,22 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr return ncclSuccess; } +static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, void *opId, int rmtFd) { +#if 
CUDART_VERSION >= 11030 + struct ncclIpcSocket ipcSock = { 0 }; + uint64_t hash = (uint64_t) opId; + ncclResult_t ret = ncclSuccess; + + NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit); + NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit); +exit: + NCCLCHECK(ncclIpcSocketClose(&ipcSock)); + return ncclSuccess; +#else + return ncclInternalError; +#endif +} + // cuMem API support static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) { #if CUDART_VERSION >= 11030 @@ -1286,7 +1400,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); // We can now safely close the exported fd - (void) close(fd); + SYSCHECK(close(fd), "close"); return ret; #else return ncclInternalError; @@ -1352,30 +1466,37 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP } static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) { + ncclResult_t ret = ncclSuccess; struct ncclSocket* sock = &peer->sock; struct ncclProxyAsyncOp* asyncOp; NCCLCHECK(ncclCalloc(&asyncOp, 1)); asyncOp->type = type; - NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*))); + NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)), ret, fail); - NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int))); - NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int))); + NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)), ret, fail); + NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)), ret, fail); if (asyncOp->reqSize) { - NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); - NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); + NCCLCHECKGOTO(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize), ret, fail); + NCCLCHECKGOTO(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize), ret, fail); } // Store opId for completion response - NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId))); + NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)), ret, fail); - if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); + if (asyncOp->respSize) NCCLCHECKGOTO(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize), ret, fail); asyncProxyOpEnqueue(peer, asyncOp); (*asyncOpCount)++; NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool)); - return ncclSuccess; +exit: + return ret; +fail: + if (asyncOp->reqBuff) free(asyncOp->reqBuff); + if (asyncOp->respBuff) free(asyncOp->respBuff); + free(asyncOp); + goto exit; } #include @@ -1395,6 +1516,12 @@ static bool proxyMatchOpType(int type) { } } +enum { + PROXY_RUNNING = 0, + PROXY_STOP = 1, + PROXY_ABORT = 2 +}; + void* ncclProxyService(void* _args) { struct ncclProxyState* proxyState = (struct ncclProxyState*) _args; // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); @@ -1405,6 +1532,8 @@ void* ncclProxyService(void* _args) { } // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + // Prepare poll descriptor struct ncclProxyConnectionPool connectionPool; 
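The PROXY_RUNNING/PROXY_STOP/PROXY_ABORT states introduced above replace the old 0/1 stop flag: as the loop condition further down shows, the service keeps iterating while it is running or while any peer is still connected, so a stop or abort request drains existing connections instead of exiting immediately. A rough sketch of that shutdown behavior, with hypothetical helpers standing in for the real socket handling:

    #include <stdbool.h>

    enum { RUNNING = 0, STOPPING = 1, ABORTING = 2 };

    /* Hypothetical stand-ins for the checks the real service loop performs. */
    static bool abortRequested(void)      { return false; }
    static bool stopMessageReceived(void) { return true;  }
    static int  serveAndCountPeers(void)  { static int peers = 3; if (peers > 0) peers--; return peers; }

    /* Drain-on-shutdown sketch: after a stop or abort the loop stops accepting
     * new work but keeps iterating until every connected peer has gone away. */
    static void serviceLoop(void) {
      int stop = RUNNING;
      int npeers = 1;
      while (stop == RUNNING || npeers > 0) {
        if (abortRequested()) stop = ABORTING;
        if (stopMessageReceived()) stop = STOPPING;
        npeers = serveAndCountPeers();
      }
    }

    int main(void) { serviceLoop(); return 0; }

In the abort case the real loop additionally forces connections closed when cuMem host allocations are in use and the proxy is not in direct mode, as the closeConn logic further down shows.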
connectionPool.pools = NULL; @@ -1426,13 +1555,13 @@ void* ncclProxyService(void* _args) { int maxnpeers = 0; int npeers = 0; - int stop = 0; + int stop = PROXY_RUNNING; int asyncOpCount = 0; - while (stop == 0 || (stop == 1 && npeers > 0)) { + while (stop == PROXY_RUNNING || npeers > 0) { /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer * connections. Need to wait until all other related comms call abort and safely exit * together, or we could face segmentation fault. */ - if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1; + if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = PROXY_ABORT; /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { @@ -1474,10 +1603,14 @@ void* ncclProxyService(void* _args) { if (pollfds[s].fd == -1) continue; // Progress all ops for this ncclProxyLocalPeer + if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1; ncclProxyAsyncOp* op = peer->asyncOps; while (op != nullptr) { ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */ type = op->type; + // Coverity gets confused here by complex code structure. Yes, connectionPool.pools gets dereferenced, and + // while calling proxyProgressAsync() connectionPool.pools is NULL, but that changes before it's dereferenced. + // coverity[var_deref_model:FALSE] res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool); if (res == ncclSuccess || res == ncclInProgress) { op = opnext; @@ -1494,14 +1627,15 @@ void* ncclProxyService(void* _args) { int closed; res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/); if (res != ncclSuccess && res != ncclInProgress) { - WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed); + if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED)) + WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed); closeConn = 1; } else if (closed) { INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank); closeConn = 1; } else if (res == ncclSuccess) { // We received something from the sock if (type == ncclProxyMsgStop) { - stop = 1; + stop = PROXY_STOP; closeConn = 1; } else if (type == ncclProxyMsgClose) { closeConn = 1; @@ -1518,12 +1652,13 @@ void* ncclProxyService(void* _args) { closeConn = 1; } if (res != ncclSuccess && res != ncclInProgress) { - WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res); + if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED)) + WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res); closeConn = 1; } if (closeConn) { - ncclSocketClose(sock); + (void)ncclSocketClose(sock); if (op != nullptr) { asyncProxyOpDequeue(peer, op); @@ -1540,10 +1675,10 @@ void* ncclProxyService(void* _args) { WARN("[Proxy Service] proxyDestroy failed"); } for (int s=0; slistenSock); + (void)ncclSocketClose(proxyState->listenSock); free(proxyState->listenSock); proxyOpsFree(proxyState); return NULL; @@ -1553,12 +1688,17 @@ void* ncclProxyService(void* _args) { // Process a request on the UDS socket static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, 
int reqFd) { ncclIpcHdr hdr; - NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL)); + int rmtFd = -1; + + NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd)); if (hdr.type == ncclProxyMsgGetFd) { // cuMem API support uint64_t handle = *(uint64_t*)hdr.data; INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle); return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle); + } else if (hdr.type == ncclProxyMsgQueryFd) { + INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd); + return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd); } return ncclInternalError; @@ -1570,11 +1710,13 @@ void* ncclProxyServiceUDS(void* _args) { struct pollfd pollfds[1]; if (setProxyThreadContext(proxyState)) { - INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev); + INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev); } + INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) { WARN("[Proxy Service UDS] Get listenSock fd fails"); return NULL; @@ -1593,7 +1735,7 @@ void* ncclProxyServiceUDS(void* _args) { } // Check for stop/abort - if (proxyState->stop || *proxyState->abortFlag) break; + if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) || __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE)) break; if (pollfds[0].revents) { // A request was seen on the UDS fd @@ -1638,14 +1780,16 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { proxyState->dmaBufSupport = comm->dmaBufSupport; proxyState->ncclNet = comm->ncclNet; proxyState->ncclCollNet = comm->ncclCollNet; + proxyState->profilerContext = comm->profilerContext; + proxyState->directMode = comm->directMode; memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes)); - pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState); + PTHREADCHECK(pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState), "pthread_create"); ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev); // UDS support INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank); - pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState); + PTHREADCHECK(pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState), "pthread_create"); ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev); } return ncclSuccess; @@ -1658,17 +1802,17 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (comm->proxyState->threadUDS) { // UDS support - comm->proxyState->stop = 1; + __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE); } - if (sharedProxyState->peerAddresses) { + if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { struct ncclSocket sock; int type = ncclProxyMsgStop; ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag); if (ncclSocketConnect(&sock) == ncclSuccess) { - 
ncclSocketSend(&sock, &type, sizeof(int)); + (void)ncclSocketSend(&sock, &type, sizeof(int)); } - ncclSocketClose(&sock); + (void)ncclSocketClose(&sock); } if (sharedProxyState->peerSocks) { @@ -1686,7 +1830,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } } int type = ncclProxyMsgClose; - ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)); + (void)ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)); NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i)); } } @@ -1700,13 +1844,15 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; - assert(sharedProxyState->refCount == 0); - free(sharedProxyState->peerAddresses); - free(sharedProxyState->peerAddressesUDS); - free(sharedProxyState->peerSocks); - free(sharedProxyState->proxyOps); - free(sharedProxyState->sharedDevMems); - expectedProxyResponseFree(sharedProxyState); - free(sharedProxyState); + if (sharedProxyState) { + assert(sharedProxyState->refCount == 0); + free(sharedProxyState->peerAddresses); + free(sharedProxyState->peerAddressesUDS); + free(sharedProxyState->peerSocks); + free(sharedProxyState->proxyOps); + free(sharedProxyState->sharedDevMems); + expectedProxyResponseFree(sharedProxyState); + free(sharedProxyState); + } return ncclSuccess; } diff --git a/src/register.cc b/src/register.cc index 90d429f..c4ca4b4 100644 --- a/src/register.cc +++ b/src/register.cc @@ -26,8 +26,8 @@ ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) { ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) { struct ncclRegCache* cache = &comm->regCache; - int netCount; - NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount)); + int netCount = 0; + if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount)); if (netCount == 0) return ncclSuccess; ncclResult_t ret = ncclSuccess; @@ -105,7 +105,11 @@ ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, s NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) { - if (!ncclParamLocalRegister()) return ncclSuccess; + if (!ncclParamLocalRegister()) { + *handle = NULL; + return ncclSuccess; + } + INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); struct ncclRegCache* cache = &comm->regCache; uintptr_t pageSize = cache->pageSize; uintptr_t addr = (uintptr_t)data & -pageSize; @@ -166,6 +170,10 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { struct ncclReg* reg = (struct ncclReg*)handle; struct ncclRegCache* cache = &comm->regCache; int slot; + int saveDev; + if (handle == NULL) goto exit; + CUDACHECK(cudaGetDevice(&saveDev)); + CUDACHECK(cudaSetDevice(comm->cudaDev)); for (slot=0; slotpopulation && cache->slots[slot] != reg; slot++); if (slot == cache->population) { WARN("Deregister: Could not find handle"); @@ -178,10 +186,19 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { reg->regAddr = (CUdeviceptr)NULL; } if (reg->state & COLLNET_REG_COMPLETE) { - NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle)); + NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle)); + } + if (reg->state & IPC_REG_COMPLETE) { + for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i) + if (reg->ipcInfos[i]) + NCCLCHECK(ncclIpcDeregBuffer(comm, 
reg->ipcInfos[i])); + if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs); + if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs)); } free(reg); memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*)); cache->population -= 1; + CUDACHECK(cudaSetDevice(saveDev)); +exit: return ncclSuccess; } diff --git a/src/transport.cc b/src/transport.cc index 5df4706..eeee7a2 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -28,7 +28,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* struct ncclTransport *transport = ncclTransports[t]; struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; int ret = 0; - NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); + NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); @@ -70,25 +70,52 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128); NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0); #include +ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) { + bool supportFlag = true; + bool directFlag = false; + if (comm->localRanks == 1) { + supportFlag = false; + } else { + for (int i = 0; i < comm->localRanks; ++i) { + for (int j = i + 1; j < comm->localRanks; ++j) { + int ipeer = comm->localRankToRank[i]; + int jpeer = comm->localRankToRank[j]; + struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer]; + struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer]; + int canConnect = 0; + NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, NULL, ipeerInfo, jpeerInfo)); + if (!canConnect && supportFlag == true) { + supportFlag = false; + } + if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true; + if (!supportFlag && directFlag) break; + } + } + } + *intraNodeP2pSupport = supportFlag; + *directMode = directFlag; + return ncclSuccess; +} + ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph ncclResult_t ret = ncclSuccess; int highestType = TRANSPORT_UNDEFINED; // track highest transport type struct ncclConnect** data; // Store intermediate send/recvData structs for connect - struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel - struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel + struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel + struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel int done = 0; - int maxPeers = ncclParamConnectRoundMaxPeers(); - NCCLCHECK(ncclCalloc(&data, maxPeers)); - NCCLCHECK(ncclCalloc(&recvData, maxPeers)); - NCCLCHECK(ncclCalloc(&sendData, maxPeers)); struct timeval timeStart, timeLast; gettimeofday(&timeStart, NULL); timeLast = timeStart; // struct copy bool timeReported = false; + NCCLCHECK(ncclCalloc(&data, maxPeers)); + NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&sendData, 
maxPeers), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { @@ -104,7 +131,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections int p = i-(done+1); - if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS)); + if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail); recvData[p] = data[p]; int sendChannels = 0, recvChannels = 0; int type; @@ -163,7 +190,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { - NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset++, 1, comm->rank, conn), ret, fail); + NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */ @@ -172,6 +199,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* allChannelsConnected = false; } } + sendDataOffset++; } TIME_STOP(3); @@ -181,7 +209,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { - NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset++, 1, comm->rank, conn), ret, fail); + NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */ @@ -190,6 +218,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* allChannelsConnected = false; } } + recvDataOffset++; } TIME_STOP(4); } @@ -198,7 +227,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* data[p] = NULL; } } - if (ncclParamReportConnectProgress() && comm->rank == 0) { + if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) { struct timeval now; gettimeofday(&now, NULL); if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) { @@ -236,34 +265,31 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* int bootstrapTag = (i << 8) + (1 << 7) + (graph ? 
graph->id + 1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; - int flag = 0; if (recvPeer != sendPeer) { - if (comm->connectSend[sendPeer] != 0UL) - NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); - if (comm->connectRecv[recvPeer] != 0UL) - NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); - - if (comm->connectSend[sendPeer] != 0UL) - NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); - if (comm->connectRecv[recvPeer] != 0UL) - NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail); + if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail); + if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail); + if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail); } else { if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) { - NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); - NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail); + NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail); } } comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL; } - free(data); - free(sendData); - free(recvData); - if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: + for(int i=0; isharedRes->deviceStream, &comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); return ret; @@ -275,8 +301,8 @@ extern struct ncclTransport collNetTransport; // All ranks must participate in collNetSetup call // We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails -int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) { - int fail = 1; +bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) { + ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; int nMasters = comm->nNodes; @@ -297,24 +323,23 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN struct ncclTransportComm* transportComm = (type == collNetRecv) ? 
&(collNetTransport.recv) : &(collNetTransport.send); conn->transportComm = transportComm; // setup - struct ncclConnect myConnect; - if (isMaster) { - NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); - } - // prepare connect handles - ncclResult_t res; + struct ncclConnect myConnect = { 0 }; struct { int isMaster; ncclConnect connect; } *allConnects = NULL; ncclConnect *masterConnects = NULL; + if (isMaster) { + NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); + } + // prepare connect handles NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); if (type == collNetRecv) { // recv side: AllGather // all ranks must participate - NCCLCHECK(ncclCalloc(&allConnects, nranks)); + NCCLCHECKGOTO(ncclCalloc(&allConnects, nranks), ret, cleanup); allConnects[rank].isMaster = isMaster; memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); - NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup); + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), ret, cleanup); // consolidate int c = 0; for (int r = 0; r < nranks; r++) { @@ -328,21 +353,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN } // connect if (isMaster) { - NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup); + NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), ret, cleanup); struct ncclDevChannelPeer* devRoot; - CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup); + CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), ret, cleanup); struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? 
devRoot->recv + type : devRoot->send + type; - CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup); + CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), ret, cleanup); } if (isMaster && type == collNetRecv) { memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect)); TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer); } - fail = 0; cleanup: if (allConnects != NULL) free(allConnects); if (masterConnects != NULL) free(masterConnects); - return fail; + return ret != ncclSuccess; } ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index ae1fe0f..7d2f298 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -18,15 +18,15 @@ int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); struct collNetRecvConnectInfo { - int rank; - int nranks; collNetHandle_t collNetHandle; }; +static_assert(sizeof(collNetRecvConnectInfo) <= CONNECT_SIZE, "Collnet Recv Connect info is too large"); struct collNetSendConnectInfo { void* mhandles[NCCL_NUM_PROTOCOLS]; void* reqFifo; }; +static_assert(sizeof(collNetSendConnectInfo) <= CONNECT_SIZE, "Collnet Send Connect info is too large"); #define COLLNET_GROUP_NSUBS 8 #define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) @@ -135,7 +135,7 @@ struct recvResources { int collNetRank; }; -static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p *ret = 0; return ncclSuccess; @@ -154,15 +154,14 @@ struct setupReq { static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; - int proxyRank, tpProxyRank; + int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; - tpProxyRank = comm->topParentRanks[myInfo->rank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); @@ -175,7 +174,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; - int proxyRank, tpProxyRank; + int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); @@ -184,8 +183,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; - tpProxyRank = comm->topParentRanks[myInfo->rank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); + static_assert(sizeof(collNetRecvConnectInfo) <= sizeof(struct ncclConnect), "Collnet Recv Connect info is too big"); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); req.collNet = comm->collNetSharedRes; @@ -442,6 +441,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; + static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big"); struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); struct sendResources* resources = (struct sendResources*)(connection->transportResources); @@ -1039,7 +1039,7 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; - regRecord->proxyconn = proxyconn; + regRecord->collnetProxyconn = proxyconn; *outHandle = regRecord->collnetHandle = handle; *outRegBufFlag = 1; } @@ -1091,7 +1091,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u record->size = buffSize; *outHandle = record->mhandle = handle; *outRegBufFlag = 1; - ncclIntruQueueEnqueue(cleanupQueue, &record->base); + ncclIntruQueueEnqueue(cleanupQueue, (struct 
ncclCommCallback*)record); *nCleanupQueueElts += 1; exit: @@ -1214,23 +1214,6 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail); - // Exchange highest intra-node transport type among ranks - // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer - if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) { - int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED }; - - comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); - for (int i = 0; i < comm->localRanks; i++) { - if (highestTypes[i] > comm->intraHighestTransportType) - comm->intraHighestTransportType = highestTypes[i]; - } - if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType) - comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType; - } else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) { - // reuse previous shared intraHighestTransportType - comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType; - } INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank); exit: diff --git a/src/transport/generic.cc b/src/transport/generic.cc index a0efaab..7fd7e59 100644 --- a/src/transport/generic.cc +++ b/src/transport/generic.cc @@ -34,3 +34,26 @@ exit: fail: goto exit; } + +ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nRanks > 1) { + for (int mask=1; masknRanks; mask<<=1) { + int prevPeer = (comm->rank + mask) % comm->nRanks; + int nextPeer = (comm->rank + comm->nRanks - mask) % comm->nRanks; + for (int c = 0; c < comm->nChannels; c++) { + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &prevPeer, 1, &nextPeer, 0), ret, fail); // ReduceScatter + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); + for (int c = 0; c < comm->nChannels; c++) { + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &nextPeer, 1, &prevPeer, 0), ret, fail); // AllGather + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); + } + INFO(NCCL_INIT, "Connected binomial trees"); + } +exit: + return ret; +fail: + goto exit; +} diff --git a/src/transport/net.cc b/src/transport/net.cc index d5a585d..00eca60 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -10,10 +10,11 @@ #include "proxy.h" #include "collectives.h" #include "gdrwrap.h" -#include "shm.h" +#include "shmutils.h" #include "p2p.h" #include "profiler.h" #include "transport.h" +#include "shm.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -62,9 +63,8 @@ struct connectMapMem{ char* cpuPtr; int size; ncclIpcDesc ipcDesc; - char shmPath[PATH_MAX]; - ncclShmHandle_t attachHandle; - ncclShmHandle_t createHandle; + ncclShmIpcDesc_t attachDesc; + ncclShmIpcDesc_t createDesc; }; struct connectMap { @@ -142,11 +142,11 @@ struct recvNetResources { }; /* Determine if two peers can communicate with NET */ -static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct 
ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; if (info1->hostHash == info2->hostHash) { // If on the same host, check intra-node net is not disabled. - NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret)); + NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, ret)); } return ncclSuccess; } @@ -173,9 +173,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; - int tpProxyRank; - send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; + send->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; req.channelId = channelId; req.connIndex = connIndex; @@ -185,8 +184,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; - tpProxyRank = comm->topParentRanks[proxyRank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; @@ -199,7 +197,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } - *((int*)connectInfo) = tpProxyRank; + *((int*)connectInfo) = comm->topParentRanks[proxyRank]; return ncclSuccess; } @@ -212,12 +210,12 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; - recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; + recv->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? 
ncclParamNetSharedBuffers() : 1; req.channelId = channelId; req.connIndex = connIndex; // Use myInfo->rank as the receiver uses its own NIC - int proxyRank, tpProxyRank; + int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); @@ -226,8 +224,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); // We don't support PXN on receive yet - tpProxyRank = comm->topParentRanks[myInfo->rank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; @@ -238,26 +235,24 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph return ncclSuccess; } -static ncclResult_t netMapShm(struct connectMapMem* mem) { - mem->cpuPtr = NULL; - mem->gpuPtr = NULL; - NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle)); +static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) { + NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); return ncclSuccess; } -static ncclResult_t netCreateShm(struct connectMapMem* mem) { - mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file - NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle)); + +static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) { + NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); return ncclSuccess; } static ncclResult_t netDumpMap(struct connectMap* map) { printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared); struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; - printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); + printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_DEVMEM; printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; - printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); + printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", @@ -328,10 +323,10 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { - if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); + if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; - NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, + NCCLCHECK(ncclP2pImportShareableBuffer(comm, 
send->proxyConn.rank, map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); @@ -341,7 +336,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank; if (*sharedDevMemPtr == NULL) { map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL; - NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, + NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size, &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc, sharedDevMemPtr)); @@ -463,24 +458,19 @@ static ncclResult_t sendFree(struct ncclConnector* send) { if (map) { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - if (map->sameProcess && map->cudaDev == cudaDev) { - // Our own GPU, so it wasn't mapped in - free(map); - return ncclSuccess; - } - if (!map->sameProcess || ncclCuMemEnable()) { - if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle)); - if (map->mems[NCCL_NET_MAP_DEVMEM].size) { - if (ncclCuMemEnable()) { - // cuMem API support - NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); - NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); - } else { - // Legacy CUDA IPC support - CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); - } + if (map->cudaDev != cudaDev && map->mems[NCCL_NET_MAP_DEVMEM].size) { + if (ncclCuMemEnable()) { + // cuMem API support + NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); + NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } else { + // Legacy CUDA IPC support + CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } } + if (!map->sameProcess) { + NCCLCHECK(ncclShmIpcClose(&map->mems[NCCL_NET_MAP_HOSTMEM].attachDesc)); + } free(map); } @@ -518,7 +508,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int if (cuda && state->cudaBuff == NULL) { if (sameProcess == 0 || ncclCuMemEnable()) { - NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff)); + NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, 0, &state->ipcDesc, (void**)&state->cudaBuff)); } else { NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size)); } @@ -527,7 +517,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); } if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; - if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL; + if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL; if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc)); return ncclSuccess; } @@ -543,7 +533,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) { if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank]; - if (peer == NULL) NCCLCHECK(ncclInternalError;) + if (peer == NULL) NCCLCHECK(ncclInternalError); struct ncclProxySharedP2p* state = type == 0 ? 
&peer->send : &peer->recv; if (state->size == 0) NCCLCHECK(ncclInternalError); if (ncclAtomicRefCountDecrement(&state->refcount) == 0) { @@ -746,7 +736,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (resources->shared == 0) { if (!map->sameProcess || ncclCuMemEnable()) { ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); - NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, + NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); @@ -758,7 +748,11 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; } else { - NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM)); + NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM)); + void* sendMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + void* recvMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + memset(sendMem, 0, sizeof(struct ncclSendMem)); + memset(recvMem, 0, sizeof(struct ncclRecvMem)); } if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; @@ -896,7 +890,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { if (ncclCuMemEnable()) { - NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, + NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); @@ -968,7 +962,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct if (resources->map.sameProcess) { NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); } else { - NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle)); + NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc)); } NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (!resources->map.sameProcess || ncclCuMemEnable()) { @@ -1050,7 +1044,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->transmitted = sub->done = 0; - for (uint64_t step=0; stepnsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); + ncclProfilerStartSendProxyOpEvent(s, args); if (sub->reg && sub->nbytes > 0) { NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); } else { @@ -1072,6 +1066,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Post buffers to the GPU if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) { + ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps); int buffSlot = 
(sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { if (!sub->reg) { @@ -1087,9 +1082,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } else sub->posted += args->sliceSteps; - for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) { - ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait); - } + ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted); + ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait); args->idle = 0; continue; } @@ -1130,12 +1124,18 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset; } if (ready) { + ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); // Data is ready, try to send. + // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, + // since size is a plain integer. + // coverity[use_invalid:FALSE] NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId); sub->transmitted += args->sliceSteps; - for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait); + ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); + ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait); + sub->transSize += size; args->idle = 0; continue; } @@ -1165,7 +1165,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct __sync_synchronize(); TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; - for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); + ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done); + ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? 
resources->gdcSync : &resources->sendMem->head; @@ -1188,6 +1189,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } } if (args->done == args->nsubs) { + for (int s=0; snsubs; s++) { + ncclProfilerStopProxyOpEvent(s, args); + } args->state = ncclProxyOpNone; } } @@ -1229,7 +1233,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct resources->step = sub->base + sub->nsteps; sub->posted = sub->received = sub->transmitted = sub->done = 0; for (int i=0; insteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); + ncclProfilerStartRecvProxyOpEvent(s, args); if (sub->reg && sub->nbytes > 0) { // Register buffer NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); @@ -1254,6 +1258,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* sub = subGroup + i; if (sub->posted < sub->nsteps) { if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } + ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (sub->reg) maxDepth = 1; int stepSize = resources->buffSizes[p] / NCCL_STEPS; @@ -1294,7 +1299,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; sub->posted += args->sliceSteps; - for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); + ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); + ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait); } args->idle = 0; } @@ -1337,7 +1343,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } } sub->received += args->sliceSteps; - for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); + sub->transSize += sizes[i]; + ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived); + ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait); if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (resources->useGdr) needFlush |= resources->needFlush; @@ -1393,7 +1401,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* sub = subGroup + i; sub->transmitted += args->sliceSteps; - for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); + ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted); + ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); @@ -1431,7 +1440,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct 
subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } sub->done += args->sliceSteps; - for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd); + ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done); + ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone); args->idle = 0; if (sub->done == sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); @@ -1447,6 +1457,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; + for (int s=0; snsubs; s++) { + ncclProfilerStopProxyOpEvent(s, args); + } } } return ncclSuccess; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index be8a8a3..d828c98 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -49,6 +49,11 @@ struct alignas(64) ncclIbMergedDev { int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs int speed; char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+' + int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no +}; + +struct ncclIbStats { + int fatalErrorCount; }; static int ncclNIbDevs = -1; @@ -69,6 +74,7 @@ struct alignas(64) ncclIbDev { struct ncclIbMrCache mrCache; int ar; // ADAPTIVE_ROUTING struct ibv_port_attr portAttr; + struct ncclIbStats stats; }; #define MAX_IB_DEVS 32 @@ -80,7 +86,7 @@ static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); -NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); +NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); @@ -90,6 +96,32 @@ NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0); +NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1); +NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1); + +static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) { + __atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED); + return ncclSuccess; +} +static void ncclIbStatsFatalError(struct ncclIbStats* stat){ + __atomic_fetch_add(&stat->fatalErrorCount, 1, __ATOMIC_RELAXED); +} +static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) { + if (ncclParamIbAsyncEvents() && __atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED)) { + WARN("communicator encountered a fatal error (detected in %s)\n", funcName); + return ncclSystemError; + } + return ncclSuccess; +} +static void ncclIbQpFatalError(struct ibv_qp* qp) { + ncclIbStatsFatalError((struct ncclIbStats*)qp->qp_context); +} +static void ncclIbCqFatalError(struct ibv_cq* cq) { + ncclIbStatsFatalError((struct ncclIbStats*)cq->cq_context); +} +static void ncclIbDevFatalError(struct ncclIbDev* dev) { + ncclIbStatsFatalError(&dev->stats); +} pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { @@ -98,9 +130,53 @@ static void* ncclIbAsyncThreadMain(void* args) { struct ibv_async_event event; if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; } char *str; + 
struct ibv_cq* cq = event.element.cq; // only valid if CQ error + struct ibv_qp* qp = event.element.qp; // only valid if QP error + struct ibv_srq* srq = event.element.srq; // only valid if SRQ error if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; } - if (event.event_type != IBV_EVENT_COMM_EST) - WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str); + switch (event.event_type) { + case IBV_EVENT_DEVICE_FATAL: + // the above is device fatal error + WARN("NET/IB : %s:%d async fatal event: %s", dev->devName, dev->portNum, str); + ncclIbDevFatalError(dev); + break; + case IBV_EVENT_CQ_ERR: + // the above is a CQ fatal error + WARN("NET/IB : %s:%d async fatal event on CQ (%p): %s", dev->devName, dev->portNum, cq, str); + ncclIbCqFatalError(cq); + break; + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + // the above are QP fatal errors + WARN("NET/IB : %s:%d async fatal event on QP (%p): %s", dev->devName, dev->portNum, qp, str); + ncclIbQpFatalError(qp); + break; + case IBV_EVENT_SRQ_ERR: + // SRQ are not used in NCCL + WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str); + break; + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_QP_LAST_WQE_REACHED: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_SRQ_LIMIT_REACHED: + // the above are non-fatal + WARN("NET/IB : %s:%d Got async error event: %s", dev->devName, dev->portNum, str); + break; + case IBV_EVENT_COMM_EST: + break; + default: + WARN("NET/IB : %s:%d unknown event type (%d)", dev->devName, dev->portNum, event.event_type); + break; + } + // acknowledgment needs to happen last to avoid user-after-free if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; } } return NULL; @@ -140,11 +216,11 @@ static void* envIbAddrRange(sa_family_t af, int* mask) { char addrString[128] = { 0 }; snprintf(addrString, 128, "%s", env); char *addrStrPtr = addrString; - char *maskStrPtr = strstr(addrString, "/") + 1; + char *maskStrPtr = strstr(addrString, "/"); if (NULL == maskStrPtr) { return NULL; } - *(maskStrPtr - 1) = '\0'; + *(maskStrPtr++) = '\0'; if (inet_pton(af, addrStrPtr, ret) == 0) { WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? 
"AF_INET" : "AF_INET6"); @@ -242,12 +318,14 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int fd = open(roceTypePath, O_RDONLY); if (fd == -1) { + WARN("NET/IB: open failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } int ret = read(fd, gidRoceVerStr, 15); close(fd); if (ret == -1) { + WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } @@ -420,7 +498,7 @@ int ncclIbFindMatchingDev(int dev) { } ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { - ncclResult_t ret; + ncclResult_t ret = ncclSuccess; if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } @@ -496,11 +574,12 @@ build_ib_list: ncclIbDevs[ncclNIbDevs].pdRefs = 0; ncclIbDevs[ncclNIbDevs].pd = NULL; strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort)); + NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; ncclIbDevs[ncclNIbDevs].mrCache.population = 0; ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); // Enable ADAPTIVE_ROUTING by default on IB networks // But allow it to be overloaded by an env parameter @@ -510,9 +589,9 @@ build_ib_list: TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); - pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs); + PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); - pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d + PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d int mergedDev = ncclNMergedIbDevs; if (mergeNics) { @@ -592,10 +671,11 @@ build_ib_list: } pthread_mutex_unlock(&ncclIbLock); } - return ncclSuccess; +exit: + return ret; fail: pthread_mutex_unlock(&ncclIbLock); - return ret; + goto exit; } ncclResult_t ncclIbDevices(int* ndev) { @@ -607,46 +687,63 @@ ncclResult_t ncclIbDevices(int* ndev) { // Returns : // ncclSuccess : GDR works // ncclSystemError : no module or module loaded but not supported by GPU +#define KNL_MODULE_LOADED(a) ((access(a, F_OK) == -1) ? 
0 : 1)
+static int ncclIbGdrModuleLoaded = 0; // 1 = true, 0 = false
+static void ibGdrSupportInitOnce() {
+ // Check for the nv_peer_mem module being loaded
+ ncclIbGdrModuleLoaded = KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem/version") ||
+ KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem_nc/version") ||
+ KNL_MODULE_LOADED("/sys/module/nvidia_peermem/version");
+}
 ncclResult_t ncclIbGdrSupport() {
- static int moduleLoaded = -1;
- if (moduleLoaded == -1) {
- // Check for the nv_peer_mem module being loaded
- moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
- // Also support the new nv_mem_nc module
- (access("/sys/kernel/mm/memory_peers/nv_mem_nc/version", F_OK) == -1)) ? 0 : 1;
- }
- if (moduleLoaded == 0) return ncclSystemError;
+ static pthread_once_t once = PTHREAD_ONCE_INIT;
+ pthread_once(&once, ibGdrSupportInitOnce);
+ if (!ncclIbGdrModuleLoaded)
+ return ncclSystemError;
 return ncclSuccess;
 }
+static __thread int ibDmaSupportInitDev; // which device to init, must be thread local
+static void ibDmaBufSupportInitOnce(){
+ ncclResult_t res;
+ // select the appropriate merged device
+ struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
+ // Test each real device
+ int dev_fail = 0;
+ for (int i = 0; i < mergedDev->ndevs; i++) {
+ int ibDev = mergedDev->devs[i];
+ struct ibv_pd* pd;
+ struct ibv_context* ctx = ncclIbDevs[ibDev].context;
+ NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
+ // Test kernel DMA-BUF support with a dummy call (fd=-1)
+ (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
+ // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
+ dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
+ NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
+ // stop the search and goto failure
+ if (dev_fail) goto failure;
+ }
+ mergedDev->dmaBufSupported = 1;
+ return;
+failure:
+ mergedDev->dmaBufSupported = -1;
+ return;
+}
 // Detect whether DMA-BUF support is present in the kernel
 // Returns :
 // ncclSuccess : DMA-BUF support is available
 // ncclSystemError : DMA-BUF is not supported by the kernel
 ncclResult_t ncclIbDmaBufSupport(int dev) {
- static int dmaBufSupported = -1;
- if (dmaBufSupported == -1) {
- ncclResult_t res;
- struct ibv_pd* pd;
- struct ibv_context* ctx;
- struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
+ struct oncewrap {
+ pthread_once_t once = PTHREAD_ONCE_INIT;
+ };
+ static oncewrap onces[MAX_IB_DEVS];
+ // init the device only once
+ ibDmaSupportInitDev = dev;
+ pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce);
- // Test each dev
- for (int i = 0; i < mergedDev->ndevs; i++) {
- int ibDev = mergedDev->devs[i];
- ctx = ncclIbDevs[ibDev].context;
- NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
- // Test kernel DMA-BUF support with a dummy call (fd=-1)
- (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
- // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
- dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ?
1 : 0; - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); - } - } - if (dmaBufSupported == 0) return ncclSystemError; - return ncclSuccess; -failure: - dmaBufSupported = 0; + int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported; + if (dmaBufSupported == 1) return ncclSuccess; return ncclSystemError; } @@ -842,16 +939,19 @@ struct alignas(32) ncclIbNetCommBase { // Track necessary remDevInfo here int nRemDevs; struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC]; + // statistics about the comm + struct ncclIbStats stats; }; struct ncclIbSendComm { struct ncclIbNetCommBase base; + // Start with fifo and ibv structs as they have alignment restrictions struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; + struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1]; // Each dev correlates to a mergedIbDev struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; - struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; - struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; struct ncclIbRemSizesFifo remSizesFifo; uint64_t fifoHead; int ar; // Use adaptive routing when all merged devices have it enabled @@ -903,8 +1003,7 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI req->events[devIndex]++; req->devBases[devIndex] = base; } - -ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) { +ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) { base->ibDevN = ibDevN; ncclIbDev* ibDev = ncclIbDevs + ibDevN; pthread_mutex_lock(&ibDev->lock); @@ -921,7 +1020,7 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base pthread_mutex_unlock(&ibDev->lock); // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). 
- NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); + NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0)); return ncclSuccess; } @@ -940,9 +1039,10 @@ returning: return res; } -ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) { +ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) { struct ibv_qp_init_attr qpInitAttr; memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr)); + qpInitAttr.qp_context = qp_context; qpInitAttr.send_cq = base->cq; qpInitAttr.recv_cq = base->cq; qpInitAttr.qp_type = IBV_QPT_RC; @@ -1026,6 +1126,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { } ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { + ncclResult_t ret = ncclSuccess; struct ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; @@ -1033,14 +1134,20 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; handle->magic = NCCL_SOCKET_MAGIC; - NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); - NCCLCHECK(ncclSocketListen(&comm->sock)); - NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr)); + NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail); + NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail); + NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail); *listenComm = comm; - return ncclSuccess; +exit: + return ret; +fail: + (void)ncclSocketClose(&comm->sock); + free(comm); + goto exit; } ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { + ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; @@ -1055,16 +1162,18 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet WARN("Error: trying to connect already connected sendComm"); return ncclInternalError; } + stage->buffer = NULL; NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); - NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); + NCCLCHECKGOTO(ncclIbStatsInit(&comm->base.stats), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail); stage->comm = comm; stage->state = ncclIbCommStateConnect; - NCCLCHECK(ncclSocketConnect(&comm->base.sock)); + NCCLCHECKGOTO(ncclSocketConnect(&comm->base.sock), ret, fail); ib_connect_check: /* since ncclSocketConnect is async, we must check if connection is complete */ - NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready)); + NCCLCHECKGOTO(ncclSocketReady(&comm->base.sock, &ready), ret, fail); if (!ready) return ncclSuccess; // IB Setup @@ -1078,7 +1187,7 @@ ib_connect_check: comm->ar = 1; // Set to 1 for logic for (int i = 0; i < mergedDev->ndevs; i++) { int ibDevN = mergedDev->devs[i]; - NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base)); + NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, 
&comm->devs[i].base, &comm->base.stats), ret, fail); comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled } @@ -1091,13 +1200,17 @@ ib_connect_check: for (int q = 0; q < comm->base.nqps; q++) { ncclIbSendCommDev* commDev = comm->devs + devIndex; ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; - NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q)); + NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail); comm->base.qps[q].devIndex = devIndex; meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num; meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex; - // Query ece capabilities (enhanced connection establishment) - NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); + if (ncclParamIbEceEnable()) { + // Query ece capabilities (enhanced connection establishment) + NCCLCHECKGOTO(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); + } else { + meta.qpInfo[q].ece_supported = 0; + } devIndex = (devIndex + 1) % comm->base.ndevs; } @@ -1112,13 +1225,13 @@ ib_connect_check: devInfo->lid = ibDev->portAttr.lid; // Prepare my fifo - NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); devInfo->fifoRkey = commDev->fifoMr->rkey; // Pack local GID info devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; - NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex)); - NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid)); + NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex), ret, fail); + NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid), ret, fail); devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix; devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id; @@ -1148,12 +1261,12 @@ ib_connect_check: stage->state = ncclIbCommStateSend; stage->offset = 0; - NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta))); + NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail); memcpy(stage->buffer, &meta, sizeof(meta)); ib_send: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset)); + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset), ret, fail); if (stage->offset != sizeof(meta)) return ncclSuccess; stage->state = ncclIbCommStateConnecting; @@ -1163,7 +1276,7 @@ ib_send: ib_connect: struct ncclIbConnectionMetadata remMeta; - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset)); + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, 
&comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset), ret, fail); if (stage->offset != sizeof(remMeta)) return ncclSuccess; memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata)); @@ -1197,7 +1310,7 @@ ib_connect: } for (int i=0; i < comm->base.ndevs; i++) { - NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); + NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); } comm->base.nRemDevs = remMeta.ndevs; @@ -1212,10 +1325,10 @@ ib_connect: struct ibv_qp* qp = comm->base.qps[q].qp; if (remQpInfo->ece_supported) - NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported)); + NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail); - NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false)); - NCCLCHECK(ncclIbRtsQp(qp)); + NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE @@ -1233,19 +1346,23 @@ ib_connect: stage->offset = 0; ib_send_ready: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset)); + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset), ret, fail); if (stage->offset != sizeof(int)) return ncclSuccess; - free(stage->buffer); - stage->state = ncclIbCommStateStart; - *sendComm = comm; - return ncclSuccess; +exit: + if (stage->buffer) free(stage->buffer); + stage->state = ncclIbCommStateStart; + return ret; +fail: + free(comm); + goto exit; } NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { + ncclResult_t ret = ncclSuccess; struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; @@ -1262,22 +1379,23 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle } NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); + NCCLCHECKGOTO(ncclIbStatsInit(&rComm->base.stats), ret, fail); stage->comm = rComm; stage->state = ncclIbCommStateAccept; - NCCLCHECK(ncclSocketInit(&rComm->base.sock)); - NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock)); + NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail); + NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail); ib_accept_check: - NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready)); + NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail); if (!ready) return ncclSuccess; struct ncclIbConnectionMetadata remMeta; stage->state = ncclIbCommStateRecv; stage->offset = 0; - NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta))); + NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail); ib_recv: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset)); + 
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail); if (stage->offset != sizeof(remMeta)) return ncclSuccess; /* copy back the received info */ @@ -1308,10 +1426,10 @@ ib_recv: for (int i = 0; i < rComm->base.ndevs; i++) { rCommDev = rComm->devs + i; ibDevN = mergedDev->devs[i]; - NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base)); + NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail); ibDev = ncclIbDevs + ibDevN; - NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex)); - NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid)); + NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail); + NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail); } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. @@ -1336,23 +1454,26 @@ ib_recv: // Local ibDevN ibDevN = rComm->devs[devIndex].base.ibDevN; ibDev = ncclIbDevs + ibDevN; - NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp)); + NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail); qp->devIndex = devIndex; devIndex = (devIndex + 1) % rComm->base.ndevs; // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { - NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); + // Coverity suspects a copy-paste error below due to the use of remMeta in one argument and meta in another. + // However, this has been confirmed to be intentional. + // coverity[copy_paste_error] + NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) - NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); + NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); } bool override_tc = (q == 0) ? 
true : false; - NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc)); - NCCLCHECK(ncclIbRtsQp(qp->qp)); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail); + NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) @@ -1366,17 +1487,17 @@ ib_recv: // Retain remote fifo info and prepare my RDMA ops rCommDev->fifoRkey = remMeta.devs[i].fifoRkey; rComm->remFifo.addr = remMeta.fifoAddr; - NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); + NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey; if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; // Allocate Flush dummy buffer for GPU Direct RDMA if (rComm->flushEnabled) { - NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); + NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE), ret, fail); rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem; rCommDev->gpuFlush.sge.length = 1; rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey; - NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp)); + NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail); struct ncclIbDevInfo devInfo; devInfo.lid = ibDev->portAttr.lid; devInfo.link_layer = ibDev->portAttr.link_layer; @@ -1384,8 +1505,8 @@ ib_recv: devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false)); - NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp)); + NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail); } // Fill Handle @@ -1400,7 +1521,7 @@ ib_recv: meta.devs[i].mtu = remMeta.devs[i].mtu; // Prepare sizes fifo - NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; @@ -1415,30 +1536,36 @@ ib_recv: stage->state = ncclIbCommStateSend; stage->offset = 0; - if 
(stage->buffer) free(stage->buffer); - NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata))); + if (stage->buffer) { + free(stage->buffer); + stage->buffer = NULL; + } + NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)), ret, fail); memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata)); ib_send: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset)); + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset), ret, fail); if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess; stage->offset = 0; stage->state = ncclIbCommStatePendingReady; ib_recv_ready: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset)); + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset), ret, fail); if (stage->offset != sizeof(int)) return ncclSuccess; - free(stage->buffer); *recvComm = rComm; - +exit: /* reset lComm stage */ + if (stage->buffer) free(stage->buffer); stage->state = ncclIbCommStateStart; stage->offset = 0; stage->comm = NULL; stage->buffer = NULL; - return ncclSuccess; + return ret; +fail: + free(rComm); + goto exit; } ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) { @@ -1531,16 +1658,21 @@ struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, in /* DMA-BUF support */ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { + ncclResult_t ret = ncclSuccess; assert(size > 0); struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); for (int i = 0; i < base->ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); - NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i)); + NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail); } *mhandle = (void*) mhandleWrapper; - return ncclSuccess; +exit: + return ret; +fail: + free(mhandleWrapper); + goto exit; } ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { @@ -1694,6 +1826,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } + NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__)); struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; @@ -1858,6 +1991,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; + NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__)); 
struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); @@ -1937,10 +2071,13 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** return ncclSuccess; } +#define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name) + ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; while (1) { + NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__)); if (r->events[0] == 0 && r->events[1] == 0) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; @@ -1996,7 +2133,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d", ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); #endif - if (req->type == NCCL_NET_IB_REQ_SEND) { + if (req && req->type == NCCL_NET_IB_REQ_SEND) { for (int j = 0; j < req->nreqs; j++) { struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff); if ((sendReq->events[i] <= 0)) { @@ -2018,6 +2155,9 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { req->events[i]--; } } + // Once the IB fatal event is reported in the async thread, we want to propagate this error + // to communicator and prevent further polling to reduce error pollution. + NCCLCHECK(ncclIbStatsCheckFatalCount(&ncclIbDevs[r->devBases[i]->ibDevN].stats,__func__)); } } diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index e9e0357..73a5d55 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -73,22 +73,27 @@ ncclResult_t ncclNetSocketDevices(int* ndev) { } static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) { + ncclResult_t ret = ncclSuccess; *speed = 0; char speedPath[PATH_MAX]; sprintf(speedPath, "/sys/class/net/%s/speed", devName); - int fd = open(speedPath, O_RDONLY); + int fd = -1; + SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd); if (fd != -1) { char speedStr[] = " "; - if (read(fd, speedStr, sizeof(speedStr)-1) > 0) { + int n; + // Allow this to silently fail + n = read(fd, speedStr, sizeof(speedStr)-1); + if (n > 0) { *speed = strtol(speedStr, NULL, 0); } - close(fd); } if (*speed <= 0) { INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath); *speed = 10000; } - return ncclSuccess; + if (fd != -1) SYSCHECK(close(fd), "close"); + return ret; } ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { @@ -235,19 +240,24 @@ void* persistentSocketThread(void *args_) { } ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { + ncclResult_t ret = ncclSuccess; int nSocksPerThread = ncclParamSocketNsocksPerThread(); int nThreads = ncclParamSocketNthreads(); if (nThreads > MAX_THREADS) { WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS); nThreads = MAX_THREADS; } + int fd = -1; + int nSocks; if (nThreads == -2 || nSocksPerThread == -2) { // Auto-detection int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads char vendorPath[PATH_MAX]; snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName); + // Coverity is wrong. NULL second argument to realpath() is OK by POSIX.1-2008. 
+ // coverity[alias_transfer:FALSE] char* rPath = realpath(vendorPath, NULL); - int fd = open(rPath, O_RDONLY); + fd = open(rPath, O_RDONLY); free(rPath); if (fd == -1) { // Could not find device vendor. This is handled silently so @@ -257,9 +267,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { } char vendor[7]; strncpy(vendor, "0x0000", 7); - int len; - SYSCHECKVAL(read(fd, vendor, 6), "read", len); - SYSCHECK(close(fd), "close"); + SYSCHECKGOTO(read(fd, vendor, 6), "read", ret, fail); if (strcmp(vendor, "0x1d0f") == 0) { // AWS autoNt = 2; autoNs = 8; @@ -271,7 +279,7 @@ end: if (nThreads == -2) nThreads = autoNt; if (nSocksPerThread == -2) nSocksPerThread = autoNs; } - int nSocks = nSocksPerThread * nThreads; + nSocks = nSocksPerThread * nThreads; if (nSocks > MAX_SOCKETS) { nSocksPerThread = MAX_SOCKETS/nThreads; WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread); @@ -280,28 +288,38 @@ end: *ns = nSocks; *nt = nThreads; if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); - return ncclSuccess; +exit: + if (fd != -1) close(fd); + return ret; +fail: + goto exit; } ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } + ncclResult_t ret = ncclSuccess; struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle; memset(handle, 0, sizeof(struct ncclNetSocketHandle)); static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large"); struct ncclNetSocketListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); handle->magic = NCCL_SOCKET_MAGIC; - NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1)); - NCCLCHECK(ncclSocketListen(&comm->sock)); - NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr)); - NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); + NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1), ret, fail); + NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail); + NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail); + NCCLCHECKGOTO(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads), ret, fail); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; comm->dev = dev; *listenComm = comm; - return ncclSuccess; +exit: + return ret; +fail: + (void)ncclSocketClose(&comm->sock); + free(comm); + goto exit; } ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { @@ -437,7 +455,7 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* res->comm = comm; pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); - pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); + PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create"); ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 
'S' : 'R', comm->dev, tid, comm->cudaDev); } struct ncclNetSocketTask* r = queue->tasks+queue->next; @@ -482,7 +500,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; union ncclSocketAddress addr; - ncclSocketGetAddr(r->ctrlSock, &addr); + NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr)); WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", ncclSocketToString(&addr, line), data, r->size); @@ -579,7 +597,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { res->stop = 1; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); - pthread_join(comm->helperThread[i], NULL); + PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join"); } free(res->threadTaskQueue.tasks); } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 61d5946..aa9c486 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -26,7 +26,7 @@ struct localRegData { intptr_t offset; }; -ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +ncclResult_t nvlsCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p *ret = 0; return ncclSuccess; @@ -71,28 +71,31 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; - + int fd = -1; + ncclResult_t ret = ncclSuccess; INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank); // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // cuMem UDS support - int fd = -1; TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank); - int tpProxyRank = comm->topParentRanks[rank]; TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank); - NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd)); + NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, rank, shareableHandle, &fd), ret, fail); TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank); - CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type)); - (void) close(fd); + CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type), ret, fail); + SYSCHECK(close(fd), "close"); } else { if (type == CU_MEM_HANDLE_TYPE_FABRIC) { - CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type)); + CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type), ret, fail); } else { memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle)); } } - return ncclSuccess; +exit: + return ret; +fail: + if (fd != -1) close(fd); + goto exit; } ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) { @@ -100,7 +103,7 @@ ncclResult_t 
nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev); // Unbind physical memory from group for the given device - CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size)); + if (size) CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size)); return ncclSuccess; } @@ -117,14 +120,18 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); // Release the UC memory and mapping - CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); - CUCHECK(cuMemRelease(*ucHandle)); + if (ucptr) { + CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemRelease(*ucHandle)); + } // Release the MC memory and mapping - CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); - CUCHECK(cuMemRelease(*mcHandle)); + if (mcptr) { + CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemRelease(*mcHandle)); + } return ncclSuccess; } @@ -191,7 +198,9 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit size_t size = *sizePtr; size_t originSize = size; size_t ucgran, mcgran; + int allocMcHandle = 0; + *ucptr = *mcptr = NULL; memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; @@ -203,10 +212,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); + allocMcHandle = 1; NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); + allocMcHandle = 1; } CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail); @@ -226,6 +237,8 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); + // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); @@ -239,6 +252,7 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit exit: return ret; fail: + if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle)); goto exit; } @@ -350,10 +364,10 @@ setup: struct ncclNvlsSharedRes* resources = NULL; int nHeads = comm->channels[0].nvls.nHeads; int nChannels = comm->nChannels; - size_t 
memSize = 16;
+ size_t memSize = 64;
 size_t creditSize = nChannels * 2 * memSize * nHeads;
 int nvlsStepSize = comm->nvlsChunkSize;
-
+
 NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
 comm->nvlsResources->inited = false;
 comm->nvlsResources->refCount = 1;
@@ -466,7 +480,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
 if (!comm->MNNVL && resources->nvlsShmemHandle)
 NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));
- if (resources->ucCredit && resources->mcCredit) {
+ if (resources->ucCredit || resources->mcCredit) {
 NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
 NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
 }
@@ -490,7 +504,6 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
 char shareableHandle[NVLS_HANDLE_SIZE];
 CUmemGenericAllocationHandle mcHandle;
 size_t minSize = SIZE_MAX;
- bool localRegBufUsed = false;
 struct localRegData* regData = NULL;
 cudaPointerAttributes attr;
 size_t ucgran, mcgran;
@@ -500,7 +513,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
 if (userBuff) {
 NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
 if (regRecord) {
- CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
+ CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail);
 if (attr.type == cudaMemoryTypeDevice) {
 size_t regSize = regRecord->pages * comm->regCache.pageSize;
 memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
@@ -508,7 +521,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
 mcprop.handleTypes = ncclCuMemHandleType;
 mcprop.flags = 0;
 mcprop.size = regSize;
- CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+ CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
 memset(&ucprop, 0, sizeof(CUmemAllocationProp));
 ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -517,7 +530,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
 ucprop.requestedHandleTypes = ncclCuMemHandleType;
 CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
- CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
+ CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);
 if (regSize % mcgran == 0) {
 regRecord->regSize = regSize;
 } else {
@@ -560,6 +573,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
 }
 CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
+ // Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked
+ // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
+ // coverity[var_deref_op] CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); // Create a VA for the NVLS @@ -584,15 +600,13 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } } - localRegBufUsed = true; - + *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset; + *regUsed = true; exit: - if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset; - *regUsed = localRegBufUsed; free(regData); return ret; fail: - localRegBufUsed = false; + *regUsed = false; goto exit; } @@ -862,19 +876,21 @@ exit: } if (recvRecord) { + // Yes, it's a dead code. That's fine... + // coverity[dead_error_begin] ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size); free(recvRecord); } } else { if (sendRecord) { *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend); - ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base); + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord); *nCleanupQueueEltsAdded += 1; } if (recvRecord) { *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv); - ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base); + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord); *nCleanupQueueEltsAdded += 1; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 90a714b..6569ae1 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -7,9 +7,11 @@ #include "comm.h" #include "graph.h" #include "utils.h" -#include "shm.h" +#include "shmutils.h" #include "p2p.h" #include "transport.h" +#include +#include "shm.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; @@ -19,16 +21,28 @@ struct ncclP2pBuff { ncclIpcDesc ipcDesc; }; +struct ncclP2pRequest { + size_t size; + int refcount; +}; + struct p2pConnectInfo { int rank; int read; struct ncclP2pBuff p2pBuff; // Used by CE memcpy - char shmName[7]; - int shmSize; + ncclShmIpcDesc_t desc; }; static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large"); +struct p2pIpcExpInfo { + ncclIpcDesc ipcDesc; + bool legacyIpcCap; + int impFd; + size_t size; + uintptr_t offset; +}; + struct p2pShm { struct ncclSendMem sendMem; struct ncclRecvMem recvMem; @@ -37,9 +51,7 @@ struct p2pShmProxyInfo { // Shared memory between proxy and receiving GPU struct p2pShm* shm; struct p2pShm* devShm; - char shmName[7]; - int shmSize; - ncclShmHandle_t handle; + ncclShmIpcDesc_t desc; // Intermediate step for sender struct ncclRecvMem* ceRecvMem; @@ -62,13 +74,14 @@ struct p2pResources { struct ncclRecvMem* recvDevMem; }; void* sendMemIpc; + int sendMemSameProc; void* recvMemIpc; + int recvMemSameProc; // CE memcpy support struct p2pShmProxyInfo proxyInfo; struct p2pShm* shm; struct p2pShm* devShm; - int shmSize; - ncclShmHandle_t handle; + ncclShmIpcDesc_t desc; }; // cuMem API support @@ -104,12 +117,12 @@ static void initCeOperation(); extern int64_t ncclParamMNNVLEnable(); /* Determine if two peers can communicate through p2p */ -ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { initCeOperation(); // MNNVL support - if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) { - 
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret)); + if (comm->MNNVL && info1->hostHash != info2->hostHash) { + NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret)); if (*ret) return ncclSuccess; } @@ -121,7 +134,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop // Check topology / p2p level. int intermediateRank; - NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank)); if (*ret == 0) return ncclSuccess; if (intermediateRank != -1) { if (useMemcpy) *ret = 0; @@ -130,7 +143,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop // Check if NET would work better int useNet = 0; - NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet)); if (useNet) { *ret = 0; return ncclSuccess; @@ -197,7 +210,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop } while (0) // cuMem API support -ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) { +ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDesc *ipcDesc, void **ptr) { if (ncclCuMemEnable()) { #if CUDART_VERSION >= 11030 CUmemAllocationHandleType type = ncclCuMemHandleType; @@ -211,6 +224,10 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v } else { CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0)); } + if (refcount) { + memcpy(&ipcDesc->memHandle, &handle, sizeof(handle)); + for (int r = 0; r < refcount; ++r) CUCHECK(cuMemRetainAllocationHandle(&handle, *ptr)); + } #else return ncclInternalError; #endif @@ -233,7 +250,7 @@ ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { return ncclSuccess; } -ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) { +ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) { if (ncclCuMemEnable()) { #if CUDART_VERSION >= 11030 // cuMem API support @@ -241,16 +258,25 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz CUmemAllocationHandleType type = ncclCuMemHandleType; CUmemGenericAllocationHandle handle; ncclCuDesc *cuDesc = &ipcDesc->cuDesc; + CUmemAllocationProp prop = {}; + size_t granularity = 0; + + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = type; + prop.location.id = comm->cudaDev; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + ALIGN_SIZE(size, granularity); // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // UDS fd support int fd = -1; // Send cuMem handle to remote for conversion to an fd - NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd)); - INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer); + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, peer, &cuDesc->data, &fd)); + INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, peer); CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); - (void) close(fd); + 
SYSCHECK(close(fd), "close"); } else { CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type)); } @@ -291,7 +317,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* int p2p; // Queries the topology to see if the GPUs are Ampere and // connected via NVLink, if so we enable P2P Read by default - NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank)); int readEnable = ncclParamP2pReadEnable(); if (readEnable != -2) *read = readEnable; @@ -311,24 +337,23 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } -#if CUDART_VERSION >= 11030 - // cuMem API support if (ncclCuMemEnable()) { - // Allow direct access to the remote buffer from the local GPU - CUmemAccessDesc accessDesc = {}; - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = myInfo->cudaDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); - CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1)); + // for intra-process ranks, we should map memHandle of the peers to increase refcount. + // Otherwise, if peers abort and free the buffer, the rank can suffer invalid access. + NCCLCHECK(ncclCuMemAllocAddr(devMem, &p2pBuff->ipcDesc.memHandle, p2pBuff->size)); + CUCHECK(cuMemRelease(p2pBuff->ipcDesc.memHandle)); + *ipcPtr = *devMem; + } else { + *devMem = p2pBuff->directPtr; + *ipcPtr = NULL; } -#endif + } else { + *devMem = p2pBuff->directPtr; + *ipcPtr = NULL; } - *devMem = p2pBuff->directPtr; - *ipcPtr = NULL; } else { // Different PID - NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem)); + NCCLCHECK(ncclP2pImportShareableBuffer(comm, peerInfo->rank, p2pBuff->size, &p2pBuff->ipcDesc, devMem)); *ipcPtr = *devMem; } return ncclSuccess; @@ -338,7 +363,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct p2pResources* resources; - int tpProxyRank; + struct ncclP2pRequest req; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, intermediateRank; @@ -387,15 +412,18 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st comm->peerInfo[intermediateRank].nvmlDev, useReadStr); } - tpProxyRank = comm->topParentRanks[info->rank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn)); + req.size = sendSize; + req.refcount = 0; + if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; + if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); if (useMemcpy) { NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo))); - 
info->shmSize = resources->proxyInfo.shmSize; - memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); + memcpy(&info->desc, &resources->proxyInfo.desc, sizeof(ncclShmIpcDesc_t)); } else { - NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc)); + resources->sendMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank)); } return ncclSuccess; @@ -405,7 +433,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { struct p2pResources* resources; - int tpProxyRank; + struct ncclP2pRequest req; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; @@ -444,11 +472,15 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = intermediateRank; } - tpProxyRank = comm->topParentRanks[info->rank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn)); - NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + req.size = recvSize; + req.refcount = 0; + if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; + if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn)); + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc)); + resources->recvMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank)); return ncclSuccess; } @@ -459,6 +491,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); + resources->recvMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank)); char* buff = (char*)(remDevMem+1); for (int p=0; pshmName); - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); - resources->shmSize = info->shmSize; // Attach to peer's SHM segment - NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; } else { NCCLCHECK(p2pMap(comm, 
&recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + resources->sendMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank)); struct ncclRecvMem* devMem = resources->recvDevMem; recv->conn.tail = &devMem->tail; @@ -538,8 +568,21 @@ ncclResult_t p2pSendFree(struct ncclConnector* send) { if (resources) { if (ncclCuMemEnable()) { // cuMem API support - if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); - if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); + if (resources->sendMemIpc) { + if (resources->sendMemSameProc) { + NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc)); + } else { + NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); + } + } + + if (resources->recvMemIpc) { + if (resources->recvMemSameProc) { + NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc)); + } else { + NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); + } + } } else { if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); @@ -555,14 +598,27 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) { if (resources) { if (ncclCuMemEnable()) { // cuMem API support - if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); - if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); + if (resources->sendMemIpc) { + if (resources->sendMemSameProc) { + NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc)); + } else { + NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); + } + } + + if (resources->recvMemIpc) { + if (resources->recvMemSameProc) { + NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc)); + } else { + NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); + } + } } else { if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); if (useMemcpy) { - NCCLCHECK(ncclShmClose(resources->handle)); + NCCLCHECK(ncclShmIpcClose(&resources->desc)); } } free(resources); @@ -574,29 +630,27 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st if (useMemcpy) { // CE memcpy support struct p2pShmProxyInfo* proxyInfo; + size_t shmSize; + + if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); connection->transportResources = proxyInfo; NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); - char shmPath[PATH_MAX]; - shmPath[0] = '\0'; - proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); // Create a SHM segment for the peer to attach to - NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle)); - TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize); - memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName)); + shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); + NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); - - if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); } else { - if (reqSize != sizeof(int)) return ncclInternalError; - int size = *((int*)reqBuff); + struct ncclP2pRequest* req = (struct 
ncclP2pRequest*)reqBuff; + if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError; + int size = req->size; if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; - NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); + NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); p2pBuff->size = size; if (ncclCuMemEnable()) { // cuMem API support @@ -613,11 +667,12 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st } static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { - if (reqSize != sizeof(int)) return ncclInternalError; - int size = *((int*)reqBuff); + struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff; + if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError; + int size = req->size; if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; - NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); + NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); p2pBuff->size = size; if (ncclCuMemEnable()) { // cuMem API support @@ -651,7 +706,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str if (useMemcpy) { struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources; if (proxyInfo) { - NCCLCHECK(ncclShmClose(proxyInfo->handle)); + NCCLCHECK(ncclShmIpcClose(&proxyInfo->desc)); NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff)); CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); @@ -752,11 +807,382 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru return ncclSuccess; } +ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + struct ncclIpcRegInfo* newInfo = NULL; + uintptr_t* peerRmtAddrs = NULL; + bool legacyIpcCap = false; + size_t baseSize = 0; + void* baseAddr = NULL; + bool needUpdate = false; + + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail); + if (regRecord) { + // buffer was registered by users, we need to register or reuse it + int peerLocalRank; + for (int p = 0; p < nPeers; p++) { + int peerRank = peerRanks[p]; + peerLocalRank = comm->rankToLocalRank[peerRank]; + if (regRecord->ipcInfos[peerLocalRank]) { + // We already have IPC info for peerLocalRank, no need to register it, we can reuse it + *regBufFlag = 1; + INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); + } else { + // Register buffer with peerLocalRank + struct ncclProxyConnector* proxyConn = NULL; + struct p2pIpcExpInfo ipcInfo; + + if (baseAddr == NULL) { +
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); + } + if (comm->gproxyConn[peerRank].initialized == false) + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); + proxyConn = &comm->gproxyConn[peerRank]; + + ipcInfo.legacyIpcCap = legacyIpcCap; + // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll + // get the CUDA legacy mem handle, or through cuMem*. + if (ipcInfo.legacyIpcCap) { + // legacy export + if (comm->directMode) goto fail; + CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); + } else if (ncclCuMemEnable()) { + CUmemGenericAllocationHandle handle; + if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) { + // if cuMem* export fails, retry legacy export + if (comm->directMode) goto fail; + CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); + ipcInfo.legacyIpcCap = true; + } else { + // cuMem* export to file descriptor or fabric handle + if (proxyConn->sameProcess) { + memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int expFd = -1; + CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); + NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); + SYSCHECKGOTO(close(expFd), "close", ret, fail); + } else { + // Allow this to silently fail for cases where the user buff cannot be registered + if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) { + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + goto fail; + } + } + } + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + } + } else { + // nothing works, just return + goto fail; + } + + void* rmtRegAddr = NULL; + ipcInfo.size = baseSize; + ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; + // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side + // and get the remote register address back. 
+ if (proxyConn) + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + if (rmtRegAddr) { + NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); + assert(regRecord->ipcInfos[peerLocalRank] == NULL); + regRecord->state |= IPC_REG_COMPLETE; + newInfo->peerRank = peerRank; + newInfo->baseAddr = baseAddr; + newInfo->impInfo.rmtRegAddr = rmtRegAddr; + newInfo->impInfo.offset = ipcInfo.offset; + newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; + newInfo->ipcProxyconn = proxyConn; + regRecord->ipcInfos[peerLocalRank] = newInfo; + if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) { + NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); + } + regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; + needUpdate = true; + *regBufFlag = 1; + INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + } + } + } + + if (*regBufFlag) { + if (type == NCCL_IPC_COLLECTIVE) { + // for collective, store registered remote buffers into dev memory for future reference + if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) + NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + if (needUpdate) + NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); + } + peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; + } else { + assert(nPeers == 1); + // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct + peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; + } + *offsetOut = (uintptr_t)userbuff - regRecord->addr; + *peerRmtAddrsOut = peerRmtAddrs; + } + } + } + +exit: + return ret; +fail: + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + if (newInfo) free(newInfo); + goto exit; +} + +struct ncclIpcCleanupCallback { + struct ncclCommCallback base; + bool isAddrs; + union { + struct ncclIpcRegInfo regInfo; + struct ncclPeerRegIpcAddr regIpcAddrs; + }; +}; + +static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb; + if (obj->isAddrs) { + if (obj->regIpcAddrs.hostPeerRmtAddrs) + free(obj->regIpcAddrs.hostPeerRmtAddrs); + if (obj->regIpcAddrs.devPeerRmtAddrs) + NCCLCHECK(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs)); + } else { + NCCLCHECK(ncclIpcDeregBuffer(comm, &obj->regInfo)); + } + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void*
cleanupQueuePtr, int* nCleanupQueueElts) { + ncclResult_t ret = ncclSuccess; + struct ncclProxyConnector* proxyConn = NULL; + struct p2pIpcExpInfo ipcInfo; + void* baseAddr; + size_t baseSize; + struct ncclIntruQueue* cleanupQueue = reinterpret_cast*>(cleanupQueuePtr); + uintptr_t* peerRmtAddrs = NULL; + struct ncclIpcCleanupCallback* addrsRecord = NULL; + + *regBufFlag = 0; + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); + + if (type == NCCL_IPC_COLLECTIVE) { + // collective needs host memory array to hold all remote buffer addrs. + // We need to put this into graph release queue + NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail); + addrsRecord->base.fn = cleanupIpc; + addrsRecord->isAddrs = true; + NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); + } else { + assert(nPeers == 1); + // p2p does not need anything, just returning the remote buffer is enough, but for now, we register + // peer one by one so nPeers must be 1 + } + + for (int p = 0; p < nPeers; ++p) { + int peerRank = peerRanks[p]; + if (comm->gproxyConn[peerRank].initialized == false) + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); + proxyConn = &comm->gproxyConn[peerRank]; + // Same as local registration. Get the mem handle for that buffer. It may have been allocated through + // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*. + if (ipcInfo.legacyIpcCap) { + if (comm->directMode) goto fail; + CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); + } else if (ncclCuMemEnable()) { + // cuMem* export + CUmemGenericAllocationHandle handle; + if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) { + if (comm->directMode) goto fail; + CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); + ipcInfo.legacyIpcCap = true; + } else { + if (proxyConn->sameProcess) { + memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int expFd = -1; + CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); + if (proxyConn->sameProcess) { + ipcInfo.impFd = expFd; + } else { + NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); + SYSCHECKGOTO(close(expFd), "close", ret, fail); + } + } else { + CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail); + } + } + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + } + } else { + goto fail; + } + + void* rmtRegAddr = NULL; + ipcInfo.size = baseSize; + ipcInfo.offset = 0; + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + if (rmtRegAddr) { + struct ncclIpcCleanupCallback* record; + NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); + record->base.fn = cleanupIpc; + record->isAddrs = false; + record->regInfo.peerRank = peerRank; + record->regInfo.baseAddr = baseAddr; + record->regInfo.impInfo.rmtRegAddr = rmtRegAddr; + record->regInfo.impInfo.offset = 0; + record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; + 
record->regInfo.ipcProxyconn = proxyConn; + // store the remote address into host addr array + if (type == NCCL_IPC_COLLECTIVE) + addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr; + else + peerRmtAddrs = (uintptr_t*)rmtRegAddr; + *regBufFlag = 1; + if (ipcInfo.legacyIpcCap) + ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base); + else + ncclIntruQueueEnqueue(cleanupQueue, &record->base); + if (nCleanupQueueElts) *nCleanupQueueElts += 1; + INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr); + } + } + + if (type == NCCL_IPC_COLLECTIVE) { + // allocate the dev addr array and copy all previously stored addrs into it. + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->nRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); + peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs; + if (ipcInfo.legacyIpcCap) + ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base); + else + ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base); + } + *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr; + *peerRmtAddrsOut = peerRmtAddrs; + +exit: + return ret; +fail: + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + goto exit; +} + +ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo) { + NCCLCHECK(ncclProxyCallBlocking(comm, regInfo->ipcProxyconn, ncclProxyMsgDeregister, &regInfo->impInfo, sizeof(struct ncclIpcImpInfo), NULL, 0)); + INFO(NCCL_REG, "rank %d - IPC deregistered buffer %p peer %d ipc remote buffer %p", comm->rank, regInfo->baseAddr, regInfo->peerRank, regInfo->impInfo.rmtRegAddr); + return ncclSuccess; +} + +static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff; + void* regAddr = NULL; + ncclResult_t ret = ncclSuccess; + bool mapped = false; + bool imported = false; + CUmemGenericAllocationHandle handle; + + assert(sizeof(struct p2pIpcExpInfo) == reqSize); + assert(sizeof(void*) == respSize); + + // request peer passes all necessary buffer info to import. The proxy thread would register + // the buffer locally and return register addr back + if (ipcExpInfo->legacyIpcCap) { + // legacy import + CUDACHECKGOTO(cudaIpcOpenMemHandle(&regAddr, ipcExpInfo->ipcDesc.devIpc, cudaIpcMemLazyEnablePeerAccess), ret, fail); + regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset); + } else { + // cuMem import + if (connection->sameProcess) { + // if proxy is same process as request peer, we just need to map the handle.
+ memcpy(&handle, &ipcExpInfo->ipcDesc.memHandle, sizeof(CUmemGenericAllocationHandle)); + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)(uintptr_t)ipcExpInfo->impFd, ncclCuMemHandleType), ret, fail); + SYSCHECKGOTO(close(ipcExpInfo->impFd), "close", ret, fail); + } else { + CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)&ipcExpInfo->ipcDesc.cuDesc, ncclCuMemHandleType), ret, fail); + } + } + imported = true; + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&regAddr, ipcExpInfo->size, /* alignment */ 0, /* addr */ 0, /* flags */ 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)regAddr, ipcExpInfo->size, /* offset */ 0, handle, /* flags */ 0), ret, fail); + mapped = true; + // Allow access by the local GPU + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = proxyState->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail); + regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset); + } + INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + +exit: + memcpy(respBuff, (void*)&regAddr, sizeof(void*)); + *done = 1; + return ret; +fail: + if (!ipcExpInfo->legacyIpcCap) { + if (mapped) CUCHECK(cuMemUnmap((CUdeviceptr)regAddr, ipcExpInfo->size)); + if (regAddr) CUCHECK(cuMemAddressFree((CUdeviceptr)regAddr, ipcExpInfo->size)); + if (imported) CUCHECK(cuMemRelease(handle)); + } + regAddr = NULL; + goto exit; +} + +static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { + ncclResult_t ret = ncclSuccess; + struct ncclIpcImpInfo* ipcInfo = (struct ncclIpcImpInfo*)reqBuff; + assert(sizeof(struct ncclIpcImpInfo) == reqSize); + + if (ipcInfo->legacyIpcCap) { + CUDACHECKGOTO(cudaIpcCloseMemHandle((void*)((uintptr_t)ipcInfo->rmtRegAddr - ipcInfo->offset)), ret, fail); + } else { + if (connection->sameProcess) { + NCCLCHECKGOTO(ncclCuMemFreeAddr((void*)((uintptr_t)ipcInfo->rmtRegAddr - ipcInfo->offset)), ret, fail); + } else { + NCCLCHECKGOTO(ncclCudaFree((void*)((uintptr_t)ipcInfo->rmtRegAddr - ipcInfo->offset)), ret, fail); + } + } + +exit: + *done = 1; + return ret; +fail: + goto exit; +} + struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, - { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister } }; static void initCeOperation() { diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 7fc6251..9be95fd 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -5,35 +5,58 @@ ************************************************************************/ #include "comm.h" +#include "shmutils.h" #include "shm.h" #include "transport.h" -struct shmConnectInfo { - char shmName[7]; - int shmSize; +#define
SHM_PATH_MAX 128 +#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + +struct shmBuffInfo { + void *hptr; + void *dptr; +}; + +struct shmConnectInfo { + ncclShmIpcDesc_t desc; + struct shmBuffInfo buf; }; -static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large"); struct shmSendResources { - int remShmSize; struct ncclRecvMem* remHostMem; struct ncclRecvMem* devRemHostMem; - ncclShmHandle_t remHandle; - int shmSize; + ncclShmIpcDesc_t remDesc; struct ncclSendMem* hostMem; struct ncclSendMem* devHostMem; - ncclShmHandle_t hostHandle; }; struct shmRecvResources { - int remShmSize; struct ncclSendMem* remHostMem; struct ncclSendMem* devRemHostMem; - ncclShmHandle_t remHandle; - int shmSize; + ncclShmIpcDesc_t remDesc; struct ncclRecvMem* hostMem; struct ncclRecvMem* devHostMem; - ncclShmHandle_t hostHandle; +}; + +struct shmProxyInfo { + struct ncclRecvMem* ceRecvMem; + char* devFifo; + char* shmFifo; + struct ncclSendMem* sendMem; + struct ncclRecvMem* recvMem; + + // used by progress only + uint64_t step; + cudaStream_t stream; + cudaEvent_t events[NCCL_STEPS]; + + // ipc desc + ncclShmIpcDesc_t desc; +}; + +struct shmRequest { + size_t size; + bool legacy; }; #define SHM_SEND_SIDE 1 @@ -48,14 +71,14 @@ static int shmLocality = 0; static void initCeOperation(); /* Determine two peers can communicate with SHM */ -static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 0; initCeOperation(); if (ncclParamShmDisable() == 1) return ncclSuccess; int useNet = 0; - NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet)); if (useNet) return ncclSuccess; // Same host? 
@@ -76,22 +99,29 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct /* Create and return connect structures for this peer to connect to me */ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct shmSendResources* resources; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; + size_t shmSize = sizeof(struct ncclSendMem); + struct shmRequest req; + + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); + NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); - struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - - char shmPath[PATH_MAX]; - shmPath[0] = '\0'; - int shmSize = sizeof(struct ncclSendMem); if (shmLocality == SHM_SEND_SIDE) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p]; } - info->shmSize = resources->shmSize = shmSize; - NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle)); - TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); - memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); + req.size = shmSize; + if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash) + req.legacy = true; + else + req.legacy = false; + + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn)); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + + resources->hostMem = (struct ncclSendMem*)info->buf.hptr; + resources->devHostMem = (struct ncclSendMem*)info->buf.dptr; INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%d] -> %d[%d] via SHM/%s/%s", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct"); return ncclSuccess; @@ -99,52 +129,43 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct shmRecvResources* resources; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; + size_t shmSize = sizeof(struct ncclRecvMem); + struct shmRequest req; + NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); - struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmPath[PATH_MAX]; - shmPath[0] = '\0'; - int shmSize = sizeof(struct ncclRecvMem); if (shmLocality == SHM_RECV_SIDE) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p]; } - info->shmSize = resources->shmSize = shmSize; - NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle)); - TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); - memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); + req.size = shmSize; + if (myInfo->hostHash ==
peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash) + req.legacy = true; + else + req.legacy = false; + + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn)); + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + + resources->hostMem = (struct ncclRecvMem*)info->buf.hptr; + resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr; return ncclSuccess; } -struct shmProxyInfo { - struct ncclRecvMem* ceRecvMem; - char* devFifo; - char* shmFifo; - struct ncclSendMem* sendMem; - struct ncclRecvMem* recvMem; - - // used by progress only - uint64_t step; - cudaStream_t stream; - cudaEvent_t events[NCCL_STEPS]; -}; - /* Connect to this peer */ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; + char* buff; - char shmPath[PATH_MAX]; - sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); - resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); - NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); - char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); + buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { send->conn.buffs[p] = buff; buff += comm->buffSizes[p]; @@ -157,9 +178,6 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co send->conn.connFifo = resources->devRemHostMem->connFifo; } if (useMemcpySend) { - int tpProxyRank; - tpProxyRank = comm->topParentRanks[comm->rank]; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn)); struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; @@ -177,14 +195,11 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; + char* buff; - char shmPath[PATH_MAX]; - sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); - resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); - NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); - char* buff = shmLocality == SHM_RECV_SIDE ?
(char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); + buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { recv->conn.buffs[p] = buff; buff += comm->buffSizes[p]; @@ -194,7 +209,6 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; if (useMemcpyRecv) { - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; @@ -210,8 +224,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co static ncclResult_t shmSendFree(struct ncclConnector* send) { struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; if (resources) { - NCCLCHECK(ncclShmClose(resources->hostHandle)); - NCCLCHECK(ncclShmClose(resources->remHandle)); + NCCLCHECK(ncclShmIpcClose(&resources->remDesc)); free(resources); send->transportResources = NULL; } @@ -221,8 +234,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) { static ncclResult_t shmRecvFree(struct ncclConnector* recv) { struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; if (resources) { - NCCLCHECK(ncclShmClose(resources->hostHandle)); - NCCLCHECK(ncclShmClose(resources->remHandle)); + NCCLCHECK(ncclShmIpcClose(&resources->remDesc)); free(resources); recv->transportResources = NULL; } @@ -230,51 +242,76 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) { } static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; + if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; struct shmProxyInfo* proxyInfo; - NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; - memcpy(proxyInfo, reqBuff, reqSize); - NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); - NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); - CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff; + + proxyInfo = (struct shmProxyInfo*)connection->transportResources; + proxyInfo->shmFifo = reqInfo->shmFifo; + proxyInfo->sendMem = reqInfo->sendMem; + proxyInfo->recvMem = reqInfo->recvMem; + NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail); for (int i=0; i<NCCL_STEPS; i++) { - CUDACHECK(cudaEventCreate(proxyInfo->events+i)); + CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail); } connection->proxyAppendPtr = &connection->proxyAppend; connection->transportResources = proxyInfo; if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, respSize); - return ncclSuccess; + *done = 1; +exit: + return ret; +fail: + if
(proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem); + if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo); + free(proxyInfo); + goto exit; } static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; + if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; struct shmProxyInfo* proxyInfo; - NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; - memcpy(proxyInfo, reqBuff, reqSize); - NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); - NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); - CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff; + + proxyInfo = (struct shmProxyInfo*)connection->transportResources; + proxyInfo->shmFifo = reqInfo->shmFifo; + proxyInfo->sendMem = reqInfo->sendMem; + proxyInfo->recvMem = reqInfo->recvMem; + NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail); for (int i=0; i<NCCL_STEPS; i++) { - CUDACHECK(cudaEventCreate(proxyInfo->events+i)); + CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail); } connection->proxyAppendPtr = &connection->proxyAppend; - connection->transportResources = proxyInfo; - if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, respSize); - return ncclSuccess; + *done = 1; +exit: + return ret; +fail: + if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem); + if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo); + free(proxyInfo); + goto exit; } static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; if (resources) { - CUDACHECK(cudaStreamDestroy(resources->stream)); - NCCLCHECK(ncclCudaFree(resources->devFifo)); - NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); - for (int i=0; i<NCCL_STEPS; i++) CUDACHECK(cudaEventDestroy(resources->events[i])); + if (useMemcpySend) { + CUDACHECK(cudaStreamDestroy(resources->stream)); + NCCLCHECK(ncclCudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; i<NCCL_STEPS; i++) CUDACHECK(cudaEventDestroy(resources->events[i])); + } } + NCCLCHECK(ncclShmIpcClose(&resources->desc)); free(connection->transportResources); connection->transportResources = NULL; } @@ -285,12 +322,15 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; if (resources) { - CUDACHECK(cudaStreamDestroy(resources->stream)); - NCCLCHECK(ncclCudaFree(resources->devFifo)); - NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); - for (int i=0; i<NCCL_STEPS; i++) CUDACHECK(cudaEventDestroy(resources->events[i])); + if (useMemcpyRecv) { + CUDACHECK(cudaStreamDestroy(resources->stream)); + NCCLCHECK(ncclCudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; i<NCCL_STEPS; i++) CUDACHECK(cudaEventDestroy(resources->events[i])); + } } + NCCLCHECK(ncclShmIpcClose(&resources->desc)); free(connection->transportResources); connection->transportResources = NULL; } @@ -413,12 +453,37 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
return ncclSuccess; } -struct ncclTransport shmTransport = { - "SHM", - shmCanConnect, - { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL }, - { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL } -}; +static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct shmRequest* req = (struct shmRequest*)reqBuff; + /* check message size */ + if (reqSize != sizeof(struct shmRequest)) return ncclInternalError; + if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError; + + struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff; + struct shmProxyInfo* proxyInfo; + + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr)); + memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); + connection->transportResources = proxyInfo; + return ncclSuccess; +} + +static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct shmRequest* req = (struct shmRequest*)reqBuff; + /* check message size */ + if (reqSize != sizeof(struct shmRequest)) return ncclInternalError; + if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError; + + struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff; + struct shmProxyInfo* proxyInfo; + + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr)); + memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); + connection->transportResources = proxyInfo; + return ncclSuccess; +} static void initCeOperation() { static int init = 0; @@ -427,12 +492,10 @@ static void initCeOperation() { useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2); if (useMemcpySend) { shmTransport.send.proxyConnect = shmSendProxyConnect; - shmTransport.send.proxyFree = shmSendProxyFree; shmTransport.send.proxyProgress = shmSendProxyProgress; } if (useMemcpyRecv) { shmTransport.recv.proxyConnect = shmRecvProxyConnect; - shmTransport.recv.proxyFree = shmRecvProxyFree; shmTransport.recv.proxyProgress = shmRecvProxyProgress; } shmLocality = ncclParamShmLocality(); @@ -443,3 +506,152 @@ static void initCeOperation() { init = 1; } } + +ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { + if (desc == NULL || hptr == NULL || tpProxyRank < -1) { + WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank); + return ncclInvalidArgument; + } +#if CUDART_VERSION >= 12020 + if (ncclCuMemEnable() && ncclCuMemHostEnable() && !legacy) { + // cuMem API support + CUmemAllocationHandleType type = SHM_HANDLE_TYPE; + CUmemGenericAllocationHandle handle; + + NCCLCHECK(ncclCuMemHostAlloc(hptr, &handle, size)); + if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + // Return the native cuMem handle for later Export/Import via UDS + memcpy(&desc->shmci.data, &handle, sizeof(handle)); + desc->shmci.tpProxyRank = tpProxyRank; + } else { + CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0)); + } + desc->shmci.size = size; + desc->shmci.ptr = *hptr; + if (dptr) 
*dptr = *hptr; + desc->legacy = false; + INFO(NCCL_SHM, "CUMEM allocated shareable buffer %p size %zi", desc->shmci.ptr, desc->shmci.size); + } else { + char shmPath[SHM_PATH_MAX] = { '\0' }; + desc->shmli.shmSize = size; + NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle)); + memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix)); + desc->legacy = true; + INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); + } +#else /* CUDART_VERSION >= 12020 */ + char shmPath[SHM_PATH_MAX] = { '\0' }; + desc->shmli.shmSize = size; + NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle)); + memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix)); + desc->legacy = true; + INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr); +#endif /* CUDART_VERSION >= 12020 */ + return ncclSuccess; +} + +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { + if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) { + WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut); + return ncclInvalidArgument; + } +#if CUDART_VERSION >= 12020 + if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) { + // cuMem API support + CUdeviceptr hostptr = 0; + CUmemAllocationHandleType type = SHM_HANDLE_TYPE; + CUmemGenericAllocationHandle handle; + int cudaDev; + CUdevice currentDev; + CUmemAccessDesc accessDesc = {}; + int cpuNumaNodeId; + size_t granularity; + size_t size = desc->shmci.size; + CUmemAllocationProp prop = {}; + + // Import and map the remote memory descriptor to the local GPU + if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + // UDS fd support + int fd = -1; + // Send cuMem handle to remote for conversion to an fd + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd)); + CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); + (void) close(fd); + } else { + CUCHECK(cuMemImportFromShareableHandle(&handle, &desc->shmci.handle, type)); + } + + // Get cpu numa id + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(&currentDev, cudaDev)); + CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev)); + if (cpuNumaNodeId < 0) cpuNumaNodeId = 0; + + // Get granularity + prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.requestedHandleTypes = type; + prop.location.id = cpuNumaNodeId; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + ALIGN_SIZE(size, granularity); + + // Reserve and map address + CUCHECK(cuMemAddressReserve(&hostptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); + CUCHECK(cuMemMap(hostptr, size, /* offset */ 0, handle, /* flags */ 0)); + + // Allow access by the local GPU + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1)); + + // Allow access by the local numa + accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; + accessDesc.location.id = cpuNumaNodeId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess(hostptr, size,
&accessDesc, 1)); + + descOut->shmci.ptr = *hptr = (void *)hostptr; + descOut->legacy = false; + if (dptr) *dptr = (void *)hostptr; + INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); + } else { + char shmPath[SHM_PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix); + NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); + descOut->legacy = true; + INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); + } +#else /* CUDART_VERSION >= 12020 */ + char shmPath[SHM_PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix); + NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); + descOut->legacy = true; + INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); +#endif + return ncclSuccess; +} + +ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc) { + if (desc) { +#if CUDART_VERSION >= 12020 + if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) { + NCCLCHECK(ncclCuMemHostFree(desc->shmci.ptr)); + } else { + NCCLCHECK(ncclShmClose(desc->shmli.handle)); + } +#else + NCCLCHECK(ncclShmClose(desc->shmli.handle)); +#endif + } + + return ncclSuccess; +} + +struct ncclTransport shmTransport = { + "SHM", + shmCanConnect, + { shmSendSetup, shmSendConnect, shmSendFree, NULL, shmSendProxySetup, NULL, shmSendProxyFree, NULL }, + { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, shmRecvProxySetup, NULL, shmRecvProxyFree, NULL } +}; From 1a16f427507cb985123e7565874f57b77a1ba2df Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Mon, 16 Sep 2024 14:47:37 -0700 Subject: [PATCH 2/2] Add missing header files --- src/include/nccl_profiler.h | 150 ++++++++++++++++++++++++++++++++++++ src/include/shmutils.h | 26 +++++++ 2 files changed, 176 insertions(+) create mode 100644 src/include/nccl_profiler.h create mode 100644 src/include/shmutils.h diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h new file mode 100644 index 0000000..556a0f6 --- /dev/null +++ b/src/include/nccl_profiler.h @@ -0,0 +1,150 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +#include + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileNumEvents = ( 6), +}; + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_v1_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v1_t; + +typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v1_t 
ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v1_t ncclProfiler_t; + +#endif diff --git a/src/include/shmutils.h b/src/include/shmutils.h new file mode 100644 index 0000000..43e8afb --- /dev/null +++ b/src/include/shmutils.h @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SHMUTILS_H_ +#define NCCL_SHMUTILS_H_ + +#include "nccl.h" + +typedef void* ncclShmHandle_t; +ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); +ncclResult_t ncclShmClose(ncclShmHandle_t handle); +ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); + +struct ncclShmemCollBuff { + volatile size_t *cnt[2]; + volatile void *ptr[2]; + int round; + size_t maxTypeSize; +}; + +ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); + +#endif
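As an illustration of how the ncclProfiler_v1_t interface declared in src/include/nccl_profiler.h above might be implemented, here is a minimal sketch of an external profiler plugin. It is not part of the patch: the exported symbol name ncclProfiler_v1, the use of nccl.h for ncclResult_t and ncclSuccess, and the choice to subscribe only to proxy events are assumptions made for this sketch, and the logging is a placeholder.

/* minimal_profiler.c - illustrative sketch of a plugin matching ncclProfiler_v1_t.
 * Assumption: NCCL resolves the plugin through an exported "ncclProfiler_v1" symbol,
 * and ncclResult_t/ncclSuccess come from nccl.h. */
#include <stdio.h>
#include "nccl.h"
#include "nccl_profiler.h"

static ncclResult_t exampleInit(void** context, int* eActivationMask) {
  *context = NULL;
  /* Subscribe only to proxy operation and proxy step events. */
  *eActivationMask = ncclProfileProxyOp | ncclProfileProxyStep;
  return ncclSuccess;
}

static ncclResult_t exampleStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) {
  (void)context;
  if (eDescr->type == ncclProfileProxyOp) {
    /* proxyOp descriptors carry the channel, peer and step count of the operation. */
    fprintf(stderr, "proxyOp start: rank %d channel %d peer %d nSteps %d\n",
            eDescr->rank, (int)eDescr->proxyOp.channelId, eDescr->proxyOp.peer, eDescr->proxyOp.nSteps);
  }
  *eHandle = NULL; /* this sketch keeps no per-event state */
  return ncclSuccess;
}

static ncclResult_t exampleStopEvent(void* eHandle) {
  (void)eHandle;
  return ncclSuccess;
}

static ncclResult_t exampleRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState,
                                            ncclProfilerEventStateArgs_v1_t* eStateArgs) {
  (void)eHandle; (void)eState; (void)eStateArgs;
  return ncclSuccess;
}

static ncclResult_t exampleFinalize(void* context) {
  (void)context;
  return ncclSuccess;
}

/* Exported plugin descriptor; field order follows the ncclProfiler_v1_t declaration above. */
ncclProfiler_v1_t ncclProfiler_v1 = {
  .name = "ExampleProfiler",
  .init = exampleInit,
  .startEvent = exampleStartEvent,
  .stopEvent = exampleStopEvent,
  .recordEventState = exampleRecordEventState,
  .finalize = exampleFinalize,
};

How NCCL locates and loads such a shared library is not described in the headers above, so the build and loading steps are intentionally left out of this sketch.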