Merge tag 'v2.19.3-1'

Sylvain Jeaugey 2023-10-25 06:51:36 -07:00
commit 0e35f5d390
108 changed files with 4826 additions and 2114 deletions

View File

@ -5,7 +5,7 @@
#
NCCL_HOME:=../../build/
CUDA_HOME:=/usr/local/cuda
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO:=libnccl-net.so
default: $(PLUGIN_SO)

View File

@ -24,6 +24,7 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "net_v7.h"
#include "net_v6.h"
#include "net_v5.h"
#include "net_v4.h"

View File

@ -0,0 +1,31 @@
/*************************************************************************
* Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_DEVICE_H_
#define NET_DEVICE_H_
#include "net_device.h"
#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
#define NCCL_NET_MTU_SIZE 4096
// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
typedef struct {
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
void* handle;
size_t size;
int needsProxyProgress;
} ncclNetDeviceHandle_v7_t;
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
#endif
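
As a hedged illustration of how these fields fit together, the helper below shows how a plugin's connect or accept path might populate a v7 device handle before returning it to NCCL. Only the struct, the enum, and the version macro come from the header above; the helper name and the device state it wraps are hypothetical.

#include <stdlib.h>

// Hypothetical helper (assumes the header above is included). "devState" is a
// made-up placeholder for whatever device-visible state the plugin maintains.
static ncclNetDeviceHandle_v7_t* makeDevHandle(void* devState, size_t stateSize) {
  ncclNetDeviceHandle_v7_t* h = (ncclNetDeviceHandle_v7_t*)calloc(1, sizeof(*h));
  if (h == NULL) return NULL;
  h->netDeviceType = NCCL_NET_DEVICE_UNPACK;              // offload type from the enum above
  h->netDeviceVersion = NCCL_NET_DEVICE_UNPACK_VERSION;   // must match the version NCCL expects
  h->handle = devState;                                   // device-visible plugin state
  h->size = stateSize;
  h->needsProxyProgress = 1;                              // assumption: host proxy progress still needed
  return h;
}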

View File

@ -18,8 +18,6 @@ typedef struct {
int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v6_t;
typedef ncclNetProperties_v6_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
const char* name;

View File

@ -0,0 +1,79 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_NET_V7_H_
#define NCCL_NET_V7_H_
#include "net_device.h"
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;
typedef ncclNetProperties_v7_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after the remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has been completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;
#endif // end include guard
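
The connect and accept comments above define a polling contract: return ncclSuccess with a NULL comm until the connection is ready, and expect to be called again with the same handle until it is. Below is a minimal sketch of a plugin honoring that contract, assuming in-progress state can be stashed inside the handle and the usual NCCL result codes are in scope; the handle layout and helper functions are made up, only the signature comes from this header.

// Hypothetical plugin internals; not part of the API above.
struct myHandle { char addr[56]; void* inProgress; };   // made-up handle layout
extern void* myConnStart(int dev, const char* addr);    // begins a non-blocking connect
extern int   myConnReady(void* conn);                   // returns 1 once established

static ncclResult_t myConnect(int dev, void* opaqueHandle, void** sendComm,
                              ncclNetDeviceHandle_v7_t** sendDevComm) {
  struct myHandle* h = (struct myHandle*)opaqueHandle;
  if (h->inProgress == NULL) h->inProgress = myConnStart(dev, h->addr);
  if (h->inProgress == NULL) return ncclInternalError;        // could not even start
  if (!myConnReady(h->inProgress)) { *sendComm = NULL; return ncclSuccess; }  // poll again later
  *sendComm = h->inProgress;  // established: hand the comm back to NCCL
  *sendDevComm = NULL;        // no device offload in this sketch
  return ncclSuccess;
}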

View File

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <nccl/net.h>
#include "net.h"
#define __hidden __attribute__ ((visibility("hidden")))
@ -15,14 +15,14 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) {
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
//pluginPciPath(dev, &props.pciPath);
//pluginPtrSupport(dev, &props.ptrSupport);
return ncclInternalError;
}
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
@ -33,10 +33,12 @@ __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return n
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
#define PLUGIN_NAME "Plugin"
const ncclNet_v6_t ncclNetPlugin_v6 = {
const ncclNet_v7_t ncclNetPlugin_v7 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
@ -54,6 +56,37 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
};
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
//pluginPciPath(dev, &props.pciPath);
//pluginPtrSupport(dev, &props.ptrSupport);
return ncclInternalError;
}
__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { return ncclInternalError; }
const ncclNet_v6_t ncclNetPlugin_v6 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen
};
/* v5 Compat */
@ -61,10 +94,10 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.connect = pluginConnect,
.accept = pluginAccept,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
@ -79,7 +112,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
/* v4 Compat */
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
ncclNetProperties_v6_t props_v6;
ncclResult_t ret = pluginGetProperties(dev, &props_v6);
ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
if (ret != ncclSuccess) return ret;
props->name = props_v6.name;
props->pciPath = props_v6.pciPath;
@ -103,14 +136,16 @@ static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void*
static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
ncclResult_t ret;
do {
ret = pluginConnect(dev, handle, sendComm);
ncclNetDeviceHandle_v7_t* devHandle = NULL;
ret = pluginConnect(dev, handle, sendComm, &devHandle);
} while (ret == ncclSuccess && *sendComm == NULL);
return ret;
}
static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
ncclResult_t ret;
do {
ret = pluginAccept(listenComm, recvComm);
ncclNetDeviceHandle_v7_t* handle = NULL;
ret = pluginAccept(listenComm, recvComm, &handle);
} while (ret == ncclSuccess && *recvComm == NULL);
return ret;
}
@ -151,12 +186,12 @@ static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3);
memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
return ret;
}
static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3);
memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4);
return pluginConnect_v4(dev, &pluginHandle, sendComm);
}
const ncclNet_v3_t ncclNetPlugin_v3 = {

View File

@ -0,0 +1,17 @@
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
NCCL_HOME:=../../build/
CUDA_HOME:=/usr/local/cuda
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO:=libnccl-tuner.so
default: $(PLUGIN_SO)
$(PLUGIN_SO): plugin.c
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
clean:
rm -f $(PLUGIN_SO)

View File

@ -0,0 +1,77 @@
/*************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_
#include "nccl.h"
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_UNDEF -1
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
// API to be implemented by external tuner
typedef struct {
// Name of the tuner
const char* name;
// Initializes tuner states.
// nRanks: number of ranks in the current communicator. Each communicator initializes its own tuner.
// nNodes: number of nodes in the current communicator.
// logFunction: a log function that can be used to integrate logging with the NCCL core.
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
// - collType: collective type, e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether CollNet supports this collective type
// - nvlsSupport: whether NVLink SHARP supports this collective type
// - numPipeOps: number of operations in the group
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to leave all outputs unset, or to set both the
// algorithm and the protocol; it must not set only one of the two.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
ncclResult_t (*destroy)();
} ncclTuner_v1_t;
typedef ncclTuner_v1_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
#endif

View File

@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "tuner.h"
#define __hidden __attribute__ ((visibility("hidden")))
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
#define PLUGIN_NAME "Example"
const ncclTuner_v1_t ncclTunerPlugin_v1 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
.destroy = pluginDestroy
};
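
The example above always answers RING/SIMPLE. As a hedged sketch of a slightly less trivial policy, the variant below switches protocol on message size and caps channels for small allreduces, using only the constants defined in tuner.h; the 64 KiB and 1 MiB thresholds are made-up illustrations, not tuned values. The built libnccl-tuner.so is then typically pointed to through the NCCL_TUNER_PLUGIN environment variable (mentioned here as an assumption).

// Hypothetical variant of pluginGetCollInfo (reusing the __hidden macro above).
// The size thresholds are illustrative assumptions only.
__hidden ncclResult_t pluginGetCollInfoSized(ncclFunc_t collType, size_t nBytes,
    int collNetSupport, int nvlsSupport, int numPipeOps,
    int *algorithm, int *protocol, int* nChannels) {
  *algorithm = NCCL_ALGO_RING;
  *protocol = (nBytes <= 64*1024) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE;        // small messages: LL
  if (collType == ncclFuncAllReduce && nBytes <= 1024*1024) *nChannels = 2;   // keep tiny allreduces on few SMs
  return ncclSuccess;  // any other return makes NCCL fall back to its default tuning
}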

View File

@ -9,6 +9,7 @@ PREFIX ?= /usr/local
VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
ASAN ?= 0
TRACE ?= 0
PROFAPI ?= 1
NVTX ?= 1
@ -85,6 +86,13 @@ NVCUFLAGS += -O0 -G -g
CXXFLAGS += -O0 -g -ggdb3
endif
# Make sure to run with ASAN_OPTIONS=protect_shadow_gap=0 otherwise CUDA will fail with OOM
ifneq ($(ASAN), 0)
CXXFLAGS += -fsanitize=address
LDFLAGS += -fsanitize=address -static-libasan
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra

View File

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 18
NCCL_PATCH := 6
NCCL_MINOR := 19
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@ -3,19 +3,17 @@
#
# See LICENSE.txt for license information
#
include ../makefiles/common.mk
include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
misc/ipcsocket.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
$(wildcard graph/*.cc) \
$(wildcard misc/*.cc) \
$(wildcard transport/*.cc)
##### lib files
LIBNAME := libnccl.so
@ -45,7 +43,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
DEVMANIFEST := $(BUILDDIR)/obj/device/manifest
##### rules
build : lib staticlib
@ -54,8 +52,8 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
$(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS)
$(MAKE) -C collectives/device
$(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS)
$(MAKE) -C ./device
# Empty target to force rebuild
ALWAYS_REBUILD:
@ -75,21 +73,17 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
$< > $@
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
null :=
space := $(null) #
comma := ,
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M
ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST))
$(PKGDIR)/nccl.pc : nccl.pc.in
mkdir -p $(PKGDIR)
@ -126,7 +120,7 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS)
@rm -f $(@:%.o=%.d.tmp)
clean :
$(MAKE) -C collectives/device clean
$(MAKE) -C device clean
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
install : build

View File

@ -12,6 +12,7 @@
#include <unistd.h>
#include <sys/types.h>
#include "proxy.h"
#include "param.h"
struct bootstrapRootArgs {
struct ncclSocket* listenSock;
@ -28,21 +29,24 @@ ncclResult_t bootstrapNetInit() {
if (bootstrapNetInitDone == 0) {
pthread_mutex_lock(&bootstrapNetLock);
if (bootstrapNetInitDone == 0) {
char* env = getenv("NCCL_COMM_ID");
const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env) {
union ncclSocketAddress remoteAddr;
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidArgument;
}
if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclSystemError;
}
} else {
int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInternalError;
}
}
@ -189,7 +193,7 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
memset(handle, 0, sizeof(ncclBootstrapHandle));
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
char* env = getenv("NCCL_COMM_ID");
const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) {

src/collectives.cc (new file, 167 lines)
View File

@ -0,0 +1,167 @@
/*************************************************************************
* Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "argcheck.h" // Need some checks here since we access comm
#include "collectives.h"
#include "enqueue.h"
#include "nccl.h"
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
};
size_t msgsize = sendcount * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsAllReduce {
size_t bytes;
ncclRedOp_t op;
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsAllReduce, op)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsBroadcast {
size_t bytes;
int root;
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
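
Each of the send/recv entry points above opens its own group internally, but callers exchanging data in both directions still need to group the pair themselves so both operations are posted together. A small usage sketch against the public API shown above, assuming the communicator, stream, and device buffers are already set up:

#include <nccl.h>

// Exchange `count` floats with `peer`; grouping the pair prevents a hang when
// both ranks call this at the same time.
static ncclResult_t exchangeWithPeer(const float* sendbuf, float* recvbuf, size_t count,
                                     int peer, ncclComm_t comm, cudaStream_t stream) {
  ncclResult_t res = ncclGroupStart();
  if (res != ncclSuccess) return res;
  ncclResult_t s = ncclSend(sendbuf, count, ncclFloat, peer, comm, stream);
  ncclResult_t r = ncclRecv(recvbuf, count, ncclFloat, peer, comm, stream);
  res = ncclGroupEnd();  // always close the group
  if (s != ncclSuccess) return s;
  if (r != ncclSuccess) return r;
  return res;
}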

View File

@ -1,25 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
};
size_t msgsize = sendcount * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}

View File

@ -1,31 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "nccl.h"
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsAllReduce {
size_t bytes;
ncclRedOp_t op;
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsAllReduce, op)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
}

View File

@ -1,37 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsBroadcast {
size_t bytes;
int root;
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}

View File

@ -1,76 +0,0 @@
#
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../../makefiles/common.mk
include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
DEPENDFILES:= $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
RULESFILE := $(OBJDIR)/Makefile.rules
NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
# Auto-generating the rules per op/reduction/datatype/algorithm
$(RULESFILE) : gen_rules.sh
@printf "Generating %-35s > %s\n" rules $@
@mkdir -p $(OBJDIR)
@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
-include $(RULESFILE)
LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@printf "Archiving %-35s > %s\n" objects $@
ar cr $@ $^
# We do not want make to build *.d when running make clean.
# So we only provide targets for .dep which will produce .dep and .d,
# with only .d being included, and .dep keeping track of what needs to
# be regenerated.
$(OBJDIR)/%.dep : %.cu
@mkdir -p $(OBJDIR)
@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $@
@rm -f $@.tmp
@cp $@ $(@:.dep=.d)
# Compiled kernels and collectives with relocatable device code ...
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
clean:
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)

View File

@ -1,11 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "all_gather.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(AllGather);

View File

@ -1,11 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "all_reduce.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(AllReduce);

View File

@ -1,11 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "broadcast.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(Broadcast);

View File

@ -1,122 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "common.h"
__shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ < 700
__shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#define NCCL_FUNC4(func, devredop, type, nullify) \
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, uint8_t, 0), \
NCCL_FUNC4(func, devredop, int32_t, 0), \
NCCL_FUNC4(func, devredop, uint32_t, 0), \
NCCL_FUNC4(func, devredop, int64_t, 0), \
NCCL_FUNC4(func, devredop, uint64_t, 0), \
NCCL_FUNC4(func, devredop, half, nullForFloat), \
NCCL_FUNC4(func, devredop, float, nullForFloat), \
NCCL_FUNC4(func, devredop, double, nullForFloat), \
NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
#define NCCL_FUNCS3B(func, devredop) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0)
#else
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, uint8_t, 0), \
NCCL_FUNC4(func, devredop, int32_t, 0), \
NCCL_FUNC4(func, devredop, uint32_t, 0), \
NCCL_FUNC4(func, devredop, int64_t, 0), \
NCCL_FUNC4(func, devredop, uint64_t, 0), \
NCCL_FUNC4(func, devredop, half, nullForFloat), \
NCCL_FUNC4(func, devredop, float, nullForFloat), \
NCCL_FUNC4(func, devredop, double, nullForFloat)
#define NCCL_FUNCS3B(func, devredop) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0)
#endif
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1)
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
#if defined(__CUDA_BF16_TYPES_EXIST__)
NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
#endif
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2A(AllReduce)
#endif
};
// Workaround for https://reviews.llvm.org/D55580
__device__ void ncclWorkaroundClangD55580() {}

View File

@ -1,43 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
dir=$1
datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
if [ "$CUDA_MAJOR" -ge 11 ]
then
datatypes+=" bf16"
fi
targets="GENOBJS := \\\\\n"
for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
opn=0
for op in sum prod min max premulsum sumpostdiv; do
dtn=0
# Order must match that of the ncclDataType_t enum
for dt in ${datatypes}; do
# Generate a unique filename for each compilation unit,
# otherwise the __nv_module_id may conflict at link time
echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@"
echo " cp \$< \$@"
echo ""
# Compile the file
echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
echo " mkdir -p ${dir}"
echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
echo ""
targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
dtn=$(($dtn + 1))
done
opn=$(($opn + 1))
done
done
echo -e "$targets"

View File

@ -1,62 +0,0 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "common_kernel.h"
#include "common.h"
namespace {
template<typename T, typename RedOp>
__device__ __forceinline__ void oneRankReduce() {
ncclWork *w = &ncclShmem.work;
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
ncclWorkElem *we = &w->elems[e];
intptr_t eltN = we->count;
int bid = we->bid;
int bn = we->nChannels;
T const *src = (T const*)we->sendbuff;
T *dst = (T*)we->recvbuff;
// each block/channel gets a roughly equal segment of 16 byte packs
constexpr int EltPerPack = 16/sizeof(T);
intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack;
intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn);
intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn);
i0 *= EltPerPack;
i0 = i0 < eltN ? i0 : eltN;
i1 *= EltPerPack;
i1 = i1 < eltN ? i1 : eltN;
src += i0;
dst += i0;
void *vsrc = (void*)src;
void *vdst = (void*)dst;
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
(tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
}
}
}
#define INSTANTIATE(devredop, type) \
__device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
oneRankReduce<type, Func##devredop<type>>(); \
}
INSTANTIATE(PreMulSum, int8_t)
INSTANTIATE(PreMulSum, uint8_t)
INSTANTIATE(PreMulSum, int32_t)
INSTANTIATE(PreMulSum, uint32_t)
INSTANTIATE(PreMulSum, int64_t)
INSTANTIATE(PreMulSum, uint64_t)
INSTANTIATE(PreMulSum, half)
#if defined(__CUDA_BF16_TYPES_EXIST__)
INSTANTIATE(PreMulSum, __nv_bfloat16)
#endif
INSTANTIATE(PreMulSum, float)
INSTANTIATE(PreMulSum, double)

View File

@ -1,11 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "reduce.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(Reduce);

View File

@ -1,11 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "reduce_scatter.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ReduceScatter);

View File

@ -1,11 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "sendrecv.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_P(SendRecv);

View File

@ -1,33 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
}

View File

@ -1,31 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}

View File

@ -1,52 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "argcheck.h" // Need some checks here since we access comm
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}

View File

@ -9,6 +9,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include <sys/syscall.h>
#include "param.h"
int ncclDebugLevel = -1;
static int pid = -1;
@ -25,7 +26,7 @@ static __thread int tid = -1;
void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
@ -45,7 +46,7 @@ void ncclDebugInit() {
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS");
if (ncclDebugSubsysEnv != NULL) {
int invert = 0;
if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
@ -97,7 +98,7 @@ void ncclDebugInit() {
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE");
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX+1] = "";

src/device/Makefile (new file, 100 lines)
View File

@ -0,0 +1,100 @@
#
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
SHELL := /usr/bin/env bash
MAKEFLAGS += -r
.SUFFIXES:
.SECONDARY:
NCCLDIR := ../..
include $(NCCLDIR)/makefiles/common.mk
include $(NCCLDIR)/makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
OBJDIR := $(BUILDDIR)/obj/device
MANIFEST := $(OBJDIR)/manifest
DEVGLUE_OBJ := $(OBJDIR)/device_glue.o
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include
NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
CXXFLAGS += $(INCFLAGS)
SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY
COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
define COMPILE
@$(SAY) "Compiling" $2;\
mkdir -p $(dir $1);\
$(call COMPILE$(suffix $2),$1,$2)
endef
DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1
define DEPENDS
@$(SAY) "Dependencies" $2;\
mkdir -p $(dir $1);\
mk=$$($(call DEPENDS$(suffix $2),$2));\
[[ $$mk =~ ^[^:]*:(.*)$$ ]];\
files=$${BASH_REMATCH[1]};\
files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\
files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\
echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1
endef
all: $(MANIFEST)
ifeq (1,1)
# Case if the <gensrc> directory is generated on-demand:
$(OBJDIR)/gensrc: generate.py
@mkdir -p $@
(which python3 >/dev/null || \
(bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \
printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \
exit 1)) \
&& ./generate.py $@ "$(ONLY_FUNCS)"
else
# Case if the <gensrc> directory is pre-generated and checked in the repo as ./gen:
$(OBJDIR)/gensrc:
@mkdir -p $(OBJDIR); ln -srfn ./gen $@
endif
# The trailing ";" is necessary to make this an "empty recipe":
# https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html
$(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ;
-include $(OBJDIR)/gensrc/rules.mk
# "gensrc/rules.mk" populates $(LIB_OBJS_GEN)
SRCS = common.cu onerank.cu
LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN)
$(OBJDIR)/%.o: % $(OBJDIR)/%.d
$(call COMPILE,$@,$<)
$(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d
$(call COMPILE,$@,$(OBJDIR)/gensrc/$*)
$(OBJDIR)/%.d: %
$(call DEPENDS,$@,$<)
$(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/%
$(call DEPENDS,$@,$<)
$(DEVGLUE_OBJ): $(LIB_OBJS)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
$(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ)
@echo $^ > $@
-include $(wildcard $(OBJDIR)/*.d)
-include $(wildcard $(OBJDIR)/genobj/*.d)
.PHONY: clean
clean:
rm -rf $(OBJDIR)

View File

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "primitives.h"
@ -108,33 +108,65 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t rank = ncclShmem.comm.rank;
const int nThreadsGather = 128;
const int nThreadsBcast = 384 + WARP_SIZE;
const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
const int tidEndGather = nThreadsGather;
const int tidEndBcast = tidEndGather + nThreadsBcast;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndGather) {
// Gather
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
if (!args->regUsed) {
if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
// Bcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.send(offset, nelem);
}
}
} else if (tid < tidEndBcast) {
// Bcast through NVLS
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
} else {
/* direct allgather */
if (tid < tidEndGather) {
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
/* used as sync */
prims.scatter(0, 0, 0, 0, -1, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
prims.gather(0, 0, 0, 0, -1, 0);
}
} else if (tid < tidEndBcast) {
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args);
/* used as sync */
prims.recv(0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t inpOffset = gridOffset + bid * chunkSize;
ssize_t outOffset = inpOffset + rank * size;
int nelem = min(chunkSize, size - inpOffset);
prims.directSend(inpOffset, outOffset, nelem);
}
}
}
}
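The gather/broadcast thread split above now depends on whether user buffers are registered (regUsed). A quick sketch of the resulting counts, assuming NCCL_MAX_NTHREADS is 640 and WARP_SIZE is 32 (assumed values, not stated in this diff):

NCCL_MAX_NTHREADS = 640   # assumption
WARP_SIZE = 32            # assumption

def allgather_nvls_split(reg_used):
    n_bcast = (NCCL_MAX_NTHREADS - WARP_SIZE) if reg_used else 4 * WARP_SIZE
    n_gather = WARP_SIZE if reg_used else NCCL_MAX_NTHREADS - n_bcast
    return n_gather, n_bcast

print(allgather_nvls_split(False))  # (512, 128): unregistered path spends most threads gathering
print(allgather_nvls_split(True))   # (32, 608): registered path hands almost everything to the NVLS bcast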

View File

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "primitives.h"
@ -377,7 +377,6 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
@ -387,10 +386,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const bool hasOut = nvls->out != -1;
const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasOut ? 2 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
@ -406,67 +406,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasOut) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.directRecvDirectSend(offset, offset, nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.directRecvDirectSend(offset, offset, nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.directRecvDirectSend(offset, offset, nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
@ -478,10 +476,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const bool hasUp = treeUp != -1;
const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasUp ? 4 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
@ -497,60 +496,59 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasUp) {
// Reduce and Broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.directRecvDirectSend(offset, offset, nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.directRecvDirectSend(offset, offset, nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.directRecvDirectSend(offset, offset, nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
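Both NVLS variants above carve the block into scatter/gather/reduce/bcast warps with regUsed-dependent formulas. A sketch of the plain-NVLS partition, under the same assumed constants (640 max threads, 32-thread warps):

total_warps = 640 // 32   # assumed NCCL_MAX_NTHREADS / WARP_SIZE = 20

def nvls_warp_split(has_out, reg_used, nranks):
    bcast = (((total_warps - 2) >> 1) - 1 if reg_used else 2) if has_out else 0
    reduce_ = (total_warps - bcast - 2) if reg_used else (3 if has_out else (7 if nranks <= 6 else 5))
    scatter = 1 if reg_used else (total_warps - reduce_ - bcast + 1) >> 1
    gather = 1 if reg_used else (total_warps - reduce_ - bcast) >> 1
    return scatter, gather, reduce_, bcast

print(nvls_warp_split(False, False, 8))  # (8, 7, 5, 0): all 20 warps accounted for
print(nvls_warp_split(True, True, 8))    # (1, 1, 10, 8): registration shifts warps to reduce/bcast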

View File

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "primitives.h"

24
src/device/common.cu Normal file
View File

@ -0,0 +1,24 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "device.h"
#include "collectives.h"
#include "common.h"
__shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ < 700
__shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
struct RunWorkNop {
__device__ void run(ncclWork *w) {}
};
__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead);
}
__device__ void ncclDevFunc_Nop() {}

View File

@ -8,19 +8,23 @@
#define NCCL_DEVICE_COMMON_H_
#include "collectives.h"
#include "devcomm.h"
#include "device.h"
#include "op128.h"
#include "network/unpack/unpack_defs.h"
#define COLL_UNROLL (ncclCollUnroll())
typedef void(*ncclKern_t)();
extern __device__ ncclKern_t ncclFuncs[];
typedef void(*ncclDevFuncPtr_t)();
extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];
struct ncclShmemGroup {
ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
void* srcs[NCCL_MAX_NVLS_ARITY+1];
void* dsts[NCCL_MAX_NVLS_ARITY+1];
union {
unpackGroupShmem unpack;
} devicePlugin;
};
struct ncclShmemData {
@ -31,6 +35,9 @@ struct ncclShmemData {
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
alignas(16) struct ncclWork work;
alignas(16) union {
unpackShmem unpack;
} devicePlugin;
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
@ -111,10 +118,8 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
}
}
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex>
__device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
) {
template<int SpecializedFnId, typename SpecializedRunWork>
__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
int tid = threadIdx.x;
// To map blockId to channelId, we need the n'th set bit of channelMask which
@ -166,7 +171,7 @@ __device__ void ncclKernel(
bytes = 0;
break;
}
copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
__syncthreads(); // publish ncclShmem
@ -184,10 +189,10 @@ __device__ void ncclKernel(
}
__syncthreads();
if (ncclShmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) {
SpecializedRunWork().run(&ncclShmem.work);
} else {
ncclFuncs[ncclShmem.work.header.funcIndex]();
ncclDevFuncTable[ncclShmem.work.header.funcIndex]();
}
int workIxNext = ncclShmem.work.header.workNext;
@ -204,94 +209,17 @@ __device__ void ncclKernel(
}
}
// Only generate kernels for SUM
#if NCCL_OP == 0
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \
) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \
(comm, channelMask, workHead); \
}
#else
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded)
#endif
__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
__device__ void ncclDevFunc_Nop();
// Examples : AllReduce, RING, LL, Sum, uint8
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
}
#define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \
__global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernelMain<specializedFnId, RunWork<coll, ty, redop<ty>, algo, proto>>(comm, channelMask, workHead); \
}
// Only generate inline kernels for LL
#define IMPL_COLL4(func, algo, devredop, type, ncclType) \
IMPL_COLL_FUNC(func, algo, LL, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \
IMPL_COLL_KERN(func, algo, LL, devredop, type, FUNC_INDEX(ncclFunc##func, ncclDev##devredop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \
#define IMPL_COLL3(func, devredop, type, ncclType) \
IMPL_COLL4(func, TREE, devredop, type, ncclType) \
IMPL_COLL4(func, RING, devredop, type, ncclType) \
IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
IMPL_COLL4(func, NVLS, devredop, type, ncclType) \
IMPL_COLL4(func, NVLS_TREE, devredop, type, ncclType)
#if NCCL_TYPE == 0
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8)
#elif NCCL_TYPE == 1
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint8_t, ncclUint8)
#elif NCCL_TYPE == 2
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int32_t, ncclInt32)
#elif NCCL_TYPE == 3
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint32_t, ncclUint32)
#elif NCCL_TYPE == 4
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int64_t, ncclInt64)
#elif NCCL_TYPE == 5
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint64_t, ncclUint64)
#elif NCCL_TYPE == 6
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, half, ncclFloat16)
#elif NCCL_TYPE == 7
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, float, ncclFloat32)
#elif NCCL_TYPE == 8
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, double, ncclFloat64)
#elif NCCL_TYPE == 9 && defined(__CUDA_BF16_TYPES_EXIST__)
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, __nv_bfloat16, ncclBfloat16)
#endif
// Reduction define all functions
#if NCCL_OP == 0
#define IMPL_COLL_R(func) IMPL_COLL2(func, Sum);
#elif NCCL_OP == 1
#define IMPL_COLL_R(func) IMPL_COLL2(func, Prod);
#elif NCCL_OP == 2
#define IMPL_COLL_R(func) IMPL_COLL2(func, Min);
#elif NCCL_OP == 3
#define IMPL_COLL_R(func) IMPL_COLL2(func, Max);
#elif NCCL_OP == 4
#define IMPL_COLL_R(func) IMPL_COLL2(func, PreMulSum);
#elif NCCL_OP == 5
#if NCCL_TYPE < 6
#define IMPL_COLL_R(func) IMPL_COLL2(func, SumPostDiv);
#else
#define IMPL_COLL_R(func) // skip SumPostDiv for floating point
#endif
#endif
#if NCCL_OP == 0 && NCCL_TYPE == 0
// Copy primitives only define one function for copy
#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
// Point-to-point primitives only have one function/kernel.
#define IMPL_COLL_P(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
#else
#define IMPL_COLL_C(func)
#define IMPL_COLL_P(func)
#endif
#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \
__device__ void ncclDevFunc_##suffix() { \
RunWork<coll, ty, redop<ty>, algo, proto>().run(&ncclShmem.work); \
}
#endif

View File

@ -7,7 +7,7 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "devcomm.h"
#include "device.h"
#include "op128.h"
#include "reduce_kernel.h"
#include <cstdio>
@ -81,13 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks(
for (int u=0; u < Unroll; u++) {
if (0 < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[0]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
}
minSrcs[0] += WARP_SIZE*BytePerPack;
if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
}
}
@ -99,7 +99,7 @@ __device__ __forceinline__ void reduceCopyPacks(
for (int u=0; u < Unroll; u++) {
if (s < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);

405
src/device/generate.py Executable file
View File

@ -0,0 +1,405 @@
#!/usr/bin/env python3
import os
import sys
# Order of redops, tys, protos, algos must match src/include/device.h
all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
all_protos = ["LL","LL128","SIMPLE"]
all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
################################################################################
# The first command line argument is the path to the directory to generate and
# populate.
gensrc = sys.argv[1]
if os.path.exists(gensrc):
for name in os.listdir(gensrc):
os.remove(os.path.join(gensrc, name))
#os.truncate(os.path.join(gensrc, name), 0)
else:
os.mkdir(gensrc)
################################################################################
# The second command line argument is used as a regex to filter the functions
# which make it into libnccl. This is helpful for reducing the binary when
# developing device code. The regex supports non-space containing globs '*',
# parentheses '(x)', and union 'a|b'. The string representing the function has
# one of the forms:
#
# SendRecv
# (AllGather|Broadcast) <algo> <proto>
# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto>
#
# The possible values for redop, type, algo, proto can be found in the all_<foo>
# lists at the top of this file.
#
# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command
# line examples are given:
"""
# Only send/recv:
make ONLY_FUNCS="SendRecv"
# Only non-reductions:
make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv"
# Only AllReduce sum f32 (but all algos, protos)
make ONLY_FUNCS="AllReduce Sum f32 * *"
# Only AllReduce minmax i32 NVLS (but all protos)
make ONLY_FUNCS="AllReduce MinMax i32 NVLS *"
# AllReduce sum <all floats> RING LL128
make ONLY_FUNCS="AllReduce Sum f32 RING LL128"
"""
# Paste all non-None arguments together with `sep`.
def paste(sep, *args):
return sep.join(x for x in args if x is not None)
func_pattern = sys.argv[2:3]
if func_pattern and func_pattern[0]:
import re
func_pattern = func_pattern[0]
func_pattern = func_pattern.replace("*", "[^ ]*")
func_pattern += "$"
def func_filter(*fn):
return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE)
else:
def func_filter(coll, redop, ty, algo, proto):
return True
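# For illustration, the translation above turns a filter such as
#   "AllReduce Sum f32 * *"
# into the regex "AllReduce Sum f32 [^ ]* [^ ]*$" (matched case-insensitively),
# so it keeps every AllReduce/Sum/f32 algo+proto combination and nothing else.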
################################################################################
algos_of_coll = {
"AllGather": ["RING","NVLS"],
"AllReduce": all_algos,
"Broadcast": ["RING"],
"Reduce": ["RING"],
"ReduceScatter": ["RING","NVLS"],
"SendRecv": [None]
}
coll_camel_to_lower = {
"AllGather": "all_gather",
"AllReduce": "all_reduce",
"Broadcast": "broadcast",
"Reduce": "reduce",
"ReduceScatter": "reduce_scatter",
"SendRecv": "sendrecv"
}
coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower}
################################################################################
# Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__)
# or None if function is never supported. Note that (0, 0) encodes universal
# support.
def required_cuda(coll, redop, ty, algo, proto):
cudart, arch = 0, 0
# kernels mapped to by coll="Nop" functions have coll="Generic"
if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch)
if proto!="SIMPLE" and algo not in ("RING","TREE"): return None
if coll in ("AllReduce","Reduce","ReduceScatter"):
if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None
if ty=="bf16": cudart = max(cudart, 11000)
if "NVLS" in algo:
if coll in ("AllReduce","Reduce","ReduceScatter"):
# Must match ncclNvlsSupported() in src/include/device.h
nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or
(ty in ("f32","f64") and redop=="Sum") or
(ty in ("f16","bf16") and redop in ("Sum","MinMax")))
if not nvls_ok: return None
cudart = max(cudart, 12010)
arch = max(arch, 900)
return (cudart, arch)
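# Worked examples, following the rules above (illustrative only):
#   required_cuda("AllReduce", "Sum", "bf16", "NVLS", "SIMPLE")       -> (12010, 900)
#   required_cuda("AllReduce", "Sum", "f32", "RING", "LL")            -> (0, 0)
#   required_cuda("AllReduce", "SumPostDiv", "f32", "RING", "SIMPLE") -> None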
# Maps functions to the chosen representative for the equivalence class it
# belongs to. For instance (sum, signed int) maps to (sum, unsigned int).
def equivalent_primary(coll, redop, ty, algo, proto):
if coll in ("AllReduce", "Reduce", "ReduceScatter"):
# map signed integer sum/prod to unsigned
if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i":
return (coll, redop, "u"+ty[1:], algo, proto)
# map signed integer min/max to unsigned for non-NVLS
if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo):
return (coll, redop, "u"+ty[1:], algo, proto)
return (coll, redop, ty, algo, proto)
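# Example: ("AllReduce","Sum","i32",algo,proto) shares its primary with the
# unsigned ("AllReduce","Sum","u32",algo,proto) variant, while MinMax on i32
# keeps its signed form when the algo is NVLS, since the in-switch reduction
# presumably distinguishes signed and unsigned min/max.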
# Map to another func representing the best kernel to use. Every distinct value
# returned will instantiate a ncclDevKernel specialized to run this func
# without function call overhead.
def best_kernel(coll, redop, ty, algo, proto):
def best(coll, redop, ty, algo, proto):
# Modify this logic to control how many kernels are specialized.
if coll=="Nop": return ("Generic", None, None, None, None)
if coll=="SendRecv": return ("SendRecv", None, None, None, None)
if coll in ("AllGather","Broadcast"): return (coll, None, None, "RING", "LL")
return (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL")
# Need to ensure kernel is specialize for a primary function
kfn = equivalent_primary(*best(coll, redop, ty, algo, proto))
# And isn't filtered out.
if not func_filter(*kfn): return ("Generic", None, None, None, None)
return kfn
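# Example: best_kernel("AllReduce","Prod","f64","NVLS","SIMPLE") resolves to the
# ("AllReduce","Sum","f64","RING","LL") kernel, or to the generic kernel if that
# function was excluded by ONLY_FUNCS.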
# Order rows are enumerated must match formula of `ncclDevFuncId()`:
def enumerate_func_rows():
yield ("SendRecv", None, None, None, None)
for coll in ("AllGather", "Broadcast"):
algos = algos_of_coll[coll]
for algo in algos:
for proto in all_protos:
yield (coll, None, None, algo, proto)
for coll in ("AllReduce", "Reduce", "ReduceScatter"):
algos = algos_of_coll[coll]
for redop in all_redops:
for ty in all_tys:
for algo in algos:
for proto in all_protos:
yield (coll, redop, ty, algo, proto)
################################################################################
def is_built(coll, redop, ty, algo, proto):
built = required_cuda(coll, redop, ty, algo, proto)
built = built and func_filter(coll, redop, ty, algo, proto)
return built
# Returns None if required_cuda(...) is None.
# Returns the coll="Nop" function if developer has filtered it out.
# Otherwise just returns func it was given.
def validate(coll, redop, ty, algo, proto):
valid = required_cuda(coll, redop, ty, algo, proto)
built = valid and func_filter(coll, redop, ty, algo, proto)
if built: return (coll, redop, ty, algo, proto)
if valid: return ("Nop", None, None, None, None)
return None
# Corresponds to ncclDevFuncRowToId[]
func_rows = [validate(*fn) for fn in enumerate_func_rows()]
# Corresponds to ncclDevFuncTable[]
primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None))
# primary_to_index[primary_funcs[i]] == i
primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)}
kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs))
################################################################################
# Generate <gensrc>/device_table.cu
with open(os.path.join(gensrc, "device_table.cu"), "w") as f:
out = f.write
out('#include "common.h"\n')
out("\n")
for fn in primary_funcs:
sym = paste("_", "ncclDevFunc", *fn)
cudart, arch = required_cuda(*fn)
if (cudart, arch) != (0, 0):
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
out("__device__ void %s();\n" % sym)
if (cudart, arch) != (0, 0):
out("#endif\n")
out("\n")
out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n");
index = 0
for fn in primary_funcs:
sym = paste("_", "ncclDevFunc", *fn)
cudart, arch = required_cuda(*fn)
if (cudart, arch) != (0, 0):
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch))
out("/*%4d*/ %s,\n" % (index, sym))
if (cudart, arch) != (0, 0):
out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
index += 1
out("nullptr};\n")
out("\n")
out("// Workaround for https://reviews.llvm.org/D55580\n"
"__device__ void ncclWorkaroundClangD55580() {}\n")
# Generate <gensrc>/host_table.cc
with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
out = f.write
out('#include "device.h"\n')
out("\n")
# The mapping from function rows to valid primary function ids.
out("extern int const ncclDevFuncRowToId[] = {\n")
index = 0
for fn in func_rows:
fn_id, comment = -1, ""
if fn is not None:
fn_id = primary_to_index[equivalent_primary(*fn)]
comment = " // " + paste(" ", *fn)
out("/*%4d*/ %d,%s\n" % (index, fn_id, comment))
index += 1
out("-1};\n")
out("\n")
# Forward declarations of kernels.
for kfn in kernel_funcs:
cudart, _ = required_cuda(*kfn)
sym = paste("_", "ncclDevKernel", *kfn)
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
if cudart != 0: out("#endif\n")
out("\n")
# List of all kernel function pointers.
out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs))
out("extern void* const ncclDevKernelList[] = {\n")
index = 0
for kfn in kernel_funcs:
cudart, _ = required_cuda(*kfn)
sym = paste("_", "ncclDevKernel", *kfn)
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
out("/*%4d*/ (void*)%s,\n" % (index, sym));
if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
index += 1
out("nullptr};\n")
out("\n")
# Maps primary id to kernel function pointer.
out("extern void* const ncclDevKernelForFunc[] = {\n")
index = 0
for fn in primary_funcs:
kfn = best_kernel(*fn)
sym = paste("_", "ncclDevKernel", *kfn)
cudart, _ = required_cuda(*kfn)
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
out("/*%4d*/ (void*)%s,\n" % (index, sym))
if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
index += 1
out("nullptr};\n")
out("\n")
# Does the prior map use an explicitly specialized kernel.
out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n")
index = 0
for fn in primary_funcs:
kfn = best_kernel(*fn)
specialized = "1" if fn == kfn else "0"
out("/*%4d*/ %s,\n" % (index, specialized))
index += 1
out("0};\n")
# Maps to .cu filename which implements this func. The only constraint is that
# "coll" is reflected in the name: formally that no two funcs having different
# coll's map to the same filename.
def impl_filename(coll, redop, ty, algo, proto):
return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty)
# Partition the functions and kernels to the .cu filenames. The partition is
# a dictionary mapping filename to (coll, func-tuple list)
def partition_by_name(fns):
ans = {}
for fn in fns:
name = impl_filename(*fn)
coll = fn[0]
if name not in ans:
ans[name] = (coll, [])
ans[name][1].append(fn)
return ans
name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop")
name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic")
# Generate <gensrc>/rules.mk
with open(os.path.join(gensrc, "rules.mk"), "w") as f:
out = f.write
impl_names = sorted(name_to_funcs.keys())
names = impl_names + ["host_table.cc", "device_table.cu"]
out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n"
.format(names=" ".join(names)))
out("\n")
# For each <coll>_<op>_<ty>.cu compile to a .cu.o file. Notice the dependencies
# come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu')
for name in impl_names:
coll = name_to_funcs[name][0]
out(
"$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n"
"\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n"
"\n"
.format(name=name, lower_coll=coll_camel_to_lower[coll])
)
# Add the suffix-erased .cu's which are used only for dependency scraping.
for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"):
name = impl_filename(coll, None, None, None, None)
if name not in name_to_funcs:
name_to_funcs[name] = (coll, [])
redop_to_cxx = {
None: "FuncCopy",
"Sum": "FuncSum",
"Prod": "FuncProd",
"MinMax": "FuncMinMax",
"PreMulSum": "FuncPreMulSum",
"SumPostDiv": "FuncSumPostDiv"
}
ty_to_cxx = {
None: "int8_t",
"i8": "int8_t",
"u8": "uint8_t",
"i32": "int32_t",
"u32": "uint32_t",
"i64": "int64_t",
"u64": "uint64_t",
"f16": "half",
"f32": "float",
"f64": "double",
"bf16": "__nv_bfloat16"
}
# Generate each <gensrc>/<impl>.cu:
for name in name_to_funcs.keys():
(coll, fns) = name_to_funcs[name]
with open(os.path.join(gensrc, name), "w") as f:
out = f.write
out(
'#include "common.h"\n'
'#include "{lower_coll}.h"\n'
.format(lower_coll=coll_camel_to_lower[coll])
)
(_, kfns) = name_to_kernels.get(name) or (None, [])
for kfn in kfns:
(coll, redop, ty, algo, proto) = kfn
sym = paste("_", coll, redop, ty, algo, proto)
fn_id = primary_to_index[kfn]
cudart, arch = required_cuda(*kfn)
if (cudart, arch) != (0, 0):
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
out(
"DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
.format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
)
if (cudart, arch) != (0, 0):
out("#endif\n")
for fn in fns:
(coll, redop, ty, algo, proto) = fn
sym = paste("_", coll, redop, ty, algo, proto)
cudart, arch = required_cuda(*fn)
if (cudart, arch) != (0, 0):
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
out(
"DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n"
.format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
algo=(algo or "RING"), proto=(proto or "SIMPLE"))
)
if (cudart, arch) != (0, 0):
out("#endif\n")

View File

@ -0,0 +1,280 @@
/*************************************************************************
* Copyright (c) 2023, Google LLC. All rights reserved.
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_DEVICE_UNPACK_H
#define NET_DEVICE_UNPACK_H
#include "unpack_defs.h"
#include "op128.h"
#include "align.h"
#include "device.h"
#include "common.h"
// #define ALIGNED_LOAD
inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
#if __CUDA_ARCH__ >= 700
asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
: "=l"(v) : "l"(ptr));
#else
asm volatile("ld.volatile.global.u64 {%0}, [%1];"
: "=l"(v) : "l"(ptr));
#endif
}
#define PAGE_META_SIZE 16
#define META_LOAD_SIZE 16
#define DATA_LOAD_SIZE 16
// Map internal association of handle with group and peer index (called once at init time)
inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) {
struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
ncclShmem.groups[group].devicePlugin.unpack.head = handle->head;
}
inline __device__ void ncclNetDeviceIncrementHead(const int group) {
ncclShmem.groups[group].devicePlugin.unpack.head++;
}
inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) {
struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
handle->head = ncclShmem.groups[group].devicePlugin.unpack.head;
}
template <uint8_t sz>
inline __device__ void bulkLoad(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<sz> *reg, const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
bulkLoad<1>(t, len, cpy_src, cpy_dst, reg, w, g_meta, s_meta, src_off, dst_off);
}
template <>
inline __device__ void bulkLoad<1>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<1> reg[16], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
uint64_t data_s;
for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
#ifdef ALIGNED_LOAD
load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
#else
#pragma unroll
for (int i=0; i<16; i++) {
reg[i] = ld_volatile_global<1>((uintptr_t)((uint8_t*)(cpy_src + data_s) + i));
}
#endif
#pragma unroll
for (int i=0; i<16; i++) {
st_global<1>((uintptr_t)((uint8_t*)(cpy_dst + data_s) + i), reg[i]);
}
}
}
template <>
inline __device__ void bulkLoad<2>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<2> reg[8], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
uint64_t data_s;
for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
#ifdef ALIGNED_LOAD
load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
#else
#pragma unroll
for (int i=0; i<8; i++) {
reg[i] = ld_volatile_global<2>((uintptr_t)((uint16_t*)(cpy_src + data_s) + i));
}
#endif
#pragma unroll
for (int i=0; i<8; i++) {
st_global<2>((uintptr_t)((uint16_t*)(cpy_dst + data_s) + i), reg[i]);
}
}
}
template <>
inline __device__ void bulkLoad<4>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<4> reg[4], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
uint64_t data_s;
for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
#ifdef ALIGNED_LOAD
load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
#else
#pragma unroll
for (int i=0; i<4; i++) {
reg[i] = ld_volatile_global<4>((uintptr_t)((uint32_t *)(cpy_src + data_s) + i));
}
#endif
#pragma unroll
for (int i=0; i<4; i++) {
st_global<4>((uintptr_t)((uint32_t*)(cpy_dst + data_s) + i), reg[i]);
}
}
}
template <>
inline __device__ void bulkLoad<8>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<8> reg[2], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
uint64_t data_s;
for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
#ifdef ALIGNED_LOAD
load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
#else
#pragma unroll
for (int i=0; i<2; i++) {
reg[i] = ld_volatile_global<8>((uintptr_t)((uint64_t*)(cpy_src + data_s) + i));
}
#endif
#pragma unroll
for (int i=0; i<2; i++) {
st_global<8>((uintptr_t)((uint64_t*)(cpy_dst + data_s) + i), reg[i]);
}
}
}
template <>
inline __device__ void bulkLoad<16>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<16> reg[1], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
uint64_t data_s;
for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
reg[0] = ld_volatile_global<16>((uintptr_t)(cpy_src + data_s));
st_global<16>((uintptr_t)(cpy_dst + data_s), reg[0]);
}
}
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
inline __device__ int ppw(const int nbytes, int nw) {
int v = DIVUP(nbytes, SLICE_PAGE_SIZE);
v = DIVUP(v, nw);
while (v > WARP_SHM_PAGE_CNT) {
v = DIVUP(v, 2);
}
return v;
}
// This function is called by all threads
// Pack data from the internal iovec to the supplied flat buffer using all the
// threads
template <int Recv>
inline __device__ void ncclNetDeviceUnpack(
const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize);
template <>
inline __device__ void ncclNetDeviceUnpack</*Recv=*/0>(
const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) {
// send unpack empty
}
inline __device__ void ncclNetDeviceUnpackInner(
const int tid, const int tidInBlock, const int nworkers, const int group, const int index,
void *src, const int nbytes, const uint64_t step);
template <>
inline __device__ void ncclNetDeviceUnpack</*Recv=*/1>(
const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) {
while (mask != 0) {
int ix = __ffs(mask)-1; // Get the first set bit of the mask (this should correlate to a peer index)
mask &= mask-1; // Drop the first set bit of the mask
// Pack data from the internal iovec to the supplied flat srcs buffer using all the threads
// + Src is necessary in the case of accessing the user buffer directly
ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/,
ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head);
}
}
inline __device__ void ncclNetDeviceUnpackInner(
const int tid, const int tidInBlock, const int nworkers, const int group, const int index,
void *src, const int nbytes, const uint64_t step) {
// from src/collectives/device/common_kernel.h
const int w = tid / WARP_SIZE; // Warp number
const int nw = nworkers / WARP_SIZE; // Number of warps
const int t = tid % WARP_SIZE; // Thread (inside the warp)
BytePack<16> reg;
loadMeta meta;
uint64_t head;
struct netUnpackMeta* g_meta_struct;
void* bounce_buf;
loadMeta* g_meta;
loadMeta* s_meta;
uint64_t meta_cnt;
// hack head use per-warp
head = step;
g_meta_struct = ncclShmem.groups[group].devicePlugin.unpack.g_meta[index];
bounce_buf = ncclShmem.devicePlugin.unpack.bounce_buf;
__syncwarp();
head %= NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH;
g_meta = g_meta_struct->mem[head];
// Currently, even/odd groups perform send/recv separately. We don't really need space for send side.
// Total size is N page per warp * 16 B per page * 20 WARPS max = 320 * N bytes, N == WARP_SHM_PAGE_CNT
static_assert(ncclShmemScratchWarpSize() >= WARP_SHM_SIZE, "Each warp must have enough scratch space");
s_meta = (loadMeta*) ncclScratchForWarp(tidInBlock / WARP_SIZE); // (loadMeta*) (ncclShmem.devicePlugin.unpack.meta + shm_off);
load64gpu(g_meta_struct->cnt + head, meta_cnt);
int PPW = ppw(nbytes, nw);
for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
uint64_t iter_meta_cnt = meta_cnt - meta_s;
iter_meta_cnt = iter_meta_cnt < PPW ? iter_meta_cnt : PPW;
// TODO: this load size needs to work if not aligned, but since the two are both 16...
if (t < PPW * PAGE_META_SIZE / META_LOAD_SIZE && t < iter_meta_cnt) { // avoid last iter load garbage data
load128((const uint64_t*) (g_meta + (meta_s + t)), reg.u64[0], reg.u64[1]);
storeShmem128(shmemCvtPtr((uint64_t *)(s_meta + (w * PPW + t))), reg.u64[0], reg.u64[1]);
}
__syncwarp();
for (int x = 0; x < iter_meta_cnt; x++) {
int meta_idx = x + w * PPW;
// load page offs
loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]);
if (meta.len >= DATA_LOAD_SIZE) {
// fast path, but need to adapt to alignment issue
// bulk copy data
uint8_t align_off = (meta.src_off | meta.dst_off) % DATA_LOAD_SIZE;
align_off = align_off & -align_off; // keep the lowest bit
if (align_off == 0) { // 16-byte aligned
bulkLoad<16>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
} else if (align_off & 0x8) {
bulkLoad<8>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<8>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
} else if (align_off & 0x4) {
bulkLoad<4>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<4>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
} else if (align_off & 0x2) {
bulkLoad<2>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<2>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
} else { // if (align_off & 0x1)
bulkLoad<1>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<1>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
}
}
// must be less than 16 bytes
if (t < meta.len % DATA_LOAD_SIZE) {
volatile char* cpy_src = (char*) bounce_buf + meta.src_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t;
volatile char* cpy_dst = (char*) src + meta.dst_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t;
*cpy_dst = *cpy_src;
}
}
__syncwarp();
}
}
#endif // NET_DEVICE_UNPACK_H
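The unpack path above picks its bulk-copy width from the combined alignment of the bounce-buffer and destination offsets. A small Python model of that dispatch, assuming the same 16-byte DATA_LOAD_SIZE:

DATA_LOAD_SIZE = 16  # matches the constant defined in this header

def load_width(src_off, dst_off):
    # Mirror of the dispatch in ncclNetDeviceUnpackInner: the lowest set bit of
    # the combined offset (mod 16) bounds the widest safe load/store width.
    align_off = (src_off | dst_off) % DATA_LOAD_SIZE
    align_off &= -align_off          # keep only the lowest set bit
    if align_off == 0: return 16
    if align_off & 8:  return 8
    if align_off & 4:  return 4
    if align_off & 2:  return 2
    return 1

print(load_width(0, 64))    # 16: both offsets 16B-aligned, BytePack<16> path
print(load_width(8, 4096))  # 8
print(load_width(6, 3))     # 1: odd combined offset falls back to byte copies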

View File

@ -0,0 +1,61 @@
/*************************************************************************
* Copyright (c) 2023, Google LLC. All rights reserved.
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_DEVICE_UNPACK_DEFS_H
#define NET_DEVICE_UNPACK_DEFS_H
#include <stdint.h>
#include "device.h"
#define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16
union alignas(16) loadMeta {
uint64_t r64[2];
struct {
uint32_t src_off;
uint32_t len;
uint64_t dst_off;
};
};
static_assert(sizeof(union loadMeta) == 16, "Must be 16-byte aligned");
/****** global memory ******/
#define NET_UNPACK_MAX_QUEUE_DEPTH 16 // MAX_REQUESTS
#define NET_UNPACK_MAX_SLICE_SIZE 4194304 // 4MB per Irecv call
#define SLICE_PAGE_SIZE 4096
#define NET_UNPACK_MAX_SLICE_PAGES \
(NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2) // * 2 for slack, wasteful..
struct netUnpackMeta {
loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES];
uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH];
};
struct unpackNetDeviceHandle {
struct netUnpackMeta *meta; // mapped
void* bounce_buf;
uint64_t head;
};
/****** shared memory ******/
#define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h
#define NET_UNPACK_MAX_NPEERS 2 // The most you should have is 2 network peers per-group (indexed by index)
#define WARP_SHM_PAGE_CNT 4
#define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta))
struct unpackShmem {
void* bounce_buf;
};
struct unpackGroupShmem {
int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
uint64_t head;
struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
};
#endif // NET_DEVICE_UNPACK_DEFS_H
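For scale, the metadata region these constants describe works out to roughly half a megabyte per receive handle. A back-of-the-envelope sketch (ignores any padding the compiler might add):

QUEUE_DEPTH = 16                     # NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH
SLICE_PAGE_SIZE = 4096
MAX_SLICE_SIZE = 4 * 1024 * 1024     # 4MB per Irecv call
MAX_SLICE_PAGES = MAX_SLICE_SIZE // SLICE_PAGE_SIZE * 2   # 2048, with the 2x slack
LOAD_META_BYTES = 16                 # sizeof(union loadMeta)

meta_bytes = QUEUE_DEPTH * MAX_SLICE_PAGES * LOAD_META_BYTES + QUEUE_DEPTH * 8
print(meta_bytes)  # 524416 bytes, i.e. about 512 KiB of mapped metadata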

79
src/device/onerank.cu Normal file
View File

@ -0,0 +1,79 @@
/*************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "alloc.h"
#include "collectives.h"
#include "common_kernel.h"
#include "common.h"
#include <cuda_runtime.h>
namespace {
template<typename RedOp>
__global__ __launch_bounds__(512, 1)
void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) {
using T = typename RedOp::EltType;
int tid = threadIdx.x;
int tn = blockDim.x;
int bid = blockIdx.x;
int bn = gridDim.x;
// each block/channel gets a roughly equal segment of 16 byte packs
constexpr int EltPerPack = 16/sizeof(T);
intptr_t i0 = (bid+0)*alignUp(nElts/bn, EltPerPack);
intptr_t i1 = (bid+1)*alignUp(nElts/bn, EltPerPack);
i0 = min(i0, nElts);
i1 = min(i1, nElts);
src = (T*)src + i0;
dst = (T*)dst + i0;
if (redOpArgIsPtr) {
if (redOpArg%2 != 0) {
redOpArg = *reinterpret_cast<uint8_t*>(redOpArg);
} else if (redOpArg%4 != 0) {
redOpArg = *reinterpret_cast<uint16_t*>(redOpArg);
} else if (redOpArg%8 != 0) {
redOpArg = *reinterpret_cast<uint32_t*>(redOpArg);
} else {
redOpArg = *reinterpret_cast<uint64_t*>(redOpArg);
}
}
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
(tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0);
}
}
ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) {
size_t eltSize = ncclTypeSize(eltType);
if (redOp.op != ncclDevPreMulSum) {
if (dst != src) {
NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream));
}
return ncclSuccess;
}
void const* kernel;
switch (eltType) {
case ncclInt8: kernel = (void const*)&oneRankReduce<FuncPreMulSum<int8_t>>; break;
case ncclUint8: kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint8_t>>; break;
case ncclInt32: kernel = (void const*)&oneRankReduce<FuncPreMulSum<int32_t>>; break;
case ncclUint32: kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
case ncclInt64: kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
case ncclUint64: kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
case ncclFloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
#endif
case ncclFloat32: kernel = (void const*)&oneRankReduce<FuncPreMulSum<float>>; break;
case ncclFloat64: kernel = (void const*)&oneRankReduce<FuncPreMulSum<double>>; break;
default: return ncclInvalidArgument;
}
dim3 grid = {0, 1, 1};
grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10));
dim3 block = {512, 1, 1};
void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr};
CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream));
return ncclSuccess;
}
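The per-block slicing in oneRankReduce above hands each block a roughly equal span rounded up to whole 16-byte packs. A quick sketch of that index arithmetic, assuming 4-byte elements such as float:

def block_range(bid, n_blocks, n_elts, elt_size=4):
    # Mirrors i0/i1 in oneRankReduce: equal segments, rounded up to 16B packs,
    # clamped to the element count.
    elt_per_pack = 16 // elt_size
    align_up = lambda x, a: (x + a - 1) // a * a
    seg = align_up(n_elts // n_blocks, elt_per_pack)
    return min(bid * seg, n_elts), min((bid + 1) * seg, n_elts)

print([block_range(b, 4, 1000) for b in range(4)])
# [(0, 252), (252, 504), (504, 756), (756, 1000)]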

View File

@ -161,21 +161,25 @@ __device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack) {
// Load/store of BytePack<?> using integral addresses.
template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ BytePack<Size> ld_relaxed_gpu_global(uintptr_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
template<int Size> __device__ void st_relaxed_gpu_global(uintptr_t addr, BytePack<Size> value);
template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_relaxed_gpu_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t addr, BytePack<0> value) {}
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
#define DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
@ -197,19 +201,44 @@ template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<
data_cxx_ty tmp = value.native; \
asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
}
#if __CUDA_ARCH__ >= 700
#define PTX_relaxed_gpu "relaxed.gpu"
#else
#define PTX_relaxed_gpu "volatile"
#endif
#define DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
data_cxx_ty tmp; \
asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_relaxed_gpu_global<bytes>(uintptr_t addr, BytePack<bytes> value) { \
data_cxx_ty tmp = value.native; \
asm volatile("st." PTX_relaxed_gpu ".global." #data_ptx_ty " [%0], %1;" :: "l"(addr), #data_reg_ty(tmp) : "memory"); \
}
#define DEFINE_ld_st__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, global, uintptr_t, l) \
DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, shared, uint32_t, r) \
DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty)
// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l)
DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
#undef DEFINE_ld_st
DEFINE_ld_st__size(1, uint32_t, b8, r)
DEFINE_ld_st__size(2, uint16_t, b16, h)
DEFINE_ld_st__size(4, uint32_t, b32, r)
DEFINE_ld_st__size(8, uint64_t, b64, l)
#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
#undef DEFINE_ld_st__size_space
#undef DEFINE_ld_st__size
#define DEFINE_ld_st_16__space(space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
@ -226,10 +255,23 @@ DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
}
DEFINE_ld_st_16(global, uintptr_t, l)
DEFINE_ld_st_16(shared, uint32_t, r)
DEFINE_ld_st_16__space(global, uintptr_t, l)
DEFINE_ld_st_16__space(shared, uint32_t, r)
#undef DEFINE_ld_st_16
template<>
__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
BytePack<16> ans;
asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr));
return ans;
}
template<>
__device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) {
asm volatile("st." PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory");
}
#undef PTX_relaxed_gpu
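A minimal sketch of how the size-templated accessors above compose, assuming only the BytePack<> type and the ld/st specializations defined in this file, and that the addresses passed in are already global-space addresses (as produced by cvta_to_global() used elsewhere in this header):
// Hypothetical illustration, not part of the patch: move one 8-byte pack between
// global buffers using the relaxed GPU-scope accessors defined above (they lower
// to ld/st.relaxed.gpu on sm_70+ and to volatile accesses on older architectures).
__device__ __forceinline__ void copyPack8(uintptr_t dstGlobalAddr, uintptr_t srcGlobalAddr) {
  BytePack<8> v = ld_relaxed_gpu_global<8>(srcGlobalAddr);
  st_relaxed_gpu_global<8>(dstGlobalAddr, v);
}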
////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.
@ -247,6 +289,15 @@ __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
#endif
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#else
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#endif
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700

@ -323,7 +323,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),

@ -364,7 +364,7 @@ public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),

@ -4,6 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
#include "network/unpack/unpack.h"
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
@ -23,7 +25,11 @@ class Primitives<
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800,
NvlsMinPolling = 0x1000;
NvlsMinPolling = 0x1000,
NetDeviceUnpack = 0x2000,
AnyNetDeviceUnpack = 0x4000,
NvlsDirectRead = 0x8000,
NvlsDirectWrite = 0x10000;
const int tid, tidInBlock;
const int nthreads;
int nworkers;
@ -44,6 +50,8 @@ class Primitives<
};
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
void* mhandle;
void* netDeviceHandle;
// Don't use barrier 0 as it's used by the final sync
__device__ void barrier() {
@ -141,7 +149,7 @@ class Primitives<
if (flags & OffsFifoEnabled)
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
if (flags & (DirectWrite | NvlsDirectWrite)) {
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
ptrs[index] = nullptr;
@ -149,7 +157,7 @@ class Primitives<
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & DirectRead) {
if (flags & (DirectRead | NvlsDirectRead)) {
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
@ -160,6 +168,9 @@ class Primitives<
else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
ncclNetDeviceIncrementHead(group);
}
step += StepPerSlice;
}
}
@ -229,7 +240,16 @@ class Primitives<
    /* If the user aborts the kernel, we don't need to actually perform the copy/reduce; just set
     * the size to 0 to avoid unnecessary work. */
int workSize = ncclShmem.aborted ? 0 : sliceSize;
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
if (flags & AnyNetDeviceUnpack) {
ncclNetDeviceUnpack<Recv>(tid, tidInBlock, nworkers, group, ncclShmem.groups[group].devicePlugin.unpack.unpackNetDeviceIndexMask, Src, workSize);
      // Sync here to make sure all workers are reading from the updated srcs
subBarrier();
}
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
    /* NVLS can have srcs[0] == dsts[0], but this branch must not be taken in that case,
     * so we also check that MultimemSrcs and MultimemDsts are 0. */
&& MultimemSrcs == 0 && MultimemDsts == 0) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
@ -286,7 +306,7 @@ class Primitives<
// shift: peer offset to avoid all ranks sending to or receiving from same peer
template <int DirectRecv1, int DirectSend1, int Recv, int Send>
__device__ __forceinline__ void
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
int offset = 0; // slice offset
@ -295,7 +315,7 @@ class Primitives<
#pragma unroll
for (int slice=0; slice<SlicePerChunk; ++slice) {
int realSize = max(0, min(dataSize, peerElem-offset));
ssize_t realSize = max(0, min(dataSize, peerElem-offset));
bool fenceNeeded = false;
if (tid < nworkers) {
if (Send) {
@ -309,11 +329,11 @@ class Primitives<
// Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int pOffset = i*peerOffset;
ssize_t pOffset = i*peerOffset;
          // Skip the data I am responsible for reducing myself
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
ssize_t realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
@ -322,10 +342,10 @@ class Primitives<
}
} else if (Recv) {
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
int pOffset = index*peerOffset;
ssize_t pOffset = index*peerOffset;
if (skip >= 0 && index >= skip) pOffset += peerElem;
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx+pOffset, outIx+pOffset, offset, realSize);
subBarrier();
#pragma unroll
for (int j=0; j<fan.nrecv(); j++) {
@ -333,7 +353,7 @@ class Primitives<
pOffset = i*peerOffset;
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
ssize_t realPeerSize = min(realSize, totalElem-pOffset);
if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
@ -348,6 +368,13 @@ class Primitives<
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex];
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
// handle must be a device ptr
netDeviceHandle = conn->netDeviceHandle.handle;
// Cache the handle
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
flags |= NetDeviceUnpack;
}
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
@ -377,6 +404,9 @@ class Primitives<
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
/* NVLS direct */
flags |= NvlsDirectRead;
}
}
if (flags & OffsFifoEnabled)
@ -393,6 +423,7 @@ class Primitives<
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostSend) {
connStepPtr = conn->tail;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
if (flags & RoleWaitSend) {
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
@ -424,6 +455,9 @@ class Primitives<
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
/* NVLS direct */
flags |= NvlsDirectWrite;
}
}
}
@ -434,10 +468,10 @@ class Primitives<
__device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
@ -473,6 +507,20 @@ class Primitives<
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
if (barrierAny(flags & NetDeviceUnpack)) {
flags |= AnyNetDeviceUnpack;
      // g == 0 covers the first ThreadPerSync threads of this warp.
      // These are also the RoleWaitRecv threads of this group, so the thread ID maps to the peer index.
if (g == 0) {
uint32_t mask = __ballot_sync((1U << ThreadPerSync) - 1, (flags & NetDeviceUnpack) ? 1 : 0);
// We only want to update the shared memory variable with a single thread
if (tid == 0) {
ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
}
}
}
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
@ -485,8 +533,10 @@ class Primitives<
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
conns[index]->step = step;
}
// Make sure all threads are done writing back conn->step and done using
// ncclShmem.groups[group]
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
ncclNetDeviceSaveHead(netDeviceHandle, group);
}
barrier();
}
@ -497,33 +547,41 @@ class Primitives<
}
if (flags & RoleOutput) userBuff = (T*)outputBuf;
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite);
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
int regUsed = e != nullptr ? e->elem.regUsed : 0;
if (Direct && recvProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
while (*slot != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
if (slot) {
while (*slot != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
}
if (Direct && sendAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void *ptr;
while (true) {
while (slot) {
ptr = *slot;
if (ptr != nullptr || checkAbort(spins)) break;
}
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
if (slot) {
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
*slot = nullptr;
*slot = nullptr;
} else {
        /* slot is NULL; this must be the registered-buffer path (regUsed == 1) */
directBuff = (T*)e->dnOutputs[index];
}
}
if (Direct && sendProvider) {
int spins = 0;
@ -531,17 +589,19 @@ class Primitives<
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
// Wait for consumer to consume previous value before trampling it.
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
if (slot && argSlot0 && argSlot1) {
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
}
if (Direct && recvAcceptor) {
int spins = 0;
@ -549,24 +609,29 @@ class Primitives<
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
void *ptr;
while (true) {
while (slot) {
ptr = *slot;
if (ptr != nullptr || checkAbort(spins)) break;
}
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
if (slot && argSlot0 && argSlot1) {
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
}
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
}
ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
} else {
directBuff = (T*)e->dnInputs[index];
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
}
}
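The ptrExchange handshakes above encode the published pointer by XOR-ing it with the slot's own address; the stand-alone sketch below (hypothetical helper names, not part of the patch) shows why that keeps a legitimately published nullptr distinct from the empty-slot value, and why decoding is simply the same XOR:
#include <cassert>
#include <cstdint>

static void* encodeSlot(void* buf, void* volatile* slot) {
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(buf) ^ reinterpret_cast<uintptr_t>(slot));
}
static void* decodeSlot(void* enc, void* volatile* slot) {
  // XOR is its own inverse, so the consumer recovers the original pointer.
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(enc) ^ reinterpret_cast<uintptr_t>(slot));
}

int main() {
  void* volatile slot = nullptr;                   // empty-slot value
  int x = 0;
  slot = encodeSlot(&x, &slot);
  assert(slot != nullptr && decodeSlot((void*)slot, &slot) == &x);
  slot = encodeSlot(nullptr, &slot);               // publishing nullptr...
  assert(slot != nullptr);                         // ...still looks non-empty to the consumer
  assert(decodeSlot((void*)slot, &slot) == nullptr);
  return 0;
}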
@ -594,6 +659,9 @@ class Primitives<
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
}
__device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
}
__device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
@ -611,6 +679,9 @@ class Primitives<
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
}
__device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
}
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
@ -635,20 +706,20 @@ class Primitives<
}
__device__ __forceinline__ void
scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
directScatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
gather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
}
__device__ __forceinline__ void
directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
};

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "primitives.h"

@ -12,6 +12,19 @@
#include <limits>
#include <type_traits>
template<typename T>
struct IsFloatingPoint: std::false_type {};
template<>
struct IsFloatingPoint<half>: std::true_type {};
#if defined(__CUDA_BF16_TYPES_EXIST__)
template<>
struct IsFloatingPoint<__nv_bfloat16>: std::true_type {};
#endif
template<>
struct IsFloatingPoint<float>: std::true_type {};
template<>
struct IsFloatingPoint<double>: std::true_type {};
////////////////////////////////////////////////////////////////////////////////
// The reduction function classes. All classes must:
// 1. Expose the `EltType` typedef.
@ -19,16 +32,21 @@
// 3. Have constructor taking `uint64_t opArg`.
template<typename T>
struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; };
struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; };
template<typename T>
struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
template<typename T>
struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMin { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMax { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; };
struct FuncMinMax {
using EltType = T;
BytePack<sizeof(T)> xormask; // only used by integers
bool isMinNotMax; // only used by floats
__device__ FuncMinMax(uint64_t opArg=0) {
xormask.native = opArg;
isMinNotMax = (opArg&1)==0;
}
};
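FuncMinMax folds Min and Max into a single functor: opArg carries an XOR mask used by the integer path (Apply_Reduce below compares the XOR-transformed values), and its low bit selects min versus max for the floating-point path. A stand-alone sketch of the integer trick follows; the mask value is assumed for illustration, not taken from NCCL's host-side setup:
// Hypothetical illustration: XOR-ing signed operands with the sign bit maps signed
// order onto unsigned order, so the single unsigned compare used by
// Apply_Reduce<FuncMinMax<T>, 1> yields a signed min.
#include <cassert>
#include <cstdint>
int main() {
  int32_t a = -3, b = 2;
  uint32_t mask = 0x80000000u;             // assumed mask for a signed-min op
  uint32_t ax = uint32_t(a) ^ mask, bx = uint32_t(b) ^ mask;
  int32_t result = (ax < bx) ? a : b;      // same compare as the device code
  assert(result == -3);                    // signed min recovered via an unsigned compare
  return 0;
}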
template<typename T> struct FuncPreMulSum;
template<typename T> struct FuncSumPostDiv;
@ -127,8 +145,8 @@ struct Apply_Reduce {
// Base case definitions (EltPerPack == 1)
template<typename T>
struct Apply_Reduce<FuncNull<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
struct Apply_Reduce<FuncCopy<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncCopy<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return a;
}
};
@ -145,15 +163,9 @@ struct Apply_Reduce<FuncProd<T>, /*EltPerPack=*/1> {
}
};
template<typename T>
struct Apply_Reduce<FuncMin<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMin<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(min(fromPack<T>(a), fromPack<T>(b)));
}
};
template<typename T>
struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(max(fromPack<T>(a), fromPack<T>(b)));
struct Apply_Reduce<FuncMinMax<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMinMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? a : b;
}
};
@ -161,57 +173,55 @@ struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
template<>
struct Apply_Reduce<FuncSum<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
constexpr uint32_t lo = 0x00ff00ff;
constexpr uint32_t hi = ~lo;
uint32_t x = a.u32;
uint32_t y = b.u32;
a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi);
constexpr uint32_t even = 0x00ff00ffu;
uint32_t x = (a.native & even) + (b.native & even);
uint32_t y = (a.native & ~even) + (b.native & ~even);
//a.native = (x & even) | (y & ~even);
a.native = __byte_perm(x, y, 0x7250);
return a;
}
};
template<>
struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncSum<int8_t> fn, BytePack<4> a, BytePack<4> b) {
return Apply_Reduce<FuncSum<uint8_t>, 4>::reduce(FuncSum<uint8_t>(), a, b);
struct Apply_Reduce<FuncMinMax<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMinMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
constexpr uint32_t ones = 0x01010101u;
constexpr uint32_t even = 0x00ff00ffu; // even byte mask
// Replicate xormask to all bytes
uint32_t x = fn.xormask.native * ones;
// Transform inputs by xormask
uint32_t ax = a.native ^ x;
uint32_t bx = b.native ^ x;
// Use 9-bit arithmetic to compute d=a-b
uint32_t d0 = (ax & even) + (~bx & even) + ones;
uint32_t d1 = (ax>>8 & even) + (~(bx>>8) & even) + ones;
    // Move the sign bit of each 9-bit delta into the least significant bit of its originating byte
//uint32_t s = (d0>>8 & ones & even) | (d1 & ones & ~even);
uint32_t s = __byte_perm(d0, d1, 0x7351) & ones;
// Broadcast least bit across whole byte
s *= 0xffu;
// Compose result by selecting bytes via: signbit(a-b)==1 ? a : b
a.native = (a.native & s) | (b.native & ~s);
return a;
}
};
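For cross-checking the packed path above, a scalar reference of what it must produce: per byte, the same rule as the one-element case, (a ^ mask) < (b ^ mask) ? a : b, with the mask replicated to every byte. Hypothetical helper, not part of the patch:
#include <cstdint>

uint32_t minmax4Reference(uint32_t a, uint32_t b, uint8_t xormask) {
  uint32_t out = 0;
  for (int i = 0; i < 4; i++) {
    uint8_t ab = uint8_t(a >> (8*i));
    uint8_t bb = uint8_t(b >> (8*i));
    uint8_t sel = (uint8_t(ab ^ xormask) < uint8_t(bb ^ xormask)) ? ab : bb;
    out |= uint32_t(sel) << (8*i);
  }
  return out;
}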
#if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500
template<>
struct Apply_Reduce<FuncMin<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMin<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
uint32_t z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMin<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMin<int8_t> fn, BytePack<4> a, BytePack<4> b) {
int32_t z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMax<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
uint32_t z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMax<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMax<int8_t> fn, BytePack<4> a, BytePack<4> b) {
int32_t z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
#endif
template<>
struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncProd<uint8_t> fn, BytePack<4> apack, BytePack<4> bpack) {
uint32_t a = apack.native;
uint32_t b = bpack.native;
uint32_t ab0 = (a*b) & 0xffu;
asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
uint32_t ab1;
asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
apack.native = __byte_perm(ab0, ab1, 0x6420);
return apack;
}
};
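Likewise, the mad/mul.hi pairs above compute each byte's product in isolation (mod 256) and __byte_perm re-packs the low bytes; a scalar reference (hypothetical helper, not part of the patch):
#include <cstdint>

uint32_t prod4Reference(uint32_t a, uint32_t b) {
  uint32_t out = 0;
  for (int i = 0; i < 4; i++) {
    uint8_t p = uint8_t(((a >> (8*i)) & 0xff) * ((b >> (8*i)) & 0xff));
    out |= uint32_t(p) << (8*i);
  }
  return out;
}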
#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \
#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_fn_x_y) \
template<> \
struct Apply_Reduce<Fn<T>, EltPerPack> { \
__device__ __forceinline__ static BytePack<sizeof(Vec)> reduce( \
@ -219,10 +229,13 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
) { \
Vec x = fromPack<Vec>(a); \
Vec y = fromPack<Vec>(b); \
return toPack<Vec>(expr_of_x_y); \
return toPack<Vec>(expr_of_fn_x_y); \
} \
};
SPECIALIZE_REDUCE(FuncMinMax, float, 1, float, fn.isMinNotMax ? fminf(x, y) : fmaxf(x, y))
SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : fmax(x, y))
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
@ -234,13 +247,10 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
#endif
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncMin, half, 1, half, __hmin(x, y))
SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y))
SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y))
SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y))
SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y))))
SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y))))
SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y))))
#endif
#if defined(__CUDA_BF16_TYPES_EXIST__)
@ -249,15 +259,12 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y))
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y)))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) * __bfloat162float(y)))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fminf(__bfloat162float(x), __bfloat162float(y))))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fmaxf(__bfloat162float(x), __bfloat162float(y))))
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fn.isMinNotMax ? fminf(__bfloat162float(x), __bfloat162float(y)) : fmaxf(__bfloat162float(x), __bfloat162float(y))))
#endif
#endif
@ -479,19 +486,6 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
////////////////////////////////////////////////////////////////////////////////
// FuncSumPostDiv
template<typename T>
struct IsFloatingPoint: std::false_type {};
template<>
struct IsFloatingPoint<half>: std::true_type {};
#if defined(__CUDA_BF16_TYPES_EXIST__)
template<>
struct IsFloatingPoint<__nv_bfloat16>: std::true_type {};
#endif
template<>
struct IsFloatingPoint<float>: std::true_type {};
template<>
struct IsFloatingPoint<double>: std::true_type {};
template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
struct FuncSumPostDiv_IntOnly;
@ -543,25 +537,44 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
#define SIZEOF_BytePack_field_u64 8
#define PTX_REG_BytePack_field_u64 "l"
#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
struct Apply_LoadMultimem<FuncSum<T>, SIZEOF_BytePack_field_##pack_field> { \
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
struct Apply_LoadMultimem<FuncMinMax<T>, SIZEOF_BytePack_field_##pack_field> { \
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
} \
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<FuncSum<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
@ -570,18 +583,61 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
struct Apply_LoadMultimem<FuncMinMax<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
} \
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \
DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(sizeof(T)))); \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \
DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<FuncMinMax<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(sizeof(T)))); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(sizeof(T)))); \
} \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem {
@ -598,46 +654,39 @@ struct Apply_LoadMultimem {
static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
std::is_same<Fn, FuncPreMulSum<T>>::value ||
std::is_same<Fn, FuncSumPostDiv<T>>::value;
static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
std::is_same<Fn, FuncMax<T>>::value;
static constexpr bool IsMinMax = std::is_same<Fn, FuncMinMax<T>>::value;
static constexpr bool IsFloat = IsFloatingPoint<T>::value;
static constexpr int BigPackSize =
IsFloat && IsSum && sizeof(T) < 8 ? 16 :
IsFloat && IsSum ? 8 :
IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :
!IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :
IsFloat && IsMinMax && sizeof(T)==2 ? 16 :
!IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) :
/*multimem.ld_reduce not supported:*/ 0;
};
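Working the BigPackSize formula above through a few cases (with sizeof(half) == 2, sizeof(float) == 4, sizeof(double) == 8): FuncSum<float> selects 16 bytes (the v4.f32 variant), FuncSum<double> selects 8, FuncMinMax<half> and FuncMinMax<__nv_bfloat16> select 16 (v4 of the packed x2 types), integer FuncSum/FuncMinMax with sizeof(T) >= 4 select sizeof(T), and everything else (for example FuncProd, or 1-byte integer types) gets 0, meaning multimem.ld_reduce is not used for it.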
DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32)
DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32)
DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32)
DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64)
DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64)
DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64)
DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem_sum(float, f32, u32)
DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
DEFINE_Apply_LoadMultimem_sum(double, f64, u64)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32)
DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32)
#if defined(__CUDA_BF16_TYPES_EXIST__)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32)
DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32)
#endif
#else
template<typename Fn>

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "primitives.h"
@ -98,33 +98,69 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int rank = ncclShmem.comm.rank;
const int nranks = ncclShmem.comm.nRanks;
const int nThreadsScatter = 128 + WARP_SIZE;
const int nThreadsReduce = 384;
  /* For direct (registered-buffer) NVLS we only need one warp on the scatter side, used for synchronization;
   * otherwise we allocate 7 or 5 warps (depending on the number of ranks) to the reduce side to saturate
   * bandwidth, and the remaining warps go to scatter. */
const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndScatter) {
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
if (!args->regUsed) {
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
// Reduce through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.recv(offset, nelem);
}
}
} else if (tid < tidEndReduce) {
// Reduce through NVLS
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem);
} else {
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
prims.scatter(0, 0, 0, 0, -1, 0);
}
/* gather used as sync */
prims.gather(0, 0, 0, 0, -1, 0);
} else if (tid < tidEndReduce) {
// Reduce through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t outOffset = gridOffset + bid * chunkSize;
ssize_t inpOffset = outOffset + rank * size;
int nelem = min(chunkSize, size - outOffset);
prims.directRecvCopy(inpOffset, outOffset, nelem);
}
/* send for sync */
prims.send(0, 0);
}
}
}
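To make the thread-partitioning comment above concrete (assuming WARP_SIZE == 32 and NCCL_MAX_NTHREADS == 640, their values elsewhere in NCCL at the time of writing): in the registered-buffer (direct NVLS) case, nThreadsReduce = 640 - 32 = 608 and nThreadsScatter = 32, so a single warp handles the scatter-side synchronization; otherwise, with nranks <= 6, nThreadsReduce = 7 * 32 = 224 and nThreadsScatter = 416, and with more ranks nThreadsReduce = 5 * 32 = 160 and nThreadsScatter = 480.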

@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "primitives.h"
@ -26,7 +26,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1);
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
@ -45,7 +45,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1);
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);

@ -11,83 +11,16 @@
#include "bootstrap.h"
#include "channel.h"
#include "cudawrap.h"
#include "transport.h"
#include <cstring> // std::memcpy
#include <cinttypes> // PRIx64
static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t);
struct ncclKernelMatch {
void* kernelFn;
bool specialized;
};
// Only generate inline kernels for LL
#define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \
/*LL */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \
/*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \
/*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}
#define NCCL_FUNC4(func, devredop, type, specialized) \
NCCL_FUNC5(func, TREE, devredop, type, specialized), \
NCCL_FUNC5(func, RING, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \
NCCL_FUNC5(func, NVLS, devredop, type, specialized), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, specialized)
#ifdef __CUDA_BF16_TYPES_EXIST__
#define HAVE_BFLOAT16 1
#else
#define HAVE_BFLOAT16 0
#endif
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3(func, devredop, reduction, specialized) \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \
NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \
MACRO_IF(HAVE_BFLOAT16, \
SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \
/*nothing*/ \
)
// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums.
#define NCCL_FUNCS2(func, reduction) \
NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \
NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \
NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \
NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \
NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \
NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0) /*SumPostDiv*/
// Must be consistent with the ncclFuncSet enum
static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
{(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true},
// We don't bake special kernels for the one-rank reductions
{/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
{/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
#if HAVE_BFLOAT16
{/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
#endif
NCCL_FUNCS2(Broadcast, /*reduction=*/0),
NCCL_FUNCS2(Reduce, /*reduction=*/1),
NCCL_FUNCS2(AllGather, /*reduction=*/0),
NCCL_FUNCS2(ReduceScatter, /*reduction=*/1),
NCCL_FUNCS2(AllReduce, /*reduction=*/1)
enum ncclRegBufferType {
NCCL_REGULAR_BUFFER = 0,
NCCL_IPC_REG_BUFFER = 1,
NCCL_NVLS_REG_BUFFER = 2,
NCCL_REG_BUFFER_NUM = 3
};
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
@ -96,19 +29,14 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
// Returns maximum kernel stack size of all CUDA kernels
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
ncclResult_t result = ncclSuccess;
if (maxStackSize) *maxStackSize = 0;
int carveout = ncclParamL1SharedMemoryCarveout();
// Keep track if we already visited a function pointer.
void* lru[2] = {nullptr, nullptr};
for (int i=0; i < KernelCount; i++) {
void* fn = ncclKerns[i].kernelFn;
if (fn == lru[0] || fn == lru[1]) goto next_kernel;
lru[1] = lru[0];
lru[0] = fn;
for (int k=0; k < ncclDevKernelCount; k++) {
void* fn = ncclDevKernelList[k];
if (fn == nullptr) continue;
if (maxStackSize) {
cudaFuncAttributes attr = {0};
@ -116,14 +44,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
ignore0:;
}
if (carveout) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
result, ignore1);
ignore1:;
}
if (ncclShmemDynamicSize(cudaArch) != 0) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
@ -218,7 +144,7 @@ static void appendWorkElemP2p(
struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
struct ncclWorkElemP2p const *elem, bool fuseOk
) {
constexpr int funcIndex = FUNC_INDEX_P2P;
int funcIndex = ncclDevFuncId_P2p();
struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
if (q && funcIndex == q->work.header.funcIndex) {
@ -240,7 +166,7 @@ static void appendWorkElemP2p(
}
q = ncclMemoryStackAlloc<struct ncclWorkList>(&comm->memScoped);
q->work.header.type = ncclWorkTypeP2p;
q->work.header.funcIndex = FUNC_INDEX_P2P;
q->work.header.funcIndex = ncclDevFuncId_P2p();
chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0;
chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1;
q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment
@ -265,7 +191,7 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
static ncclResult_t addCollToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
int nCollChannels, int nBid, size_t bytes, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[]
) {
struct ncclKernelPlan::Channel *chans = plan->channels;
@ -307,10 +233,9 @@ static ncclResult_t addCollToPlan(
// Add work elem
*nWorkBudget += chans[c].nWork;
if (!regBufUsed) {
if (regBufType == NCCL_REGULAR_BUFFER) {
appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid);
} else {
// Buffer registration in play which could only for CollNet at the moment.
} else if (regBufType == NCCL_IPC_REG_BUFFER) {
struct ncclChannel* channel = &comm->channels[c];
struct ncclWorkElemReg workElemReg;
workElemReg.elem = *workElem; // C++ struct assignment
@ -330,6 +255,18 @@ static ncclResult_t addCollToPlan(
workElemReg.upOutputs[i] = regBufRecv[j];
}
appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid);
} else if (regBufType == NCCL_NVLS_REG_BUFFER) {
struct ncclWorkElemReg workElemReg;
workElemReg.elem = *workElem; // C++ struct assignment
workElemReg.elem.regUsed = 1;
/* NVLS only has one send and recv buffer registered */
workElemReg.dnInputs[0] = regBufSend[0];
workElemReg.dnOutputs[0] = regBufRecv[0];
appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid);
} else {
/* impossible value */
WARN("Invalid regBufType %d\n", regBufType);
return ncclInvalidArgument;
}
*nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork
@ -417,68 +354,118 @@ static void finishPlan(struct ncclKernelPlan* plan) {
plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE);
}
int64_t ncclParamLocalRegister();
NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1);
static ncclResult_t registerIntraNodeBuffers(
struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info,
bool* outRegBufUsed,
void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS]
void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
ncclRegBufferType *outRegBufType
) {
*outRegBufUsed = false;
ncclResult_t result = ncclSuccess;
*outRegBufType = NCCL_REGULAR_BUFFER;
#if CUDART_VERSION >= 11030
int localRank = comm->localRank;
if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
bool regBufUsed = false;
const void *sendbuff = info->sendbuff;
void *recvbuff = info->recvbuff;
cudaPointerAttributes sattr, rattr;
bool query = false;
if (info->coll == ncclFuncAllGather)
sendbuff = NULL;
else if (info->coll == ncclFuncReduceScatter)
recvbuff = NULL;
if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
/* first try local registration. */
if (ncclParamLocalRegister()) {
CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
query = true;
if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice)
ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv);
}
struct HandlePair {
cudaIpcMemHandle_t ipc[2]; // {send, recv}
size_t offset[2]; // {send, recv}
};
struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) {
if (!query) {
CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
}
if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice)
ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv);
}
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
if (regBufUsed) {
      /* Tweak NVLS channel usage; for registered NVLS buffers, 4 or 5 channels are enough to
       * saturate bandwidth. */
if (info->coll == ncclFuncReduceScatter)
info->nChannels = std::min(5, comm->nvlsChannels);
else
info->nChannels = std::min(4, comm->nvlsChannels);
*outRegBufType = NCCL_NVLS_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers
plan->persistent && 0) {
/* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
int localRank = comm->localRank;
cudaPointerAttributes sattr, rattr;
void *baseSend, *baseRecv;
size_t size;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
// Open handles locally
for (int i=0; i < comm->localRanks; i++) {
if (i == localRank) { // Skip self
outRegBufSend[i] = nullptr;
outRegBufRecv[i] = nullptr;
} else {
for (int sr=0; sr < 2; sr++) {
// Get base address of mapping
void* base;
CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
// Get real buffer address by adding offset in the mapping
(sr==0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
// Enqueue reminder to close memory handle
struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
q->ptr = base;
ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
struct HandlePair {
cudaIpcMemHandle_t ipc[2]; // {send, recv}
size_t offset[2]; // {send, recv}
};
struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
void *baseSend, *baseRecv;
size_t size;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
// Open handles locally
for (int i=0; i < comm->localRanks; i++) {
if (i == localRank) { // Skip self
outRegBufSend[i] = nullptr;
outRegBufRecv[i] = nullptr;
} else {
for (int sr=0; sr < 2; sr++) {
// Get base address of mapping
void* base;
CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
// Get real buffer address by adding offset in the mapping
(sr==0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
// Enqueue reminder to close memory handle
struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
q->ptr = base;
ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
}
}
}
*outRegBufType = NCCL_IPC_REG_BUFFER;
}
*outRegBufUsed = true;
fallback:
#endif
return result;
}
NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0);
static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport);
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps);
static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport);
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps);
static ncclResult_t scheduleCollTasksToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget
@ -517,6 +504,7 @@ static ncclResult_t scheduleCollTasksToPlan(
int nAggChannels = 0;
int nAggOps = 1;
struct ncclTaskColl* aggEnd = head->next;
int nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo.opFull.op, aggInfo.datatype);
int collNetSupport = 0;
NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport));
@ -537,7 +525,7 @@ static ncclResult_t scheduleCollTasksToPlan(
NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks));
aggInfo.nChannels = std::min(comm->nChannels, nAggChannels);
int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels);
NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel));
NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, nvlsSupport, opPerChannel));
}
while (head != aggEnd) {
@ -566,23 +554,26 @@ static ncclResult_t scheduleCollTasksToPlan(
int workFuncIndex;
struct ncclWorkElem workElem = {};
struct ncclProxyOp proxyOp = {};
NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp));
// Check whether algo and proto have been preset (as in aggregation case)
// If so, skip the calculation
if (info.nChannels <= 0 || info.nThreads <= 0) {
NCCLCHECK(getAlgoInfo(&info, collNetSupport, nvlsSupport, 1));
}
if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan()
bool regBufUsed = false;
/* if possible, start registration */
ncclRegBufferType regBufType = NCCL_REGULAR_BUFFER;
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
if (plan->persistent && ncclParamGraphRegister() &&
info.algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
}
registerIntraNodeBuffers(comm, plan, &info, regBufSend, regBufRecv, &regBufType);
NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp));
int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
maxChannels, info.nChannels, info.nBytes, regBufType, regBufSend, regBufRecv));
tasks->nTasksColl -= 1;
tasks->collBytesTotal -= info.nBytes;
ncclIntruQueueDequeue(&tasks->collQueue);
@ -590,8 +581,8 @@ static ncclResult_t scheduleCollTasksToPlan(
plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads);
if (!plan->kernelSpecialized) {
plan->kernelFn = ncclKerns[workFuncIndex].kernelFn;
plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized;
plan->kernelFn = ncclDevKernelForFunc[workFuncIndex];
plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[workFuncIndex];
}
}
}
@ -619,8 +610,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS);
if (!plan->kernelSpecialized) {
plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn;
plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized;
plan->kernelFn = ncclDevKernelForFunc[ncclDevFuncId_P2p()];
plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[ncclDevFuncId_P2p()];
}
// Compute how much to split operations
@ -893,6 +884,13 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr));
ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q);
}
/* free mcHandle */
while (!ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) {
struct ncclNvlsMcHandleList* obj = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue);
NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size));
INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj);
}
}
ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, &plan->memPool_ncclProxyOp);
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
@ -1142,45 +1140,64 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) {
// Translate ncclAvg and PreMulSum
ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
*collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype];
*collNetSupport = info->comm->collNetSupport && info->comm->collNetSupportMatrix[netOp][info->datatype];
return ncclSuccess;
}
// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
static ncclResult_t topoGetAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) {
struct ncclComm* comm = info->comm;
if (comm->nRanks == 1) {
info->algorithm = NCCL_ALGO_RING;
info->protocol = NCCL_PROTO_SIMPLE;
}
else {
else if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
float backupMinTime = 3600000000.0;
bool backup = false;
    int backupAlgo = NCCL_ALGO_UNDEF; // backup algo and proto, used if no algo/proto gets picked below.
int backupProto = NCCL_PROTO_UNDEF;
// Find algorithm / protocol.
info->algorithm = -1;
info->protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && nvlsSupport != 1 && info->coll != ncclFuncAllGather) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
/* now we only support single-node NVLS allgather and reducescatter */
if (a == NCCL_ALGO_NVLS && (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
if (a == NCCL_ALGO_NVLS_TREE && nvlsSupport != 1) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
if (time >= 0 && time < minTime) {
info->algorithm = a;
info->protocol = p;
minTime = time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time, &backup));
if (!backup) {
if (time >= 0 && time < minTime) {
info->algorithm = a;
info->protocol = p;
minTime = time;
}
} else {
if (time >= 0 && time < backupMinTime) {
backupAlgo = a;
backupProto = p;
backupMinTime = time;
}
}
}
}
if (info->algorithm == -1 || info->protocol == -1) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
}
info->algorithm = backupAlgo;
info->protocol = backupProto;
}
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
@ -1222,6 +1239,25 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
return ncclSuccess;
}
// Use the default topo-based tuner if tuner plugin is not successful.
// Call the plugin first. Let it set algo+proto, and/or nChannels.
// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto.
// Finally, nChannels will be overridden by the plugin setting.
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) {
info->algorithm = NCCL_ALGO_UNDEF;
info->protocol = NCCL_PROTO_UNDEF;
int nChannels = 0;
if (info->comm->tuner != NULL) {
NCCLCHECK(info->comm->tuner->getCollInfo(
info->coll, info->nBytes,
collNetSupport, nvlsSupport, numPipeOps,
&info->algorithm, &info->protocol, &nChannels));
}
NCCLCHECK(topoGetAlgoInfo(info, collNetSupport, nvlsSupport, numPipeOps));
if (nChannels) info->nChannels = nChannels; // Set by plugin; override default.
return ncclSuccess;
}
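// Illustrative sketch of a tuner callback matching the getCollInfo() call above
// (the function name and how it would be wired into comm->tuner are hypothetical;
// only the argument list is taken from the call site). It may set algorithm and
// protocol and/or nChannels, or leave them UNDEF/0 so topoGetAlgoInfo() keeps the
// topo-based defaults.
static ncclResult_t exampleTunerGetCollInfo(ncclFunc_t coll, size_t nBytes,
    int collNetSupport, int nvlsSupport, int numPipeOps,
    int* algorithm, int* protocol, int* nChannels) {
  *algorithm = NCCL_ALGO_UNDEF;
  *protocol = NCCL_PROTO_UNDEF;
  *nChannels = 0; // 0 = no override, see getAlgoInfo() above
  if (coll == ncclFuncAllReduce && nBytes <= 65536) {
    // Small allreduce: pin ring + LL, keep the default channel count.
    *algorithm = NCCL_ALGO_RING;
    *protocol = NCCL_PROTO_LL;
  }
  return ncclSuccess;
}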
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
switch (info->coll) {
case ncclFuncBroadcast:
@ -1275,14 +1311,6 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
}
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
int collNetTypeSupport = 0;
// Check whether algo and proto have been preset (as in aggregation case)
// If so, skip the calculation
if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
comp_next:
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
@ -1295,14 +1323,7 @@ comp_next:
work->nWarps = info->nThreads / WARP_SIZE;
work->redOpArg = info->opFull.scalarArg;
work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
if (info->comm->nRanks == 1) {
// one-rank reduce index
*workFuncIndex = 1 + int(info->datatype);
return ncclSuccess;
}
*workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
*workFuncIndex = ncclDevFuncId(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@ -1337,6 +1358,7 @@ comp_next:
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
int maxChunkSize = 131072;
if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@ -1347,6 +1369,7 @@ comp_next:
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if (info->comm->nNodes >= 4) chunkSize = 65536;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
@ -1381,7 +1404,7 @@ comp_next:
proxyOp->protocol = info->protocol;
proxyOp->dtype = info->datatype;
proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
info->op;
info->opFull.proxyOp;
proxyOp->pattern = info->pattern;
proxyOp->root = info->root;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
@ -1399,27 +1422,37 @@ static ncclResult_t hostToDevRedOp(
ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm
) {
union {
int8_t i8;
uint8_t u8;
int32_t i32;
uint32_t u32;
int64_t i64;
uint64_t u64;
half f16;
int8_t i8; uint8_t u8;
int32_t i32; uint32_t u32;
int64_t i64; uint64_t u64;
half f16; float f32; double f64;
#if defined(__CUDA_BF16_TYPES_EXIST__)
__nv_bfloat16 bf16;
#endif
float f32;
double f64;
void *ptr;
};
u64 = 0;
opFull->scalarArgIsPtr = false;
opFull->proxyOp = op;
int nbits = 8*ncclTypeSize(datatype);
uint64_t allBits = uint64_t(-1)>>(64-nbits);
uint64_t signBit = allBits^(allBits>>1);
switch (int(op)) {
case ncclSum: opFull->op = ncclDevSum; break;
case ncclProd: opFull->op = ncclDevProd; break;
case ncclMax: opFull->op = ncclDevMax; break;
case ncclMin: opFull->op = ncclDevMin; break;
case ncclMin:
case ncclMax:
opFull->op = ncclDevMinMax;
opFull->scalarArg = 0;
// The xormask used by ncclFuncMinMax<[u]int> is the XOR of the sign bit
      // for signed (as opposed to unsigned) types and all the bits for max (as opposed to min).
if (datatype==ncclInt8 || datatype==ncclInt32 || datatype==ncclInt64) {
opFull->scalarArg ^= signBit;
}
opFull->scalarArg ^= (op == ncclMax) ? allBits : 0;
break;
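      // Worked illustration of the mapping above (host-side, not the device code):
      // the reduction can be carried out as an unsigned min over (x ^ xormask),
      // XOR-ing the winner back afterwards, i.e. result = umin(a^m, b^m) ^ m.
      //   int8 example (allBits=0xff, signBit=0x80):
      //     signed max(-3,5): m=0x7f, 0xfd^m=0x82, 0x05^m=0x7a -> umin=0x7a -> 0x7a^m=0x05 (5)
      //     signed min(-3,5): m=0x80, 0xfd^m=0x7d, 0x05^m=0x85 -> umin=0x7d -> 0x7d^m=0xfd (-3)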
case ncclAvg:
switch ((int)datatype) {
case ncclInt8: case ncclInt32: case ncclInt64:
@ -1513,12 +1546,8 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
struct ncclDevRedOpFull opFull;
NCCLCHECK(hostToDevRedOp(&opFull, info->op, info->datatype, comm));
  // User-defined reduction ops may need to alter the data even for unitary reductions
if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) {
if (info->sendbuff != info->recvbuff) {
size_t bytes = info->count*ncclTypeSize(info->datatype);
CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream));
}
if (comm->nRanks == 1) {
NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opFull, info->datatype, info->stream));
return ncclSuccess;
} else {
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.

View File

@ -370,13 +370,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
}
for (int r=0; r<nranks; r++) {
ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
}
}
for (int c=0; c<graphs[NCCL_ALGO_NVLS]->nChannels; c++) {
for (int n=0; n<nNodes; n++) {
int r = firstRanks[n];
nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));

View File

@ -70,7 +70,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
if (remNode->links[l].remNode == node) {
if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) {
remPath->list[0] = remNode->links+l;
break;
}
@ -126,7 +126,7 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
for (int i=0; i<node->paths[t][n].count; i++) {
struct ncclTopoLink* link = node->paths[t][n].list[i];
struct ncclTopoNode* remNode = link->remNode;
sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
sprintf(line+offset, "--%s(%g)->%s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], remNode->id);
offset = strlen(line);
}
INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
@ -212,14 +212,14 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (*level == -1) {
int l = -1;
if (disableEnv) {
char* str = getenv(disableEnv);
const char* str = ncclGetEnv(disableEnv);
if (str) {
int disable = strtol(str, NULL, 0);
if (disable == 1) l = 0;
}
}
if (l == -1) {
char* str = getenv(levelEnv);
const char* str = ncclGetEnv(levelEnv);
if (str) {
for (int i=0; i<=PATH_SYS; i++) {
if (strcmp(str, topoPathTypeStr[i]) == 0) {
@ -318,14 +318,15 @@ compare:
status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
good &= status == NVML_P2P_STATUS_OK;
if (!good) {
if (ncclParamIgnoreDisabledP2p()) {
*p2p = 0;
} else if (path->type <= PATH_NVB) {
WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
return ncclUnhandledCudaError;
} else if (path->type < PATH_SYS) {
INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
if (!ncclParamIgnoreDisabledP2p()) {
if (path->type <= PATH_NVB) {
WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
return ncclUnhandledCudaError;
} else if (path->type < PATH_SYS) {
          INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can suppress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
}
}
*p2p = 0;
}
}
}
@ -360,7 +361,8 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
if (gdrReadParam == 0) return ncclSuccess;
if (gdrReadParam < 0) {
// Disable GDR Reads pre-Ampere when we have other PCI flows
if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) {
int nvlink = 0;
// Since we don't know whether there are other communicators,
// it's better to keep things local if we have a single GPU.
@ -400,7 +402,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
}
// Set to 0 to disable the flush on Hopper when using GDR
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
// Determine whether we need to flush the GDR recv buffers
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {

View File

@ -49,10 +49,10 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
return ncclSuccess;
}
static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) {
static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) {
for (int l=0; l<node2->nlinks; l++) {
struct ncclTopoLink* link = node2->links+l;
if (link->remNode == node1) {
if (link->remNode == node1 && link->type == type) {
*revLink = link;
return ncclSuccess;
}
@ -85,11 +85,11 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
float fwBw = link->type == LINK_PCI ? pciBw : bw;
float revBw = 0;
if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
revBw += fwBw/8;
}
if (link->remNode->type == CPU && link->type == LINK_NVL) {
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) {
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
revBw += fwBw;
}
if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
} else {
for (int i=0; i<count; i++) next[i] = scores[i].g;
}
if (system->nodes[NVS].count) {
    // NVSwitches prefer that we talk to a limited set of peers. Try the neighbors first.
int index = gpu-system->nodes[GPU].nodes;
int i;
int prevGpu = (index-1+ngpus)%ngpus;
int nextGpu = (index+1)%ngpus;
int firstGpus[2];
int firstGpuCount = 0;
if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
} else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
} else {
firstGpus[0] = nextGpu; firstGpuCount = 1;
}
for (int g=0; g<firstGpuCount; g++) {
for (i=0; i<count && next[i] != firstGpus[g]; i++);
if (i<count) {
for (; i>0; i--) next[i] = next[i-1];
next[0] = firstGpus[g];
}
}
}
*countPtr = count;
return ncclSuccess;
}
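// Standalone sketch of the move-to-front step used in the NVSwitch branch above:
// locate val in arr and rotate it to index 0, preserving the relative order of the
// other entries (names are illustrative; the loop above does this in place on next[]).
static void moveToFront(int* arr, int count, int val) {
  int i;
  for (i = 0; i < count && arr[i] != val; i++);
  if (i < count) {
    for (; i > 0; i--) arr[i] = arr[i-1];
    arr[0] = val;
  }
}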
@ -267,7 +293,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
// Try to keep all searches within one second
#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16)
#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<19)
#define NCCL_SEARCH_TIMEOUT (1<<14)
#define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
@ -342,6 +368,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1;
return ncclSuccess;
}
// 2. Try to get better bandwidth
@ -358,30 +385,27 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
return ncclSuccess;
}
// Build a list of the best NETs to try.
// Build a sorted list of the NETs to try.
//
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
// index when trying to get back to the NIC.
//
// The list is built the following way:
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
// might have been choosen by GPU 0 (case with multiple independent communicators per node)
// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
// 2. Add other NETs satisfying typeInter that are not already in the list.
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
int netCount = 0;
int localNetCount;
int* localNets;
NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));
// First add the preferred NICs
for (int g=0; g<system->nodes[GPU].count; g++) {
if (gpu != -1 && gpu != g) continue;
localNetCount = 0;
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
for (int c = 0;; c++) {
for (int c = 0; c<MAXCHANNELS; c++) {
int netId;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
@ -451,11 +475,11 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
int startNetIndex;
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
int netcount;
int netCount;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
for (int i=0; i<netcount; i++) {
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
for (int i=0; i<netCount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
@ -523,12 +547,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
const int bw = graph->bwInter;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netcount;
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
for (int i=0; i<netcount; i++) {
int n = nets[i];
int netCount;
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
for (int i=0; i<netCount; i++) {
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue;
int n = nets[(graph->nChannels+i)%netCount];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoNode* gpu;
if (graph->collNet && net->net.collSupport == 0) continue;
if (net->net.bw < bw) continue;
@ -542,12 +566,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
// NVLS needs to balance on all NICs
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
if (graph->nChannels < netcount) {
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
if (graph->nChannels < netCount) {
int gpu;
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
}
} else {
if (graph->nChannels > 0) {
@ -557,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0) {
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
      // Always try the PCI order first to set a reference, but don't count it toward the timeout nor let it run for long
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@ -577,18 +601,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
if (maxBw >= bw) {
// In the first loop, avoid using GPUs in both directions between channels (one channel
// sending from that GPU and one channel receiving to that GPU), since that usually leads
// to lower BW.
for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].bw == maxBw && paths[g].count == minHops) {
gpu = system->nodes[GPU].nodes+g;
int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
if (tryGpuBidir == gpuUsed) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
for (int i=0; i<system->nodes[GPU].count; i++) {
int g = (graph->nChannels+i)%system->nodes[GPU].count;
if (paths[g].bw == maxBw && paths[g].count == minHops) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
}
@ -804,33 +820,50 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
return ncclSuccess;
}
ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}
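// Worked example of the duplication above (illustrative numbers): with ccMin=90,
// nChannels=4, maxChannels=32 and bwIntra=bwInter=40.0, none of the early returns
// trigger, so dupChannels=min(4*2,32)=8, the intra/inter arrays are copied once,
// and both per-channel bandwidths are divided by DIVUP(8,4)=2 down to 20.0, keeping
// the aggregate bandwidth unchanged.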
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
graph->crossNic = ncclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
int crossNic = (system->nodes[NET].count > 1) &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
graph->crossNic = crossNic == 1 ? 1 : 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
graph->nChannels = 0;
int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
graph->sameChannels = trySameChannels;
char* str = getenv("NCCL_GRAPH_FILE");
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel));
const char* str = ncclGetEnv("NCCL_GRAPH_FILE");
if (str) {
INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
struct ncclXml* xml;
@ -846,6 +879,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
// NVLS search must have ngpus heads at most.
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count;
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
@ -884,7 +919,7 @@ search:
NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
@ -901,8 +936,9 @@ search:
if (pass == 1) {
// First pass, we don't have a solution yet ; try other options
// Try having different channels
if (tmpGraph.sameChannels == 1) {
// Try having different channels (except when going through AMD CPUs)
if (tmpGraph.sameChannels == 1 &&
!(cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD && tmpGraph.typeIntra == PATH_SYS)) {
tmpGraph.sameChannels = 0;
goto search;
}
@ -932,12 +968,12 @@ search:
}
tmpGraph.typeInter = PATH_PIX;
if (crossNic && tmpGraph.crossNic == 0) {
if (crossNic == 2 && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
tmpGraph.crossNic = 1;
goto search;
}
tmpGraph.crossNic = 0;
tmpGraph.crossNic = crossNic == 1 ? 1 : 0;
// Decrease bw until we find a solution
if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
@ -954,6 +990,7 @@ done:
// We have a solution. Start from that solution and move to pass 2.
if (pass == 1) {
time = -1;
NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
speedIndex = 0;
while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@ -962,13 +999,22 @@ done:
pass = 2;
}
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
if (pass == 2) {
if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
speedIndex > 0) {
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
// See if we can increase bw
if (time != 0 && speedIndex > 0) {
if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
// increase bw for Ring
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
} else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
// increase bwIntra for trees (2 nodes or collnet)
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
@ -982,18 +1028,6 @@ done:
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}
@ -1023,7 +1057,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
}
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
char* str = getenv("NCCL_GRAPH_DUMP_FILE");
const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
if (str) {
INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
struct ncclXml* xml;

View File

@ -72,6 +72,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
*bw = AMD_BW;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
}
@ -540,6 +543,36 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
return ncclSuccess;
}
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
if (strcmp(node->name, "c2c") == 0) {
struct ncclTopoNode* gpu = NULL;
int64_t pBusId;
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx", pBusId);
return ncclInternalError;
}
int count = 0;
NCCLCHECK(xmlGetAttrInt(node, "count", &count));
int bw = 0;
NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
double c2cBw = (bw*count)/1000.0;
struct ncclTopoNode* cpu = NULL;
NCCLCHECK(findLocalCpu(gpu, &cpu));
if (cpu == NULL) return ncclSuccess;
NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
} else {
const char* busId;
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
for (int s=0; s<node->nSubs; s++) {
NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId));
}
}
return ncclSuccess;
}
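// Illustrative fragment of the topology XML consumed above (attribute values are
// made up; per the NVML-based detection further down, "count" is the number of
// active C2C links and "bw" the per-link bandwidth it reports). A GPU with 10 links
// of 45000 would show as:
//   <gpu dev="0" ...>
//     <c2c count="10" bw="45000"/>
//   </gpu>
// which ncclTopoAddC2c() converts into a bw*count/1000.0 = 450.0 GB/s LINK_NVL edge
// between that GPU and its local CPU node.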
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
NCCLCHECK(ncclCalloc(topoSystem, 1));
struct ncclXmlNode* topNode;
@ -549,6 +582,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
}
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL));
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
@ -595,7 +629,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
@ -668,7 +702,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
@ -704,7 +738,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
int* localNets;
int localNetCount;
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
int* localGpus;
int* localGpus = NULL;
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
@ -717,17 +751,25 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
}
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
int netIndex;
NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
int* localGpus = NULL;
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
for (int c=0; c<MAXCHANNELS; c++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int lg=0; lg<localGpuCount; lg++) {
int g = localGpus[lg];
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
int id;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
if (net == id) {
*gpuIndex = g;
free(localGpus);
return ncclSuccess;
}
}
}
free(localGpus);
*gpuIndex = -1;
return ncclSuccess;
}
@ -836,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
if (ccMax) *ccMax = max;
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
*localRank = g;
return ncclSuccess;
}
}
WARN("Could not find local GPU with rank %d", rank);
return ncclInternalError;
}

View File

@ -18,6 +18,7 @@
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define AMD_BW 16.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0

View File

@ -5,7 +5,7 @@
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "device.h"
#include "comm.h"
#include "topo.h"
@ -54,9 +54,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
{ 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
{ 6.8, 14.0, 0 }, { 6.8, 14.0, 0 }, // Collnet Direct, Chain
{ 0, 0, 23.0 }, { 0, 0, 23.0 }}; // NVLS, NVLS Tree
{ 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
{ 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain
{ 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@ -64,17 +64,17 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
#define NCCL_HW_NET 2
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 4.75 },
/* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
/* NVLS */ { 0, 0, 23 }, /* NVLSTree */ { 0, 0, 23 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 6 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
/* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
/* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 14 },
/* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 19 } }
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
/* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
/* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
};
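// Example of reading the table above: with the updated values, one Simple-protocol
// ring step costs hwLat[NCCL_HW_NET][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 14.0 us
// over the network versus hwLat[NCCL_HW_NVLINK][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 3.4 us
// over NVLink; the latency loops below multiply these per-step costs by the number
// of intra- and inter-node steps of each algorithm.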
/* Array indexes used below */
@ -165,13 +165,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
float busBw = graphs[a]->nChannels * bw;
// Various model refinements
@ -194,10 +196,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Convert bus BW to algorithm BW
float ratio;
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0;
else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0;
else ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio;
/* Ring bandwidth backup */
if (a == NCCL_ALGO_RING)
comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
@ -229,13 +233,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
} else if (a == NCCL_ALGO_NVLS) {
if (nNodes > 1) comm->latencies[coll][a][p] += hwLat[NCCL_HW_NET][a][p];
comm->latencies[coll][a][p] = intraLat;
if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
} else if (a == NCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += 2*(nNodes-1)*hwLat[NCCL_HW_NET][a][p];
comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
}
}
}
@ -246,12 +251,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
const char *protoStr = ncclGetEnv("NCCL_PROTO");
if (protoStr) {
INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
}
const char *algoStr = getenv("NCCL_ALGO");
const char *algoStr = ncclGetEnv("NCCL_ALGO");
if (algoStr) {
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
@ -293,11 +298,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
}
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
        // Never disable ring for non-allreduce operations. That allows running real apps with NCCL_ALGO=TREE.
if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
bool available = false;
for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++)
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
if (comm->bandwidths[c][a][p] != 0) {
available = true;
goto check_avail;
}
check_avail:
if (available == false) {
      /* fall back to making the ring algorithm available */
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p];
}
}
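    // Example of when this fallback triggers: if NCCL_ALGO restricts a collective to
    // an algorithm this system cannot run (its bandwidth entries are already 0), the
    // loop above finds no valid (algorithm, protocol) pair and restores the ring
    // bandwidths saved in comm->ringbdw, so the collective can still run; such picks
    // are reported through the backup flag of ncclTopoGetAlgoTime() further down.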
if (comm->rank == 0) {
char line[1024];
for (int block=0; block<2; block++) {
@ -346,7 +365,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512;
// Override defaults with user env
char* str = getenv("NCCL_THREAD_THRESHOLDS");
const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS");
if (str) {
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
@ -378,9 +397,19 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 }
};
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
float lat = info->comm->latencies[info->coll][algorithm][protocol];
if (backup) {
*backup = false;
if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
/* try back up RING algorithm */
bw = info->comm->ringbdw[info->coll][protocol];
*backup = true;
}
}
if (bw == 0) {
*time = -1.0; return ncclSuccess;
}

View File

@ -254,9 +254,13 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX
return ncclSuccess;
}
ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return ncclSuccess;
}
ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } };
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } };
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
return ncclSuccess;
}
@ -687,6 +691,41 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
}
}
}
#if CUDART_VERSION >= 11080
struct ncclXmlNode* c2cNode = NULL;
NCCLCHECK(xmlGetSub(gpuNode, "c2c", &c2cNode));
if (c2cNode == NULL) {
if (sm >= 90) {
int c2cLinksCount = 0;
nvmlFieldValue_t fv;
fv.fieldId = NVML_FI_DEV_C2C_LINK_COUNT;
if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) {
c2cLinksCount = fv.value.uiVal;
int bw = 0;
int count = 0;
for (int l=0; l<c2cLinksCount; l++) {
nvmlFieldValue_t fvs[2];
fvs[0].fieldId = NVML_FI_DEV_C2C_LINK_GET_STATUS;
fvs[0].scopeId = l;
fvs[1].fieldId = NVML_FI_DEV_C2C_LINK_GET_MAX_BW;
fvs[1].scopeId = l;
if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 2, fvs) == ncclSuccess) &&
(fvs[0].nvmlReturn == NVML_SUCCESS) &&
(fvs[0].value.uiVal == 1) &&
(fvs[1].nvmlReturn == NVML_SUCCESS)) {
bw = fvs[1].value.uiVal;
count++;
}
}
if (count > 0) {
NCCLCHECK(xmlAddNode(xml, gpuNode, "c2c", &c2cNode));
NCCLCHECK(xmlSetAttrInt(c2cNode, "bw", bw));
NCCLCHECK(xmlSetAttrInt(c2cNode, "count", count));
}
}
}
}
#endif
// Fill target classes
for (int s=0; s<gpuNode->nSubs; s++) {
struct ncclXmlNode* sub = gpuNode->subs[s];

View File

@ -22,7 +22,6 @@ __thread int ncclGroupBlocking = -1; /* default mode */
__thread bool ncclGroupJobAbortFlag = false;
void* ncclAsyncJobMain(void* arg);
static ncclResult_t groupJobComplete(struct ncclGroupJob *job);
ncclResult_t ncclAsyncLaunch(
struct ncclAsyncJob* job,
@ -181,9 +180,28 @@ failure:
return result;
}
static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) {
static inline void groupResetJobState(struct ncclGroupJob* job) {
if (job) {
if (job->groupBlockingPtr) *job->groupBlockingPtr = -1;
if (job->abortFlagPtr) *job->abortFlagPtr = false;
if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess;
if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL;
if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL;
memset(job, 0, sizeof(struct ncclGroupJob));
}
return;
}
static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) {
struct ncclComm* comm = *groupCommHeadPtr;
/* reset all thread local variables */
*groupCommHeadPtr = NULL;
*groupCommPreconnectHeadPtr = NULL;
*groupErrorPtr = ncclSuccess;
*groupBlockingPtr = -1;
*groupJobAbortFlagPtr = false;
while (comm != nullptr) {
struct ncclComm* next = comm->groupNext;
(void) ncclGroupCommLeave(comm); // overwrites comm->groupNext
@ -233,16 +251,12 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
/* reset everything */
while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
*job->abortFlag = 1;
if (job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, error);
if (job->undo) job->undo(job);
if (job->destructor) job->destructor((void*)job);
}
*groupErrorPtr = ncclSuccess;
*groupCommHeadPtr = nullptr;
*groupCommPreconnectHeadPtr = nullptr;
return;
}
@ -325,9 +339,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
}
/* this atomic must happen before cleanup and setting state of communicators */
__atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE);
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (job->comm && !job->comm->config.blocking)
@ -345,16 +356,12 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
groupCommHeadMain = next;
}
*gjob->groupErrorPtr = ncclSuccess;
*gjob->groupCommHeadPtr = nullptr;
*gjob->groupCommPreconnectHeadPtr = nullptr;
CUDACHECK(cudaSetDevice(savedDev));
exit:
return ret;
fail:
groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret);
groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret);
goto exit;
}
@ -377,7 +384,8 @@ ncclResult_t ncclGroupEndInternal() {
ncclGroupJobMain.groupErrorPtr = &ncclGroupError;
ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs;
ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag;
ncclGroupJobMain.doneFlag = false;
ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking;
ncclGroupJobMain.initialized = true;
ncclGroupJobMainPtr = &ncclGroupJobMain;
/* make sure ncclGroupBlocking has been set. */
assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
@ -387,6 +395,7 @@ ncclResult_t ncclGroupEndInternal() {
ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
do {
NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail);
job->comm->groupJob = ncclGroupJobMainPtr;
job = job->next;
} while (job);
}
@ -395,30 +404,42 @@ ncclResult_t ncclGroupEndInternal() {
ncclComm_t comm = ncclGroupCommHead;
do {
NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail);
/* link group job to communicators. */
comm->groupJob = ncclGroupJobMainPtr;
comm = comm->groupNext;
} while (comm);
}
ncclGroupJobMainPtr->base.func = groupLaunch;
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
ret = ncclInProgress;
} else {
/* blocking group */
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
groupResetJobState();
groupResetJobState(ncclGroupJobMainPtr);
}
}
exit:
return ret;
fail:
groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret);
groupResetJobState();
groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret);
goto exit;
}
void ncclGroupJobAbort() {
ncclGroupJobAbortFlag = true;
(void) groupJobComplete(ncclGroupJobMainPtr);
/* reset group abort flag */
ncclGroupJobAbortFlag = false;
ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
ncclResult_t ret = ncclSuccess;
if (groupJob && groupJob->initialized) {
ret = ncclAsyncJobComplete(&groupJob->base);
groupResetJobState(groupJob);
}
return ret;
}
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
if (groupJob && groupJob->initialized) {
*groupJob->abortFlagPtr = true;
NCCLCHECK(ncclGroupJobComplete(groupJob));
}
return ncclSuccess;
}
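
The new ncclGroupJobComplete()/ncclGroupJobAbort() helpers are what the abort path drives for a nonblocking group. A minimal sketch of how an application-level watchdog might reach that path through the public API; the helper name and polling policy are invented, only the NCCL calls themselves are real:

#include <sched.h>
#include "nccl.h"

static ncclResult_t waitGroupOrAbort(ncclComm_t comm, long maxSpins) {
  ncclResult_t state = ncclInProgress;  // nonblocking ncclGroupEnd() returned ncclInProgress
  for (long spin = 0; spin < maxSpins && state == ncclInProgress; spin++) {
    ncclResult_t ret = ncclCommGetAsyncError(comm, &state);
    if (ret != ncclSuccess) return ret;
    if (state == ncclInProgress) sched_yield();
  }
  if (state == ncclInProgress) {
    // Still stuck: ncclCommAbort() raises the abort flag; the comm's linked
    // group job is then torn down via ncclGroupJobAbort() (see the
    // ncclCommEnsureReady() change later in this diff).
    return ncclCommAbort(comm);
  }
  return state;
}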

View File

@ -101,7 +101,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &prop, 0));
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
/* Now allow RW access to the newly mapped memory */
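
For context, a minimal sketch (error handling omitted, helper name invented) of the full cuMem sequence this hunk sits in, with the VA reservation now aligned to the allocation granularity instead of 0:

#include <cuda.h>

static CUresult vmmAlloc(void** ptr, size_t size, int dev) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;

  size_t gran = 0;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size = ((size + gran - 1) / gran) * gran;                 // round size up to granularity

  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, size, &prop, 0);                      // physical memory
  cuMemAddressReserve((CUdeviceptr*)ptr, size, gran, 0, 0);  // VA range aligned to granularity
  cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0);           // map VA -> physical

  CUmemAccessDesc access = {};
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  return cuMemSetAccess((CUdeviceptr)*ptr, size, &access, 1);  // allow RW access
}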

View File

@ -7,108 +7,7 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
enum ncclDevRedOp_t {
ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin,
ncclDevPreMulSum, ncclDevSumPostDiv,
ncclNumDevRedOps
};
struct ncclDevRedOpFull {
ncclDevRedOp_t op;
bool scalarArgIsPtr;
uint64_t scalarArg;
};
#define FUNC_INDEX_P2P 0
#define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \
ncclFunction_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \
ncclFunction_OneRankReduce_##devredop##_##type
#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations */
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
#define SINGLE_ARG(...) __VA_ARGS__
#define CONCAT(a,b) a##b
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f))
#define MACRO_IF_0(t, f) f
#define MACRO_IF_1(t, f) t
#define DECL4(func, algo, devredop, type, undef) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
#define DECL3(func, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef) \
DECL4(func, NVLS_TREE, devredop, type, undef)
#if defined(__CUDA_BF16_TYPES_EXIST__)
#define DECL2(func, devredop, undefForFloat) \
DECL3(func, devredop, int8_t, /*undef=*/0) \
DECL3(func, devredop, uint8_t, /*undef=*/0) \
DECL3(func, devredop, int32_t, /*undef=*/0) \
DECL3(func, devredop, uint32_t, /*undef=*/0) \
DECL3(func, devredop, int64_t, /*undef=*/0) \
DECL3(func, devredop, uint64_t, /*undef=*/0) \
DECL3(func, devredop, half, /*undef=*/undefForFloat) \
DECL3(func, devredop, float, /*undef=*/undefForFloat) \
DECL3(func, devredop, double, /*undef=*/undefForFloat) \
DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat)
#else
#define DECL2(func, devredop, undefForFloat) \
DECL3(func, devredop, int8_t, /*undef=*/0) \
DECL3(func, devredop, uint8_t, /*undef=*/0) \
DECL3(func, devredop, int32_t, /*undef=*/0) \
DECL3(func, devredop, uint32_t, /*undef=*/0) \
DECL3(func, devredop, int64_t, /*undef=*/0) \
DECL3(func, devredop, uint64_t, /*undef=*/0) \
DECL3(func, devredop, half, /*undef=*/undefForFloat) \
DECL3(func, devredop, float, /*undef=*/undefForFloat) \
DECL3(func, devredop, double, /*undef=*/undefForFloat)
#endif
#define DECL(func) \
DECL2(func, Sum, /*undefForFloat=*/0) \
DECL2(func, Prod, /*undefForFloat=*/0) \
DECL2(func, Min, /*undefForFloat=*/0) \
DECL2(func, Max, /*undefForFloat=*/0) \
DECL2(func, PreMulSum, /*undefForFloat=*/0) \
DECL2(func, SumPostDiv, /*undefForFloat=*/1)
DECL2(Broadcast, Sum, /*undefForFloat=*/0)
DECL(Reduce)
DECL2(AllGather, Sum, /*undefForFloat=*/0)
DECL(ReduceScatter)
DECL(AllReduce)
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t)
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)();
#if defined(__CUDA_BF16_TYPES_EXIST__)
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)();
#endif
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
#include "nccl.h"
// CHUNKSIZE must be a multiple of SLICESIZE
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@ -123,13 +22,27 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
#define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
// macro will be used in preprocessor conditionals where enums have no meaning.
#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
(((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
(type==7 && red==0) || \
(type==8 && red==0))
inline int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16:
#endif
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
#endif

View File

@ -10,8 +10,10 @@
#include "transport.h"
#include "p2p.h"
#include "collectives.h"
#include "nccl_tuner.h"
#include "proxy.h"
#include "strongstream.h"
#include "nccl_net.h"
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
@ -125,7 +127,7 @@ struct ncclChannel {
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
/* devPeer pointer array used for host side access */
struct ncclDevChannelPeer** devPeersHostPtr;
struct ncclDevChannelPeer** devPeersHostPtr;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
@ -155,6 +157,14 @@ struct ncclPointerList {
void *ptr;
};
struct ncclNvlsMcHandleList {
struct ncclNvlsMcHandleList *next;
CUmemGenericAllocationHandle mcHandle;
CUdeviceptr ptr;
int dev;
size_t size;
};
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
@ -178,6 +188,7 @@ struct ncclKernelPlan {
int collOpCount; // zero based for this plan
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
struct Channel {
int nWork;
@ -191,6 +202,23 @@ struct ncclKernelPlan {
} channels[MAXCHANNELS];
};
struct ncclRegRequest {
uintptr_t buff;
size_t size;
struct ncclRegRequest *next;
};
struct ncclRegRecord {
uintptr_t buff;
size_t size;
CUdeviceptr regAddr;
size_t regSize;
int dev;
CUmemGenericAllocationHandle mcHandle;
uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */
struct ncclRegRecord *next;
};
struct ncclComm {
struct ncclMemoryStack memPermanent, memScoped;
// List of destructors to run when comm is destructed
@ -261,6 +289,7 @@ struct ncclComm {
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
/* This attribute can indicate the states of communicators and return code of
@ -270,7 +299,7 @@ struct ncclComm {
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
volatile uint32_t *abortFlagRefCount;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
@ -310,15 +339,19 @@ struct ncclComm {
// NVLink SHARP (NVLS) support
int nvlsSupport;
int nvlsRegSupport;
/* sharable NVLS resource. */
struct ncclNvlsSharedRes* nvlsResources;
struct ncclShmemCollBuff nvlsShmem;
void *nvlsShmemHandle;
size_t channelSize; // User requested work size (bytes) for channel partitions
ssize_t channelSize; // User requested work size (bytes) for channel partitions
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
struct ncclMemoryPool memPool_ncclPointerList;
struct ncclMemoryPool memPool_ncclNvlsHandleList;
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
@ -346,6 +379,16 @@ struct ncclComm {
bool finalizeCalled;
// shared structures for finalization
int finalizeRankCnt;
// group job to support multi-thread FT
struct ncclGroupJob *groupJob;
/* store to buffer register request */
struct ncclIntruQueue<struct ncclRegRequest, &ncclRegRequest::next> regRequestQueue;
/* store registered buffer */
struct ncclIntruQueue<struct ncclRegRecord, &ncclRegRecord::next> regRecordQueue;
// Tuning plugin
ncclTuner_t* tuner;
};
enum ncclLaunchMode {

View File

@ -30,29 +30,6 @@
ret func(args)
#endif // end PROFAPI
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16:
#endif
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
#include "debug.h"
#include "checks.h"
#include "cudawrap.h"

View File

@ -30,7 +30,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
WARN("Cuda failure %d '%s'", err, errStr); \
return ncclUnhandledCudaError; \
} \
} while(false)
@ -40,7 +40,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
WARN("Cuda failure %d '%s'", err, errStr); \
res = ncclUnhandledCudaError; \
goto label; \
} \
@ -52,7 +52,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \
INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \
} \
} while(false)
@ -79,6 +79,7 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);

View File

@ -4,10 +4,11 @@
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#ifndef NCCL_INT_DEBUG_H_
#define NCCL_INT_DEBUG_H_
#include "nccl_net.h"
#include "nccl.h"
#include "nccl_common.h"
#include <stdio.h>
#include <chrono>
#include <type_traits>

View File

@ -8,31 +8,33 @@
#define NCCL_DEVICE_H_
#include "nccl.h"
#include "nccl_common.h"
#include "align.h"
#include <stdint.h>
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
#include "net_device.h"
enum ncclDevRedOp_t {
ncclDevSum, ncclDevProd, ncclDevMinMax,
ncclDevPreMulSum, ncclDevSumPostDiv,
ncclNumDevRedOps
};
struct ncclDevRedOpFull {
ncclDevRedOp_t op;
ncclRedOp_t proxyOp;
bool scalarArgIsPtr;
uint64_t scalarArg;
};
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
@ -85,6 +87,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
@ -98,6 +101,7 @@ struct ncclConnInfo {
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
ncclNetDeviceHandle_t netDeviceHandle;
};
struct ncclProxyConnector {
@ -105,6 +109,7 @@ struct ncclProxyConnector {
int tpLocalRank;
int sameProcess;
struct ncclProxyConnection* connection;
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary
};
struct ncclConnector {
@ -292,6 +297,7 @@ struct ncclDevComm {
int rank;
int nRanks;
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
// Operation list for aggregation
int workFifoDepth;
@ -370,4 +376,88 @@ __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}
// Host-side table of kernel function pointers.
extern int const ncclDevKernelCount;
extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
// Table of most specialized kernel function to run given func index.
extern int const ncclDevFuncRowToId[];
extern void* const ncclDevKernelForFunc[/*funcIndex*/];
extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
// Launch a one-rank reduction on stream.
ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream);
// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py"
inline bool ncclNvlsSupported(int devRedOp, int type) {
switch (type) {
case ncclInt32:
case ncclUint32:
case ncclInt64:
case ncclUint64:
case ncclFloat16:
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16:
#endif
return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax;
case ncclFloat:
case ncclDouble:
return devRedOp == ncclDevSum;
default:
return false;
}
}
// `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py"
inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) {
#if defined(__CUDA_BF16_TYPES_EXIST__)
constexpr int NumTypes = ncclNumTypes;
#else
constexpr int NumTypes = ncclNumTypes + 1;
#endif
int row = 0; // ncclDevFuncIndex_P2p
if (coll == ncclFuncSendRecv) goto have_row;
row += 1;
if (coll == ncclFuncAllGather) {
int algo1 = algo == NCCL_ALGO_RING ? 0 :
/*algo == NCCL_ALGO_NVLS*/ 1;
row += algo1*NCCL_NUM_PROTOCOLS + proto;
goto have_row;
}
row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS;
if (coll == ncclFuncBroadcast) {
row += proto;
goto have_row;
}
row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS;
if (coll == ncclFuncAllReduce) {
row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto;
goto have_row;
}
row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS;
if (coll == ncclFuncReduce) {
row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto;
goto have_row;
}
row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS;
if (coll == ncclFuncReduceScatter) {
int algo1 = algo == NCCL_ALGO_RING ? 0 :
/*algo == NCCL_ALGO_NVLS*/ 1;
row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto;
goto have_row;
}
row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS;
have_row:
return ncclDevFuncRowToId[row];
}
inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; }
#endif
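
A hedged usage sketch of the lookup path these tables provide; the function name is invented and the returned id depends on the tables generated for this build:

inline void* exampleAllReduceKernel() {
  int id = ncclDevFuncId(ncclFuncAllReduce, ncclDevSum, ncclFloat,
                         NCCL_ALGO_RING, NCCL_PROTO_SIMPLE);
  // ncclNvlsSupported(ncclDevSum, ncclFloat) is true, so NCCL_ALGO_NVLS would
  // be another valid algorithm for the same (op, type) pair.
  return ncclDevKernelForFunc[id];
}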

View File

@ -8,7 +8,7 @@
#define NCCL_GRAPH_H_
#include "nccl.h"
#include "devcomm.h"
#include "device.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@ -112,6 +111,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL);
#endif

View File

@ -14,7 +14,8 @@ ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
void ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommPreconnect(struct ncclComm* comm);
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
void ncclGroupJobAbort();
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob);
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
@ -52,8 +53,9 @@ struct ncclGroupJob {
struct ncclComm **groupCommPreconnectHeadPtr;
ncclResult_t *groupErrorPtr;
volatile bool *abortFlagPtr;
int *groupBlockingPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
bool doneFlag;
bool initialized;
};
ncclResult_t ncclGroupStartInternal();
@ -87,14 +89,6 @@ static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
}
inline ncclResult_t ncclGroupStartInternal() {
/* if previous group launch does not complete, don't launch this one. */
if (ncclGroupJobMainPtr != NULL) {
if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
return ncclInvalidUsage;
} else {
NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr));
}
}
ncclGroupDepth++;
return ncclSuccess;
}

View File

@ -1040,4 +1040,19 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
return qp->context->ops.post_send(qp, wr, bad_wr);
}
struct ibv_ece {
/*
* Unique identifier of the provider vendor on the network.
* The provider will set its IEEE OUI here to distinguish
* itself in a non-homogeneous network.

*/
uint32_t vendor_id;
/*
* Provider specific attributes which are supported or
* needed to be enabled by ECE users.
*/
uint32_t options;
uint32_t comp_mask;
};
#endif // NCCL_IBV_CORE_H_

View File

@ -36,6 +36,8 @@ struct ncclIbvSymbols {
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
};
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */

View File

@ -66,6 +66,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/

View File

@ -8,7 +8,7 @@
#define NCCL_INFO_H_
#include "nccl.h"
#include "devcomm.h"
#include "device.h"
#include "collectives.h"
#include "core.h"
#include "utils.h"
@ -54,6 +54,8 @@ struct ncclInfo {
int nChannels;
int nThreads;
size_t nBytes;
size_t sendbuffSize;
size_t recvbuffSize;
int nstepsPerLoop;
int nchunksPerLoop;
int chunkSize;
@ -67,6 +69,17 @@ inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
info->datatype = ncclInt8;
}
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank
/* compute buffer size for NVLS buffer registration */
if (info->coll == ncclFuncAllGather) {
info->sendbuffSize = info->count * ncclTypeSize(info->datatype);
info->recvbuffSize = info->sendbuffSize * nRanks;
} else if (info->coll == ncclFuncReduceScatter) {
info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
info->sendbuffSize = info->recvbuffSize * nRanks;
} else {
info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
}
return ncclSuccess;
}
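
A quick worked illustration of the two new size fields (values picked only for the arithmetic):

// AllGather, count = 1<<20, datatype = ncclFloat (4 bytes), nRanks = 8:
//   sendbuffSize = count * ncclTypeSize(ncclFloat)  =  4 MiB
//   recvbuffSize = sendbuffSize * nRanks            = 32 MiB
//   nBytes       = count * 4 * nRanks               = 32 MiB   (count is per rank)
// ReduceScatter swaps the two buffer sizes; all other collectives use
// count * ncclTypeSize(datatype) for both.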

View File

@ -30,6 +30,7 @@ struct ncclIpcSocket {
ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);

src/include/nccl_common.h Normal file
View File

@ -0,0 +1,33 @@
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_UNDEF -1
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
#endif
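
A hedged sketch of a callback matching ncclDebugLogger_t, e.g. what a standalone plugin test harness could pass as logFunction; the prefix and filtering policy are arbitrary choices for this illustration:

#include <stdarg.h>
#include <stdio.h>
#include "nccl_common.h"

static void exampleLogger(ncclDebugLogLevel level, unsigned long flags,
                          const char* file, int line, const char* fmt, ...) {
  (void)flags;
  if (level == NCCL_LOG_TRACE) return;   // drop TRACE output in this sketch
  va_list args;
  va_start(args, fmt);
  fprintf(stderr, "[harness] %s:%d ", file, line);
  vfprintf(stderr, fmt, args);
  fprintf(stderr, "\n");
  va_end(args);
}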

View File

@ -8,6 +8,8 @@
#define NCCL_NET_H_
#include "nccl.h"
#include "nccl_common.h"
#include "net_device.h"
#include <stdint.h>
#define NCCL_NET_HANDLE_MAXSIZE 128
@ -17,13 +19,89 @@
#define NCCL_PTR_DMABUF 0x4
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
#define NCCL_NET_MAX_REQUESTS 32
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
typedef ncclNetProperties_v7_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;
typedef ncclNet_v7_t ncclNet_t;
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7
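
A hedged sketch of what an external v7 plugin exports against this struct. Every `plugin*` name is invented, only a few entry points are stubbed, and the remaining members are left NULL, so this is not a working transport:

#include <string.h>
#include "nccl_net.h"

static ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
static ncclResult_t pluginDevices(int* ndev) { *ndev = 1; return ncclSuccess; }
static ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
  memset(props, 0, sizeof(*props));
  props->name = (char*)"example0";
  props->ptrSupport = NCCL_PTR_HOST;            // no GPU Direct in this sketch
  props->speed = 100000;                        // 100 Gbps, in Mbps
  props->maxComms = 65536;
  props->maxRecvs = 1;
  props->netDeviceType = NCCL_NET_DEVICE_HOST;  // no device offload
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  return ncclSuccess;
}

ncclNet_v7_t ncclNetPlugin_v7 = {  // symbol name must match NCCL_NET_PLUGIN_SYMBOL
  .name = "example",
  .init = pluginInit,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties,
  // .listen, .connect, .accept, .regMr, .isend, .irecv, ... must also be provided.
};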
#define NCCL_NET_MAX_REQUESTS_V6 8
// v6 struct for backwards compatibility
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
@ -35,9 +113,7 @@ typedef struct {
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v6_t;
typedef ncclNetProperties_v6_t ncclNetProperties_t;
} ncclNetProperties_v6_t;
typedef struct {
// Name of the network (mainly for logs)
@ -86,10 +162,49 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
typedef ncclNet_v6_t ncclNet_t;
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v7_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
typedef ncclCollNet_v7_t ncclCollNet_t;
// v6 struct for backwards compatibility
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
@ -130,10 +245,6 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v6_t;
typedef ncclCollNet_v6_t ncclCollNet_t;
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
// v5 struct for backwards compatibility
typedef struct {
// Name of the network (mainly for logs)
@ -219,95 +330,4 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v5_t;
// v4 struct for backwards compatibility
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
int speed; // Port speed in Mbps.
int port; // Port number.
int maxComms; // Maximum number of comms we can create
} ncclNetProperties_v4_t;
// v4 struct for backwards compatibility
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
// v4 struct for backwards compatibility
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v4_t;
#endif // end include guard

src/include/nccl_tuner.h Normal file
View File

@ -0,0 +1,55 @@
/*************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_
#include "nccl.h"
#include "nccl_common.h"
// API to be implemented by external tuner
typedef struct {
// Name of the tuner
const char* name;
// Initializes tuner states.
// nRanks: number of ranks in the current communicator. Each communicator initializes its own tuner.
// nNodes: number of nodes in the current communicator.
// logFunction: a log function that can be used to integrate the tuner's logging with NCCL core.
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
// - collType: collective type, e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether CollNet supports this collective type
// - nvlsSupport: whether NVLink SHARP supports this collective type
// - numPipeOps: number of operations in the group
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
ncclResult_t (*destroy)();
} ncclTuner_v1_t;
typedef ncclTuner_v1_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
#endif
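
A hedged sketch of a trivial external tuner against this interface: for large AllReduce it pins RING + SIMPLE and leaves nChannels to NCCL; everything else falls through to NCCL's defaults by leaving the outputs untouched. The threshold and names are assumptions made for this illustration:

#include "nccl_tuner.h"

static ncclResult_t tunerInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
  return ncclSuccess;
}

static ncclResult_t tunerGetCollInfo(ncclFunc_t collType, size_t nBytes,
                                     int collNetSupport, int nvlsSupport, int numPipeOps,
                                     int* algorithm, int* protocol, int* nChannels) {
  if (collType == ncclFuncAllReduce && nBytes > (64UL << 20)) {
    *algorithm = NCCL_ALGO_RING;     // set algorithm and protocol together,
    *protocol  = NCCL_PROTO_SIMPLE;  // never only one of the two
  }
  return ncclSuccess;
}

static ncclResult_t tunerDestroy() { return ncclSuccess; }

// NCCL resolves this symbol by the string in NCCL_TUNER_PLUGIN_SYMBOL.
ncclTuner_v1_t ncclTunerPlugin_v1 = {
  .name = "example-tuner",
  .init = tunerInit,
  .getCollInfo = tunerGetCollInfo,
  .destroy = tunerDestroy,
};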

src/include/net_device.h Normal file
View File

@ -0,0 +1,29 @@
/*************************************************************************
* Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NET_DEVICE_H_
#define NCCL_NET_DEVICE_H_
#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
#define NCCL_NET_MTU_SIZE 4096
// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
typedef struct {
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
void* handle;
size_t size;
int needsProxyProgress;
} ncclNetDeviceHandle_v7_t;
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
#endif
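
A hedged sketch of how an offload-capable v7 plugin might fill the handle it hands back from connect()/accept(); the devState buffer and helper name are assumptions, only the ncclNetDeviceHandle_v7_t fields come from this header:

#include "nccl_net.h"

static ncclResult_t exportDeviceHandle(void* devState, size_t devStateSize,
                                       ncclNetDeviceHandle_v7_t* h) {
  h->netDeviceType = NCCL_NET_DEVICE_UNPACK;             // offload flavor
  h->netDeviceVersion = NCCL_NET_DEVICE_UNPACK_VERSION;  // must match NCCL's expectation
  h->handle = devState;           // opaque state consumed by the matching device code
  h->size = devStateSize;
  h->needsProxyProgress = 1;      // host proxy still drives progress
  return ncclSuccess;
}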

View File

@ -160,7 +160,12 @@ typedef union nvmlValue_st
#define NVML_FI_DEV_NVLINK_GET_SPEED 164
#define NVML_FI_DEV_NVLINK_GET_STATE 165
#define NVML_FI_DEV_NVLINK_GET_VERSION 166
#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device
#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE
#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links
#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above
/**
* Information for a Field Value Sample

View File

@ -12,7 +12,7 @@
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
typedef struct {
int data; // Currently only support an fd based descriptor
uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
} ncclCuDesc;
typedef union {

View File

@ -12,6 +12,7 @@
const char* userHomeDir();
void setEnvFile(const char* fileName);
void initEnv();
const char *ncclGetEnv(const char *name);
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);

View File

@ -7,10 +7,11 @@
#ifndef NCCL_PROXY_H_
#define NCCL_PROXY_H_
#include "devcomm.h"
#include "device.h"
#include "info.h"
#include "socket.h"
#include "ipcsocket.h"
#include "nccl_net.h"
#include <pthread.h>
#include "shm.h"
#include "p2p.h"
@ -65,6 +66,8 @@ struct ncclProxySubArgs {
uint64_t end;
void* requests[NCCL_STEPS];
void* profilingEvents[NCCL_STEPS];
void* recvRequestsCache[NCCL_STEPS];
int recvRequestsSubCount;
};
struct ncclProxyArgs {
@ -146,7 +149,7 @@ struct ncclProxyProgressState {
char opsPoolShmSuffix[6];
pthread_t thread;
bool stop;
volatile int stop;
struct ncclProxyPeer** localPeers;
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
struct ncclProxyArgs* active;
@ -157,11 +160,12 @@ struct ncclProxyProgressState {
// Expected proxy response fifo
struct ncclExpectedProxyResponse {
void* opId;
int respSize;
bool done;
void* respBuff;
struct ncclExpectedProxyResponse* next;
void* opId;
int respSize;
bool done;
void* respBuff;
ncclResult_t res;
struct ncclExpectedProxyResponse* next;
};
struct ncclProxyAsyncOp {
@ -181,7 +185,16 @@ struct ncclProxyLocalPeer {
int asyncOpCounter;
};
// Common response header for all proxyOps
// We pack this into a struct to reduce the number of blocking send and recv calls
struct ncclProxyRpcResponseHeader {
void* opId;
ncclResult_t res;
int respSize;
};
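
Hedged sketch (not the actual proxy code) of the wire pattern this header enables: a response written as the fixed header followed by the payload needs at most two blocking sends, and the client symmetrically needs at most two blocking receives. ncclSocketSend() from socket.h is assumed; the function name is invented:

static ncclResult_t sendRpcResponse(struct ncclSocket* sock, void* opId,
                                    ncclResult_t res, void* respBuff, int respSize) {
  struct ncclProxyRpcResponseHeader hdr = { opId, res, respSize };
  ncclResult_t ret = ncclSocketSend(sock, &hdr, sizeof(hdr));
  if (ret == ncclSuccess && respSize > 0) ret = ncclSocketSend(sock, respBuff, respSize);
  return ret;
}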
struct ncclProxyState {
int internalRefCount;
int refCount;
int tpRank;
int tpnRanks;
@ -196,11 +209,13 @@ struct ncclProxyState {
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
volatile uint32_t* abortFlag;
volatile uint32_t* abortFlagRefCount;
// Service thread
pthread_t thread;
struct ncclSocket* listenSock;
int stop;
volatile int stop;
CUcontext cudaCtx;
ncclResult_t asyncResult;
// Used by main thread
union ncclSocketAddress* peerAddresses;
@ -233,8 +248,11 @@ struct ncclProxyConnection {
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
void* transportResources;
ncclNetDeviceHandle_t* netDeviceHandle;
void* mhandles[NCCL_NUM_PROTOCOLS];
proxyConnectState state;
struct ncclCollNetSharedRes* collNet;
int needsProxyProgress;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@ -260,7 +278,7 @@ enum ncclProxyMsgType {
ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
@ -272,9 +290,10 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd);
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState);
ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState);
#endif

View File

@ -14,4 +14,12 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
struct ncclShmemCollBuff {
volatile size_t *cnt[2];
volatile void *ptr[2];
int round;
};
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
#endif

View File

@ -7,7 +7,7 @@
#ifndef NCCL_TRANSPORT_H_
#define NCCL_TRANSPORT_H_
#include "devcomm.h"
#include "device.h"
#include "graph.h"
#include "nvmlwrap.h"
#include "core.h"
@ -65,6 +65,7 @@ struct ncclNvlsSharedRes {
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
char shareableHandle[NVLS_HANDLE_SIZE];
size_t ucGran;
int nChannels;
};
@ -102,8 +103,20 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
#define USE_POSIX_FD 1
#if USE_POSIX_FD
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
#else
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
#endif
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 };

src/include/tuner.h Normal file
View File

@ -0,0 +1,22 @@
/*************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INT_TUNER_H_
#define NCCL_INT_TUNER_H_
#include "nccl_tuner.h"
// Tuning plugin to override NCCL's default algorithm/protocol tuning.
// Attempts to load the NCCL tuner from an environment variable.
// Returns ncclSuccess if the correct tuner symbol has been found and
// successfully loaded. Otherwise returns an error and also logs the error.
ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner);
// Cleans up NCCL tuner plugin.
ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner);
#endif
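
A hedged sketch of the call sequence this header implies (load at init, query per collective, close at teardown). The wrapper function and its parameters are invented for illustration; the real call sites live in the core:

#include "tuner.h"

static void exampleTunerUse(size_t nRanks, size_t nNodes, ncclDebugLogger_t log,
                            size_t nBytes) {
  ncclTuner_t* tuner = NULL;
  if (ncclLoadTunerPlugin(&tuner) != ncclSuccess || tuner == NULL) return;
  if (tuner->init(nRanks, nNodes, log) == ncclSuccess) {
    int algorithm = NCCL_ALGO_UNDEF, protocol = NCCL_PROTO_UNDEF, nChannels = 0;
    // Outputs left unset are filled in by NCCL's default tuning afterwards.
    (void)tuner->getCollInfo(ncclFuncAllReduce, nBytes, /*collNetSupport=*/0,
                             /*nvlsSupport=*/0, /*numPipeOps=*/1,
                             &algorithm, &protocol, &nChannels);
    (void)tuner->destroy();
  }
  ncclCloseTunerPlugin(&tuner);
}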

View File

@ -13,6 +13,7 @@
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <algorithm>
#include <new>
int ncclCudaCompCap();
@ -259,11 +260,6 @@ struct ncclMemoryPool {
struct Cell {
Cell *next;
};
template<int Size, int Align>
union CellSized {
Cell cell;
alignas(Align) char space[Size];
};
struct Cell* head;
struct Cell* tail; // meaningful only when head != nullptr
};
@ -275,14 +271,15 @@ inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
template<typename T>
inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
using Cell = ncclMemoryPool::Cell;
using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
Cell* cell;
if (__builtin_expect(me->head != nullptr, true)) {
cell = me->head;
me->head = cell->next;
} else {
// Use the internal allocate() since it doesn't memset to 0 yet.
cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
size_t cellSize = std::max(sizeof(Cell), sizeof(T));
size_t cellAlign = std::max(alignof(Cell), alignof(T));
cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign);
}
memset(cell, 0, sizeof(T));
return reinterpret_cast<T*>(cell);
@ -349,6 +346,32 @@ inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
return ans;
}
template<typename T, T *T::*next>
inline bool ncclIntruQueueDelete(ncclIntruQueue<T,next> *me, T *x) {
T *prev = nullptr;
T *cur = me->head;
bool found = false;
while (cur) {
if (cur == x) {
found = true;
break;
}
prev = cur;
cur = cur->*next;
}
if (found) {
if (prev == nullptr)
me->head = cur->*next;
else
prev->*next = cur->*next;
if (cur == me->tail)
me->tail = prev;
}
return found;
}
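
A hedged usage sketch of the new delete helper against the registration-record queue added to struct ncclComm earlier in this diff; the helper name is invented and comm.h/stdlib.h are assumed to be included:

static void dropRegRecord(struct ncclComm* comm, struct ncclRegRecord* rec) {
  if (ncclIntruQueueDelete(&comm->regRecordQueue, rec)) {
    free(rec->addrs);
    free(rec);
  }
}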
template<typename T, T *T::*next>
inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
T *ans = me->head;

View File

@ -16,6 +16,7 @@
#include "enqueue.h"
#include "graph.h"
#include "argcheck.h"
#include "tuner.h"
#include <fcntl.h>
#include <string.h>
#include <errno.h>
@ -24,6 +25,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "param.h"
#define STR2(v) #v
#define STR(v) STR2(v)
@ -177,7 +179,13 @@ static ncclResult_t commFree(ncclComm_t comm) {
* free all intra-process communicators; therefore, we only need to focus on local
* resource cleanup in commFree(). */
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
pthread_join(comm->proxyState->thread, nullptr);
if (*comm->abortFlag == 0) {
/* regular thread join */
pthread_join(comm->proxyState->thread, nullptr);
} else {
/* try to detach thread due to abort */
ncclProxyTryDetach(comm->proxyState);
}
}
delete[] comm->userRedOps;
@ -211,7 +219,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->sharedRes->tpRankToLocalRank);
NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
NCCLCHECK(ncclProxyDestroy(comm));
NCCLCHECK(ncclProxyDestroy(comm->sharedRes->proxyState));
free(comm->sharedRes);
}
}
@ -229,13 +237,25 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) {
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
free(comm->abortFlagRefCount);
free((void*)comm->abortFlagRefCount);
}
free((void*)comm->config.netName);
free(comm->topParentRanks);
free(comm->topParentLocalRanks);
while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) {
struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue);
NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
free(rec->addrs);
free(rec);
}
while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) {
struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue);
free(req);
}
commPoison(comm); // poison comm before free to avoid comm reuse.
free(comm);
@ -275,7 +295,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
ncclResult_t ret = ncclSuccess;
if (*comm->abortFlag) {
ncclGroupJobAbort();
ncclGroupJobAbort(comm->groupJob);
} else {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
if (ret != ncclSuccess) {
@ -284,6 +304,11 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
if (ret == ncclInProgress) ret = ncclInvalidArgument;
goto exit;
}
/* if there is linked group job, we should complete it. */
if (comm->groupJob) {
NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
comm->groupJob = NULL;
}
}
exit:
@ -338,6 +363,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList);
ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList);
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@ -373,6 +399,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
comm->topParentRanks[i] = i;
}
ncclIntruQueueConstruct(&comm->regRequestQueue);
ncclIntruQueueConstruct(&comm->regRecordQueue);
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
return ncclSuccess;
}
@ -393,6 +421,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
}
tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize;
tmpCommAndChans.comm.channels = &devCommAndChans->channels[0];
comm->workFifoDepth = ncclParamWorkFifoDepth();
@ -500,7 +529,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
@ -516,8 +544,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
}
@ -525,6 +551,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
// Make sure P2P chunksize is not larger than coll chunksize.
if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
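// Editor's note (illustration only): with the default 4 MiB simple buffer
// (DEFAULT_BUFFSIZE above) and assuming NCCL_STEPS is 8 in this build, this
// caps p2pChunkSize at 4 MiB / 8 = 512 KiB.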
if (comm->sharedRes->owner != comm) {
/* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
@ -606,7 +636,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
if (share) {
if (myinfo->isMaster) {
comm->collNetSharedRes = parent->collNetSharedRes;
comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels);
comm->collNetChannels = std::min(comm->nChannels, parent->collNetSharedRes->nChannels);
for (int c = 0; c < comm->collNetChannels; ++c)
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail);
}
@ -625,8 +655,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
} else {
/* this allocated buffer will be freed on proxy side */
NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
/* TODO: min or max? */
comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels);
comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
for (int c = 0; c < comm->collNetChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
@ -804,6 +833,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap);
for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap);
comm->nvlsRegSupport = 1;
for (int i = 0; i < nranks; i++) {
if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
&& (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
@ -816,6 +847,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
comm->intraNext = comm->peerInfo[i].comm;
}
}
if (comm->nvlsRegSupport) {
for (int j = i + 1; j < nranks; j++) {
if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
comm->nvlsRegSupport = 0;
break;
}
}
}
}
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
@ -859,7 +900,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Determine local CollNet support
if (collNetSupport(comm)) {
char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
if (collNetEnable != NULL) {
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
if (strcmp(collNetEnable, "1") == 0) {
@ -872,22 +913,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECK(ncclNvlsInit(comm));
// Get rings and trees
memset(&ringGraph, 0, sizeof(struct ncclTopoGraph));
ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
ringGraph.collNet = 0;
ringGraph.minChannels = 1;
ringGraph.maxChannels = MAXCHANNELS/2;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail);
memset(&treeGraph, 0, sizeof(struct ncclTopoGraph));
treeGraph.id = 1;
treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
treeGraph.collNet = 0;
treeGraph.minChannels = ringGraph.nChannels;
treeGraph.maxChannels = ringGraph.nChannels;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail);
memset(&collNetGraph, 0, sizeof(struct ncclTopoGraph));
collNetGraph.id = 2;
collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
collNetGraph.collNet = 1;
@ -895,20 +937,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
if (comm->collNetSupport) {
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
} else {
collNetGraph.nChannels = 0;
}
memset(&nvlsGraph, 0, sizeof(struct ncclTopoGraph));
nvlsGraph.id = 3;
nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
nvlsGraph.collNet = 0;
nvlsGraph.minChannels = 1;
nvlsGraph.maxChannels = MAXCHANNELS;
if (comm->nvlsSupport) {
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
} else {
nvlsGraph.nChannels = 0;
}
// Initialize num P2P LL buffers for this communicator
@ -1136,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
int sendNode = (node+delta)%nNodes;
for (int step=0; step < steps; step++) {
int recvIndex = (localRank-step+steps)%steps;
int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
tasks->p2pRecvOrder[i] = recvRank;
int sendIndex = (localRank+step)%steps;
int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
@ -1197,7 +1235,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
char* str = getenv("NCCL_LAUNCH_MODE");
const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
enum ncclLaunchMode mode, modeOld;
if (str && strcasecmp(str, "GROUP") == 0) {
mode = ncclLaunchModeGroup;
@ -1357,6 +1395,11 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
if (comm->tuner) {
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog));
}
// update communicator state
comm->initState = ncclSuccess;
@ -1425,7 +1468,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
comm->config.maxCTAs = maxCTAsEnv;
}
envNetName = getenv("NCCL_NET");
envNetName = ncclGetEnv("NCCL_NET");
if (envNetName)
tmpNetName = envNetName;
if (tmpNetName != NULL) {
@ -1560,7 +1603,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
ncclResult_t res = ncclSuccess;
ncclComm_t comm = NULL;
struct ncclCommInitRankAsyncJob *job = NULL;
char* env = getenv("NCCL_COMM_ID");
const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env && myrank == 0) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
@ -1602,7 +1645,7 @@ exit:
fail:
if (comm) {
if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag);
if (comm->abortFlagRefCount) free(comm->abortFlagRefCount);
if (comm->abortFlagRefCount) free((void*)comm->abortFlagRefCount);
free(comm);
}
if (newcomm) *newcomm = NULL;
@ -1777,6 +1820,11 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
CUDACHECK(cudaSetDevice(commDevice));
}
if (comm->tuner != NULL) {
NCCLCHECK(comm->tuner->destroy());
NCCLCHECK(ncclCloseTunerPlugin(&comm->tuner));
}
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice) {
@ -1991,6 +2039,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
NCCLCHECK(ncclGroupStartInternal());
NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
/* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
*newcomm = NCCL_COMM_NULL;
@ -2037,7 +2086,7 @@ fail:
if (childComm) {
if (comm && !comm->config.splitShare) {
if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag);
if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount);
if (childComm->abortFlagRefCount) free((void*)childComm->abortFlagRefCount);
}
free(childComm);
}
@ -2074,6 +2123,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
*asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
return ncclSuccess;
}
@ -2116,3 +2166,208 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
*rank = comm->rank;
return ncclSuccess;
}
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret = ncclSuccess;
#if CUDART_VERSION >= 12010
size_t granularity;
if (ncclParamLocalRegister()) {
if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) {
WARN("Invalid arguments comm %p, buff %p, size %ld, handle %p", comm, buff, size, handle);
ret = ncclInvalidArgument;
} else if (comm->nvlsSupport) {
CUmulticastObjectProp prop = comm->nvlsResources->properties;
prop.size = size;
CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
if ((uintptr_t)buff % comm->nvlsResources->ucGran == 0 && size % granularity == 0) {
/* we can directly register what the user provided */
struct ncclRegRequest* req;
NCCLCHECK(ncclCalloc(&req, 1));
req->buff = (uintptr_t)buff;
req->size = size;
ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
*handle = (void*)req;
} else {
void* base;
size_t baseSize;
/* ncclMemAlloc does not report the actually allocated buffer size back to the
* user, so we need to query the full range of the buffer with
* cuMemGetAddressRange in order to register it. */
CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff));
if ((uintptr_t)base % comm->nvlsResources->ucGran == 0 && baseSize % granularity == 0) {
struct ncclRegRequest* req;
NCCLCHECK(ncclCalloc(&req, 1));
req->buff = (uintptr_t)base;
req->size = baseSize;
ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
*handle = (void*)req;
} else {
WARN("register fails, buffer %p (aligned %s, granularity %ld) and size %ld (aligned %s, granularity %ld) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? "TRUE" : "FALSE", granularity);
ret = ncclInvalidArgument;
}
}
}
}
#endif
return ret;
}
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
ncclResult_t ret = ncclSuccess;
#if CUDART_VERSION >= 12010
struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle;
if (ncclParamLocalRegister()) {
if (comm == NCCL_COMM_NULL || handle == NULL) {
WARN("Invalid arguments comm %p, handle %p", comm, handle);
ret = ncclInvalidArgument;
} else {
struct ncclRegRecord* rec;
/* first release register record */
rec = ncclIntruQueueHead(&comm->regRecordQueue);
while (rec) {
if (rec->buff == dreq->buff && rec->size == dreq->size) {
NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
ncclIntruQueueDelete(&comm->regRecordQueue, rec);
free(rec->addrs);
free(rec);
break;
}
rec = rec->next;
}
/* then free register request */
if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) {
WARN("Invalid handle %p", handle);
ret = ncclInvalidArgument;
}
}
}
#endif
return ret;
}
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret = ncclSuccess;
#if CUDART_VERSION >= 12010
size_t memGran = 0;
size_t mcGran = 0;
CUdevice currentDev;
CUmemAllocationProp memprop = {};
CUmulticastObjectProp mcprop = {};
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
int cudaDev;
int flag = 0;
int dcnt;
int mcSupport = 0;
if (ptr == NULL || size == 0) goto fallback;
if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
if (CUPFN(cuMulticastCreate) != NULL)
CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
if (mcSupport) {
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
memprop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
memprop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
/* mc property */
CUDACHECK(cudaGetDeviceCount(&dcnt));
mcprop.size = size;
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
mcprop.numDevices = dcnt;
mcprop.handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
mcprop.flags = 0;
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
/* only size needs to be aligned to mcGran */
ALIGN_SIZE(size, mcGran);
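/* Editor's note (illustration only): only the requested size is rounded up
 * here; e.g. if the recommended multicast granularity were 2 MiB, a 3 MiB
 * request would be padded to 4 MiB before cuMemCreate below. */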
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
/* Now allow RW access to the newly mapped memory */
for (int i = 0; i < dcnt; ++i) {
int p2p = 0;
if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) {
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = i;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
}
}
goto exit;
}
fallback:
#endif
CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
exit:
return ret;
fail:
goto exit;
}
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
ncclResult_t ncclMemFree(void *ptr) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret = ncclSuccess;
int saveDevice;
CUDACHECK(cudaGetDevice(&saveDevice));
#if CUDART_VERSION >= 12010
CUdevice ptrDev = 0;
int mcSupport = 0;
if (ptr == NULL) goto fallback;
if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
if (CUPFN(cuMulticastCreate) != NULL)
CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail);
CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail);
if (mcSupport) {
NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
goto exit;
}
fallback:
#endif
CUDACHECKGOTO(cudaFree(ptr), ret, fail);
exit:
cudaSetDevice(saveDevice);
return ret;
fail:
goto exit;
}
View File
@ -12,7 +12,7 @@
#include <dlfcn.h>
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
static int ncclCuMemSupported = 0;
@ -43,7 +43,9 @@ error:
}
int ncclCuMemEnable() {
return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable());
// NCCL_CUMEM_ENABLE=-2 means auto-detect CUMEM support
int param = ncclParamCuMemEnable();
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
}
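// Editor's note (illustration only): with the new default of -2, the env var
// now behaves as follows:
//   NCCL_CUMEM_ENABLE=0  -> cuMem path forced off
//   NCCL_CUMEM_ENABLE=1  -> cuMem path forced on
//   unset (param == -2)  -> follow the ncclCuMemSupported runtime probe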
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
@ -74,6 +76,8 @@ DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
/* ncclMemAlloc/Free */
DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000);
#if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
@ -137,6 +141,8 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
/* ncclMemAlloc/Free */
LOAD_SYM(cuPointerGetAttribute, 4000, 1);
#if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
#endif
@ -158,7 +164,7 @@ static ncclResult_t initResult;
static void initOnceFunc() {
do {
char* val = getenv("CUDA_LAUNCH_BLOCKING");
const char* val = ncclGetEnv("CUDA_LAUNCH_BLOCKING");
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);
@ -167,7 +173,7 @@ static void initOnceFunc() {
* Load CUDA driver library
*/
char path[1024];
char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
const char *ncclCudaPath = ncclGetEnv("NCCL_CUDA_PATH");
if (ncclCudaPath == NULL)
snprintf(path, 1024, "%s", "libcuda.so");
else
View File
@ -50,6 +50,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp);
ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init);
ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str);
ASSIGN_SYM(ibvSymbols, ibv_query_ece, ibv_internal_query_ece);
ASSIGN_SYM(ibvSymbols, ibv_set_ece, ibv_internal_set_ece);
ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr;
ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port;
@ -123,6 +126,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init);
LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str);
LOAD_SYM_VERSION(ibvhandle, "ibv_query_ece", ibvSymbols->ibv_internal_query_ece, "IBVERBS_1.10");
LOAD_SYM_VERSION(ibvhandle, "ibv_set_ece", ibvSymbols->ibv_internal_set_ece, "IBVERBS_1.10");
return ncclSuccess;
teardown:
@ -150,6 +156,8 @@ teardown:
ibvSymbols->ibv_internal_destroy_qp = NULL;
ibvSymbols->ibv_internal_fork_init = NULL;
ibvSymbols->ibv_internal_event_type_str = NULL;
ibvSymbols->ibv_internal_query_ece = NULL;
ibvSymbols->ibv_internal_set_ece = NULL;
if (ibvhandle != NULL) dlclose(ibvhandle);
return ncclSystemError;
View File
@ -45,11 +45,30 @@ ncclResult_t wrap_ibv_symbols(void) {
} \
return ncclSuccess;
#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(container, internal_name, call, success_retval, name, supported) \
if (container.internal_name == NULL) { \
INFO(NCCL_NET, "Call to " name " skipped, internal_name doesn't exist"); \
*supported = 0; \
return ncclSuccess; \
} \
int ret = container.call; \
if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
*supported = 0; \
return ncclSuccess; \
} else if (ret != success_retval) { \
WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
*supported = 1; \
return ncclSystemError; \
} \
*supported = 1; \
return ncclSuccess;
#define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
int ret = container.call; \
if (ret != success_retval) { \
WARN("Call to " name " failed with error %s", strerror(ret)); \
WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
return ncclSystemError; \
} \
return ncclSuccess;
@ -187,6 +206,14 @@ ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
}
ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_query_ece, ibv_internal_query_ece(qp, ece), 0, "ibv_query_ece", supported);
}
ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_set_ece, ibv_internal_set_ece(qp, ece), 0, "ibv_set_ece", supported);
}
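/* Editor's sketch (illustration only): how a transport might consume the new
 * optional ECE wrappers. `qp` is assumed to be an already-created ibv_qp and
 * exampleExchangeEce is a hypothetical helper, not part of this diff. */
static ncclResult_t exampleExchangeEce(struct ibv_qp* qp) {
  struct ibv_ece ece = {};
  int supported = 0;
  // `supported` is cleared when the verbs library does not expose
  // ibv_query_ece, in which case ECE negotiation is silently skipped.
  if (wrap_ibv_query_ece(qp, &ece, &supported) != ncclSuccess) return ncclSystemError;
  if (supported) {
    // ... exchange `ece` with the remote peer out of band, then apply it ...
    if (wrap_ibv_set_ece(qp, &ece, &supported) != ncclSuccess) return ncclSystemError;
  }
  return ncclSuccess;
}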
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
*ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
return ncclSuccess;
View File
@ -30,7 +30,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
handle->fd = -1;
handle->socketName[0] = '\0';
if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %d", errno);
WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno);
return ncclSystemError;
}
@ -54,7 +54,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %d", temp, errno);
WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno);
close(fd);
return ncclSystemError;
}
@ -73,6 +73,15 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
return ncclSuccess;
}
ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) {
if (handle == NULL) {
WARN("ncclSocketGetFd: pass NULL socket");
return ncclInvalidArgument;
}
if (fd) *fd = handle->fd;
return ncclSuccess;
}
ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
if (handle == NULL) {
return ncclInternalError;
@ -90,7 +99,7 @@ ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
return ncclSuccess;
}
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
@ -107,8 +116,13 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
if (hdr == NULL) {
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
} else {
iov[0].iov_base = hdr;
iov[0].iov_len = hdrLen;
}
msg.msg_iov = iov;
msg.msg_iovlen = 1;
@ -121,25 +135,30 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
if (recvFd != NULL) {
if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return ncclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return ncclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return ncclSystemError;
TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
}
TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg;
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd);
}
ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
char temp[NCCL_IPC_SOCKNAME_LEN];
@ -149,6 +168,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
} control_un;
struct cmsghdr *cmptr;
char dummy_buffer[1];
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
@ -162,35 +182,43 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
}
(void) strncpy(cliaddr.sun_path, temp, len);
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
if (sendFd != -1) {
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
}
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
iov[0].iov_base = (void *)"";
iov[0].iov_len = 1;
if (hdr == NULL) {
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
} else {
iov[0].iov_base = hdr;
iov[0].iov_len = hdrLen;
}
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
@ -198,3 +226,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
return ncclSuccess;
}
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash);
}
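/* Editor's sketch (illustration only): passing a file descriptor together
 * with a small header over the IPC socket using the new *Msg variants.
 * exampleIpcHdr and the two helpers are hypothetical; the header contents
 * and peer rank/hash are assumed to come from the caller. */
struct exampleIpcHdr { int op; uint64_t id; };

static ncclResult_t exampleSendFdWithHdr(ncclIpcSocket* sock, int fd, int peerRank, uint64_t peerHash) {
  struct exampleIpcHdr hdr;
  hdr.op = 1; hdr.id = 42;  // hypothetical header contents
  // The header travels in the iovec payload; the fd rides in the SCM_RIGHTS
  // control message (which is skipped entirely when sendFd == -1).
  return ncclIpcSocketSendMsg(sock, &hdr, (int)sizeof(hdr), fd, peerRank, peerHash);
}

static ncclResult_t exampleRecvFdWithHdr(ncclIpcSocket* sock, int* fd) {
  struct exampleIpcHdr hdr;
  // Receives the same fixed-size header and (optionally) an attached fd.
  return ncclIpcSocketRecvMsg(sock, &hdr, (int)sizeof(hdr), fd);
}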
View File
@ -63,7 +63,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&mutex);
if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
char* str = getenv(env);
const char* str = ncclGetEnv(env);
int64_t value = deftVal;
if (str && strlen(str) > 0) {
errno = 0;
@ -79,3 +79,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
}
pthread_mutex_unlock(&mutex);
}
const char *ncclGetEnv(const char *name) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnv);
return getenv(name);
}
View File
@ -61,7 +61,7 @@ void ncclProfilingDump() {
static int dumpDone = 0;
if (dumpDone) return;
dumpDone = 1;
const char* str = getenv("NCCL_PROXY_PROFILE");
const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
if (!str) { free(profilingEvents); return; }
FILE* f = fopen(str, "w");
fprintf(f, "[\n");
View File
@ -5,6 +5,7 @@
************************************************************************/
#include "shm.h"
#include "comm.h"
#include "checks.h"
#include <sys/types.h>
#include <sys/mman.h>
@ -67,7 +68,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
}
if (ftruncate(fd, realShmSize) != 0) {
if (fallocate(fd, 0, 0, realShmSize) != 0) {
WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
ret = ncclSystemError;
goto fail;
@ -162,3 +163,37 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
}
return ret;
}
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
ncclResult_t ret = ncclSuccess;
int curRound = shmem->round;
size_t mycnt;
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL) {
ret = ncclInvalidArgument;
goto exit;
}
memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
/* sync among local ranks */
mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
if (mycnt == comm->localRanks) {
*shmem->cnt[curRound ^ 1] = 0; /* prepare next round */
__atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */
} else {
uint64_t t0 = clockNano();
while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
if (clockNano() - t0 >= 5 * 1000) sched_yield();
if (*comm->abortFlag == 1) {
ret = ncclInternalError;
goto exit;
}
}
}
memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize);
shmem->round ^= 1;
exit:
return ret;
}
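/* Editor's sketch (illustration only): typical use of the shared-memory
 * allgather above. Each local rank contributes one fixed-size element and
 * reads back comm->localRanks elements; the internal round index toggles so
 * back-to-back calls reuse the two counter slots. `shmem` is assumed to be an
 * ncclShmemCollBuff already mapped by all local ranks, and the helper itself
 * is hypothetical. */
static ncclResult_t exampleShmAllgather(struct ncclComm* comm, struct ncclShmemCollBuff* shmem) {
  int myValue = comm->localRank;  // each local rank contributes one int
  int all[64];                    // assumes at most 64 local ranks for this sketch
  if (comm->localRanks > 64) return ncclInvalidArgument;
  NCCLCHECK(ncclShmemAllgather(comm, shmem, &myValue, all, sizeof(int)));
  // all[r] now holds the value contributed by local rank r.
  return ncclSuccess;
}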
View File
@ -11,6 +11,7 @@
#include <unistd.h>
#include <ifaddrs.h>
#include <net/if.h>
#include "param.h"
static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
int bytes = 0;
@ -84,7 +85,7 @@ static uint16_t socketToPort(union ncclSocketAddress *addr) {
/* Allow the user to force the IPv4/IPv6 interface selection */
static int envSocketFamily(void) {
int family = -1; // Family selection is not forced, will use first one found
char* env = getenv("NCCL_SOCKET_FAMILY");
const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY");
if (env == NULL)
return family;
@ -325,7 +326,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
// Allow user to force the INET socket family selection
int sock_family = envSocketFamily();
// User specified interface
char* env = getenv("NCCL_SOCKET_IFNAME");
const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME");
if (env && strlen(env) > 1) {
INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
// Specified by user : find or fail
@ -337,10 +338,10 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// else see if we can get some hint from COMM ID
if (nIfs == 0) {
char* commId = getenv("NCCL_COMM_ID");
const char* commId = ncclGetEnv("NCCL_COMM_ID");
if (commId && strlen(commId) > 1) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
// Try to find interface that is in the same subnet as the IP in comm id
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
// Try to find interface that is in the same subnet as the IP in comm id
union ncclSocketAddress idAddr;
ncclSocketGetAddrFromString(&idAddr, commId);
nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
src/misc/tuner.cc (new file, 82 lines)
View File
@ -0,0 +1,82 @@
/*************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
*
* See LICENSE.txt for license information
************************************************************************/
#include <dlfcn.h>
#include <errno.h>
#include <stdlib.h>
#include "debug.h"
#include "nccl_tuner.h"
pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
static int tunerPluginRefCount = -1;
static void* tunerPluginLib = nullptr;
ncclTuner_t* tunerSymbol = nullptr;
ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
// Initialize to nullptr by default in case the tuner plugin cannot be loaded.
*tuner = nullptr;
if (tunerPluginRefCount == -2) return ncclSuccess;
pthread_mutex_lock(&tunerPluginLock);
if (tunerPluginRefCount == -1) {
tunerPluginRefCount = -2; // Default: no plugin, don't try again later
const char* name = getenv("NCCL_TUNER_PLUGIN");
if (name) {
INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
}
if (tunerPluginLib == nullptr) {
// dlopen does not guarantee to set errno, but dlerror only gives us a
// string, so checking errno doesn't hurt and may let us provide a better
// error message
if (errno == ENOENT) {
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
} else {
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
}
} else {
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
if (tunerSymbol == nullptr) {
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
dlclose(tunerPluginLib);
tunerPluginLib = nullptr;
} else {
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
tunerPluginRefCount = 0;
}
}
}
if (tunerPluginRefCount >= 0) {
*tuner = tunerSymbol;
INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name);
tunerPluginRefCount++;
}
pthread_mutex_unlock(&tunerPluginLock);
return ncclSuccess;
}
ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) {
if (*tuner == nullptr) return ncclSuccess;
pthread_mutex_lock(&tunerPluginLock);
if (--tunerPluginRefCount == 0) {
if (tunerPluginLib == nullptr) {
WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n");
} else {
INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name);
dlclose(tunerPluginLib);
}
tunerPluginLib = nullptr;
tunerSymbol = nullptr;
*tuner = nullptr;
tunerPluginRefCount = -1;
}
pthread_mutex_unlock(&tunerPluginLock);
return ncclSuccess;
}
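/* Editor's sketch (illustration only): the load/use/close lifecycle that
 * init.cc follows with the functions above. The tuner init()/destroy()
 * prototypes are inferred from the call sites in init.cc, the other
 * ncclTuner_t entry points are omitted, and the helper is hypothetical. */
static ncclResult_t exampleTunerLifecycle(int nRanks, int nNodes) {
  ncclTuner_t* tuner = nullptr;
  if (ncclLoadTunerPlugin(&tuner) != ncclSuccess) return ncclInternalError;
  if (tuner != nullptr) {  // nullptr means: no plugin found, use default tuning
    if (tuner->init(nRanks, nNodes, ncclDebugLog) != ncclSuccess) return ncclInternalError;
    // ... the tuner is consulted while selecting algorithms/protocols ...
    if (tuner->destroy() != ncclSuccess) return ncclInternalError;
    if (ncclCloseTunerPlugin(&tuner) != ncclSuccess) return ncclInternalError;
  }
  return ncclSuccess;
}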
View File
@ -85,13 +85,13 @@ uint64_t getHash(const char* string, int n) {
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
uint64_t getHostHash(void) {
char hostHash[1024];
char *hostId;
const char *hostId;
// Fall back is the full hostname if something fails
(void) getHostName(hostHash, sizeof(hostHash), '\0');
int offset = strlen(hostHash);
if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) {
INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
strncpy(hostHash, hostId, sizeof(hostHash));
} else {
View File
@ -78,6 +78,15 @@ typedef struct ncclConfig_v21700 {
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
}
/* NCCL malloc and free functions for all types of NCCL optimizations
 * (e.g. user buffer registration). The actual allocated size might
 * be larger than requested due to granularity requirements. */
ncclResult_t ncclMemAlloc(void** ptr, size_t size);
ncclResult_t pncclMemAlloc(void** ptr, size_t size);
ncclResult_t ncclMemFree(void *ptr);
ncclResult_t pncclMemFree(void *ptr);
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
* This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
@ -417,6 +426,14 @@ ncclResult_t pncclGroupStart();
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
/* Register CUDA buffer for zero-copy operation */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
/* Deregister CUDA buffer */
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
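/* Editor's sketch (illustration only): intended usage of the allocation and
 * registration entry points declared above, with hypothetical names `comm`
 * and `count`. Error handling is abbreviated.
 *
 *   void* buff; void* handle;
 *   ncclMemAlloc(&buff, count * sizeof(float));              // size may be rounded up
 *   ncclCommRegister(comm, buff, count * sizeof(float), &handle);
 *   // ... collectives using buff ...
 *   ncclCommDeregister(comm, handle);
 *   ncclMemFree(buff);
 */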
#ifdef __cplusplus
} // end extern "C"
#endif
View File
@ -1,3 +1,9 @@
/*************************************************************************
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "net.h"
#include "bootstrap.h"
#include "checks.h"
@ -9,148 +15,190 @@
//#include <sys/stat.h>
//#include <unistd.h>
static ncclNet_v6_t ncclNet_v4_as_v6;
static ncclNet_v6_t ncclNet_v5_as_v6;
static ncclNet_v4_t *ncclNet_v4;
static ncclNet_v7_t ncclNet_v5_as_v7;
static ncclNet_v7_t ncclNet_v6_as_v7;
static ncclNet_v5_t *ncclNet_v5;
static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
static ncclCollNet_v4_t *ncclCollNet_v4;
static ncclNet_v6_t *ncclNet_v6;
static ncclCollNet_v7_t ncclCollNet_v5_as_v7;
static ncclCollNet_v7_t ncclCollNet_v6_as_v7;
static ncclCollNet_v5_t *ncclCollNet_v5;
static ncclCollNet_v6_t *ncclCollNet_v6;
static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
ncclNetProperties_v4_t p4;
ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
ncclNetProperties_v6_t p6;
ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
if (ans != ncclSuccess) return ans;
props->name = p4.name;
props->pciPath = p4.pciPath;
props->guid = p4.guid;
props->ptrSupport = p4.ptrSupport;
props->speed = p4.speed;
props->port = p4.port;
props->maxComms = p4.maxComms;
props->maxRecvs = 1;
props->latency = 0;
props->name = p6.name;
props->pciPath = p6.pciPath;
props->guid = p6.guid;
props->ptrSupport = p6.ptrSupport;
props->speed = p6.speed;
props->port = p6.port;
props->maxComms = p6.maxComms;
props->maxRecvs = p6.maxRecvs;
props->latency = p6.latency;
props->netDeviceType = NCCL_NET_DEVICE_HOST;
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
return ncclSuccess;
}
static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
return ncclNet_v6->connect(dev, handle, sendComm);
}
static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
if (n == 0) return ncclSuccess;
if (n != 1) return ncclInvalidArgument;
return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
return ncclNet_v6->accept(listenComm, recvComm);
}
static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
if (n == 0) return ncclSuccess;
if (n != 1) return ncclInvalidArgument;
return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
}
// We use a wrapper around the v4 init to copy over the struct contents
// post-init since they may not be initialized beforehand.
static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclNet_v4->init(logfn));
ncclNet_v4_as_v6.name = ncclNet_v4->name;
ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
ncclNet_v4_as_v6.regMrDmaBuf = NULL;
ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
ncclNet_v4_as_v6.test = ncclNet_v4->test;
ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclNet_v6->init(logfn));
ncclNet_v6_as_v7.name = ncclNet_v6->name;
ncclNet_v6_as_v7.devices = ncclNet_v6->devices;
ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties;
ncclNet_v6_as_v7.listen = ncclNet_v6->listen;
ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect;
ncclNet_v6_as_v7.accept = ncclNet_v6_as_v7_accept;
ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr;
ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr;
ncclNet_v6_as_v7.isend = ncclNet_v6->isend;
ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv;
ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush;
ncclNet_v6_as_v7.test = ncclNet_v6->test;
ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend;
ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv;
ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen;
ncclNet_v6_as_v7.getDeviceMr = NULL;
ncclNet_v6_as_v7.irecvConsumed = NULL;
return ncclSuccess;
}
static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
ncclNetProperties_v6_t p6;
ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
if (ans != ncclSuccess) return ans;
props->name = p6.name;
props->pciPath = p6.pciPath;
props->guid = p6.guid;
props->ptrSupport = p6.ptrSupport;
props->speed = p6.speed;
props->port = p6.port;
props->maxComms = p6.maxComms;
props->maxRecvs = p6.maxRecvs;
props->latency = p6.latency;
props->netDeviceType = NCCL_NET_DEVICE_HOST;
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
return ncclSuccess;
}
static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
return ncclNet_v5->connect(dev, handle, sendComm);
}
static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
return ncclNet_v5->accept(listenComm, recvComm);
}
// We use a wrapper around the v5 init to copy over the struct contents
// post-init since they may not be initialized beforehand.
static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclNet_v5->init(logfn));
ncclNet_v5_as_v6.name = ncclNet_v5->name;
ncclNet_v5_as_v6.devices = ncclNet_v5->devices;
ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties;
ncclNet_v5_as_v6.listen = ncclNet_v5->listen;
ncclNet_v5_as_v6.connect = ncclNet_v5->connect;
ncclNet_v5_as_v6.accept = ncclNet_v5->accept;
ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr;
ncclNet_v5_as_v6.regMrDmaBuf = NULL;
ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr;
ncclNet_v5_as_v6.isend = ncclNet_v5->isend;
ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv;
ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush;
ncclNet_v5_as_v6.test = ncclNet_v5->test;
ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend;
ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv;
ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen;
ncclNet_v5_as_v7.name = ncclNet_v5->name;
ncclNet_v5_as_v7.devices = ncclNet_v5->devices;
ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties;
ncclNet_v5_as_v7.listen = ncclNet_v5->listen;
ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect;
ncclNet_v5_as_v7.accept = ncclNet_v5_as_v7_accept;
ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr;
ncclNet_v5_as_v7.regMrDmaBuf = NULL;
ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr;
ncclNet_v5_as_v7.isend = ncclNet_v5->isend;
ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv;
ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush;
ncclNet_v5_as_v7.test = ncclNet_v5->test;
ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend;
ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv;
ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen;
ncclNet_v5_as_v7.getDeviceMr = NULL;
ncclNet_v5_as_v7.irecvConsumed = NULL;
return ncclSuccess;
}
static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
ncclNetProperties_v4_t p4;
ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
ncclNetProperties_v6_t p6;
ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
if (ans != ncclSuccess) return ans;
props->name = p4.name;
props->pciPath = p4.pciPath;
props->guid = p4.guid;
props->ptrSupport = p4.ptrSupport;
props->speed = p4.speed;
props->port = p4.port;
props->maxComms = p4.maxComms;
props->maxRecvs = 1;
props->latency = 0;
return ncclSuccess;
}
// We use a wrapper around the v4 init to copy over the struct contents
// post-init since they may not be initialized beforehand.
static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclCollNet_v4->init(logfn));
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
props->name = p6.name;
props->pciPath = p6.pciPath;
props->guid = p6.guid;
props->ptrSupport = p6.ptrSupport;
props->speed = p6.speed;
props->port = p6.port;
props->maxComms = p6.maxComms;
props->maxRecvs = p6.maxRecvs;
props->latency = p6.latency;
props->netDeviceType = NCCL_NET_DEVICE_HOST;
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
return ncclSuccess;
}
// We use a wrapper around the v5 init to copy over the struct contents
// post-init since they may not be initialized beforehand.
static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclCollNet_v5->init(logfn));
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices;
ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties;
ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen;
ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect;
ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport;
ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr;
ncclCollNet_v5_as_v6.regMrDmaBuf = NULL;
ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr;
ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce;
ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush;
ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test;
ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl;
ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen;
ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices;
ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties;
ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen;
ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect;
ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport;
ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr;
ncclCollNet_v5_as_v7.regMrDmaBuf = NULL;
ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr;
ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce;
ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush;
ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test;
ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl;
ncclCollNet_v5_as_v7.closeListen = ncclCollNet_v5->closeListen;
return ncclSuccess;
}
static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
ncclNetProperties_v6_t p6;
ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
if (ans != ncclSuccess) return ans;
props->name = p6.name;
props->pciPath = p6.pciPath;
props->guid = p6.guid;
props->ptrSupport = p6.ptrSupport;
props->speed = p6.speed;
props->port = p6.port;
props->maxComms = p6.maxComms;
props->maxRecvs = p6.maxRecvs;
props->latency = p6.latency;
props->netDeviceType = NCCL_NET_DEVICE_HOST;
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
return ncclSuccess;
}
// We use a wrapper around the v6 init to copy over the struct contents
// post-init since they may not be initialized beforehand.
static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclCollNet_v6->init(logfn));
ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices;
ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties;
ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen;
ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect;
ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport;
ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr;
ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr;
ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce;
ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush;
ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test;
ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl;
ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen;
return ncclSuccess;
}
@ -167,7 +215,7 @@ enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, n
ncclResult_t ncclNetPluginInit() {
char ncclNetPluginName[128];
const char* envPluginName = getenv("NCCL_NET_PLUGIN");
const char* envPluginName = ncclGetEnv("NCCL_NET_PLUGIN");
if (envPluginName && strlen(envPluginName)) {
snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName);
@ -176,62 +224,97 @@ ncclResult_t ncclNetPluginInit() {
}
void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
// dlopen does not guarantee to set errno, but dlerror only gives us a
// string, so checking errno doesn't hurt and may let us provide a better
// error message
if (errno == ENOENT) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : dlerror=%s No plugin found (%s), using internal implementation", dlerror(), ncclNetPluginName);
// exit(-1);
} else {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
return ncclSuccess;
}
ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
if (ncclNets[0] == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
// Try v5 plugin
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
if (ncclNet_v5 == nullptr) {
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
if (ncclNet_v4 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
// Try v6 plugin
ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
if (ncclNet_v6 == nullptr) {
// Try v5 plugin
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
if (ncclNet_v5 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
if (netPluginLib != nullptr) dlclose(netPluginLib);
return ncclSuccess;
} else {
ncclNets[0] = &ncclNet_v5_as_v7;
ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init;
// Set the name right away to allow for NCCL_NET=... to work
ncclNet_v5_as_v7.name = ncclNet_v5->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
}
ncclNets[0] = &ncclNet_v4_as_v6;
ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
// Set the name right away to allow for NCCL_NET=... to work
ncclNet_v4_as_v6.name = ncclNet_v4->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
} else {
ncclNets[0] = &ncclNet_v5_as_v6;
ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
ncclNets[0] = &ncclNet_v6_as_v7;
ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init;
// Set the name right away to allow for NCCL_NET=... to work
ncclNet_v5_as_v6.name = ncclNet_v5->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
ncclNet_v6_as_v7.name = ncclNet_v6->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
}
}
// Check for CollNet
ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7");
if (ncclCollNets[0] == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
if (ncclCollNet_v5 == nullptr) {
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
if (ncclCollNet_v4 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
if (ncclCollNet_v6 == nullptr) {
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
if (ncclCollNet_v5 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
} else {
ncclCollNets[0] = &ncclCollNet_v4_as_v6;
ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
ncclCollNets[0] = &ncclCollNet_v5_as_v7;
ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init;
ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
}
} else {
ncclCollNets[0] = &ncclCollNet_v5_as_v6;
ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
ncclCollNets[0] = &ncclCollNet_v6_as_v7;
ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init;
ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
}
}
return ncclSuccess;
}
ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) {
ncclNetProperties_t props;
NCCLCHECK(net->getProperties(dev, &props));
ncclNetDeviceType type = props.netDeviceType;
if (type) switch (type) {
case NCCL_NET_DEVICE_UNPACK:
if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) {
INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d",
props.netDeviceVersion);
return ncclSuccess;
} else {
WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it",
props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION);
return ncclInternalError;
}
default:
WARN("Unknown device code index");
return ncclInternalError;
}
INFO(NCCL_INIT, "Using non-device net plugin version %d",
props.netDeviceVersion);
return ncclSuccess;
}
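/* Editor's sketch (illustration only): the two property combinations the
 * check above accepts. A hypothetical v7 plugin's getProperties() would fill
 * in one of these; all other fields are elided and the helper itself is not
 * part of this diff. */
static void exampleFillNetDeviceProps(ncclNetProperties_v7_t* props, int wantsUnpack) {
  if (wantsUnpack) {
    props->netDeviceType = NCCL_NET_DEVICE_UNPACK;            // device offload path
    props->netDeviceVersion = NCCL_NET_DEVICE_UNPACK_VERSION; // must match exactly
  } else {
    props->netDeviceType = NCCL_NET_DEVICE_HOST;              // host-only plugin
    props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  }
}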
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
pthread_mutex_lock(&netLock);
if (ncclNetStates[i] == ncclNetStateInit) {
@ -268,6 +351,10 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
NCCLCHECK(netGetState(i, &state));
if (state != ncclNetStateEnabled) continue;
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
// Mismatched device plugin version
continue;
}
comm->ncclNet = ncclNets[i];
ok = true;
@ -334,10 +421,10 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
}
if (sComm == NULL)
NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2);
NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2);
if (rComm == NULL)
NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2);
NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
connected = (rComm != NULL) && (sComm != NULL);
}
@ -366,5 +453,11 @@ cleanup1:
}
int ncclNetVersion(struct ncclComm* comm) {
return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);
if (comm->ncclNet == &ncclNet_v5_as_v7) {
return 5;
} else if (comm->ncclNet == &ncclNet_v6_as_v7) {
return 6;
} else {
return 7;
}
}

View File

@ -15,6 +15,16 @@
#include <sys/syscall.h>
#include <assert.h>
#include <unistd.h>
#include <sys/time.h>
#define PROGRESS_RUNNING 0
#define PROGRESS_REQUEST_STOP 1
#define PROGRESS_ABORT 2
#define PROGRESS_COMPLETE 3
#define SERVICE_RUNNING 0
#define SERVICE_COMPLETE 1
enum { proxyRecv=0, proxySend=1 };
@ -50,7 +60,7 @@ static void expectedProxyResponseFree(struct ncclProxyState* state) {
}
}
static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize, ncclResult_t res) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
while (elem) {
if (elem->opId == opId) {
@ -67,6 +77,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
memcpy(elem->respBuff, respBuff, respSize);
free(respBuff);
elem->done = true;
elem->res = res;
return ncclSuccess;
}
elem = elem->next;
@ -84,6 +95,7 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v
// Pre-alloc response buffer
ex->respBuff = malloc(respSize);
ex->respSize = respSize;
ex->res = ncclInternalError;
ex->done = false;
// Enqueue
@ -109,10 +121,11 @@ static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, v
prev->next = elem->next;
}
memcpy(respBuff, elem->respBuff, elem->respSize);
ncclResult_t res = elem->res;
free(elem->respBuff);
free(elem);
*found = 1;
return ncclSuccess;
return res;
}
prev = elem;
elem = elem->next;
@ -509,7 +522,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
return ncclInternalError;
}
if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
if (connector->proxyConn.proxyProgress == NULL) return ncclSuccess;
if (justInquire) *justInquire = true;
else {
@ -707,13 +720,13 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->active == NULL) {
pthread_mutex_lock(&pool->mutex);
while (pool->nextOps == -1 && !state->stop) {
while (pool->nextOps == -1 && state->stop == PROGRESS_RUNNING) {
struct ncclProxyArgs profArgs; // Only used for profiling purposes
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
pthread_cond_wait(&pool->cond, &pool->mutex);
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
}
if (state->stop) { // We might have been woken up to stop.
if (state->stop != PROGRESS_RUNNING) { // We might have been woken up to stop.
pthread_mutex_unlock(&pool->mutex);
return ncclSuccess;
}
@ -851,12 +864,13 @@ void* ncclProxyProgress(void *proxyState_) {
* frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
int proxyOpAppendCounter = 0;
struct ncclProxyArgs profArgs; // Only used for profiling purposes
while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
while (state->stop == PROGRESS_RUNNING || (state->stop == PROGRESS_REQUEST_STOP && state->active)) {
int idle = 1;
ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
if (ret != ncclSuccess) {
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
return NULL;
__atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
continue;
}
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
@ -864,11 +878,12 @@ void* ncclProxyProgress(void *proxyState_) {
int added = 0;
proxyOpAppendCounter = 0;
TIME_START(3);
if (state->stop == false)
if (state->stop == PROGRESS_RUNNING)
ret = ncclProxyGetPostedOps(proxyState, &added);
if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
if (ret != ncclSuccess) {
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
__atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
}
if (added == 0) {
sched_yield(); // No request progressed. Let others run.
@ -876,6 +891,9 @@ void* ncclProxyProgress(void *proxyState_) {
}
lastIdle = idle;
}
/* The proxy service thread should be waiting for us; notify it that the progress thread is done. */
__atomic_store_n(&state->stop, PROGRESS_COMPLETE, __ATOMIC_RELEASE);
return NULL;
}
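The stop field now acts as a small state machine shared between the destroy path and the detached progress thread: it is moved from PROGRESS_RUNNING to PROGRESS_REQUEST_STOP (or PROGRESS_ABORT), and the progress thread publishes PROGRESS_COMPLETE on exit. A standalone sketch of that shutdown handshake, using C11 atomics in place of the __atomic builtins (all names below are illustrative, not NCCL code):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

enum { RUNNING = 0, REQUEST_STOP = 1, ABORT = 2, COMPLETE = 3 };
static atomic_int stop_state = RUNNING;

static void* progress(void* arg) {
  (void)arg;
  while (atomic_load_explicit(&stop_state, memory_order_acquire) == RUNNING) {
    sched_yield();                 /* stand-in for progressing posted ops */
  }
  /* On REQUEST_STOP the real thread keeps draining active ops first; on ABORT it exits. */
  atomic_store_explicit(&stop_state, COMPLETE, memory_order_release);
  return NULL;
}

int main(void) {
  pthread_t t;
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
  pthread_create(&t, &attr, progress, NULL);
  pthread_attr_destroy(&attr);

  atomic_store_explicit(&stop_state, REQUEST_STOP, memory_order_release);
  /* The thread is detached, so spin on the published state instead of joining. */
  while (atomic_load_explicit(&stop_state, memory_order_acquire) != COMPLETE) sched_yield();
  printf("progress thread finished\n");
  return 0;
}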
@ -898,7 +916,11 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
struct ncclProxyProgressState* state = &proxyState->progressState;
if (!state->thread) {
pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
pthread_attr_t attr;
SYSCHECK(pthread_attr_init(&attr), "pthread_attr_init");
SYSCHECK(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED), "pthread_attr_setdetachstate");
SYSCHECK(pthread_create(&state->thread, &attr, ncclProxyProgress, proxyState), "pthread_create");
SYSCHECK(pthread_attr_destroy(&attr), "pthread_attr_destroy");
ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
}
return ncclSuccess;
@ -910,10 +932,17 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
// Request the proxy to stop and then wake it
if (state->opsPool) {
pthread_mutex_lock(&state->opsPool->mutex);
state->stop = true;
if (*proxyState->abortFlag == 0)
state->stop = PROGRESS_REQUEST_STOP;
else
state->stop = PROGRESS_ABORT;
pthread_cond_signal(&state->opsPool->cond);
pthread_mutex_unlock(&state->opsPool->mutex);
pthread_join(state->thread, NULL);
/* The progress thread is always detached, so wait here for it to exit. */
uint64_t t0 = clockNano();
while (__atomic_load_n(&state->stop, __ATOMIC_ACQUIRE) != PROGRESS_COMPLETE) {
if (clockNano() - t0 >= 1000) sched_yield();
}
}
// Free off any memory allocated for the proxy arg pools
@ -1005,7 +1034,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
int ready, proxyRank = -1;
struct ncclProxyState* sharedProxyState = comm->proxyState;
// Keep one connection per mlocal rank
// Keep one connection per local rank
for (int i = 0; i < comm->localRanks; ++i) {
/* find the proxy rank in comm. */
if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
@ -1058,42 +1087,43 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
}
}
INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
return ncclSuccess;
}
// cuMem API support
// The response is sent out-of-band using ncclIpcSocket for this specific command
ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) {
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) {
ncclResult_t ret = ncclSuccess;
ncclResult_t res = ncclInProgress;
struct ncclIpcSocket ipcSock = { 0 };
void* opId = malloc(1);
void *opId = (void*)((((uintptr_t)random()) << 32) | random());
// Create a UDS socket to receive the converted fd
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
// Request the conversion of the fd over sockets
NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
// Request the allocation of a UDS fd for the handle over sockets
NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error);
// Receive converted fd over UDS
NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, convertedFd));
TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
// Receive the converted fd over UDS
NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd);
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);
// Wait for proxy response (sockets)
while (res == ncclInProgress) {
res = ncclPollProxyResponse(comm, proxyConn, NULL, opId);
}
free(opId);
return res;
return ret;
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret);
return ret;
}
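ncclIpcSocketSendFd()/ncclIpcSocketRecvFd() are not part of this hunk; they presumably wrap the standard SCM_RIGHTS ancillary-data mechanism for passing a file descriptor over a Unix domain socket, roughly as in this illustrative helper (not NCCL code):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

// Sketch: send one fd over a connected AF_UNIX socket using SCM_RIGHTS.
static int send_fd(int sock, int fd) {
  char byte = 0;
  struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
  char ctrl[CMSG_SPACE(sizeof(int))];
  memset(ctrl, 0, sizeof(ctrl));
  struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                        .msg_control = ctrl, .msg_controllen = sizeof(ctrl) };
  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;              /* pass a file descriptor */
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
  return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

The receive side is symmetric: recvmsg() with a control buffer of the same size, reading the duplicated fd out of CMSG_DATA().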
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock;
ncclResult_t ret = ncclSuccess;
@ -1132,14 +1162,13 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
// Check response queue
int found = 0;
NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
ncclResult_t res = expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found);
if (found == 0) {
// Attempt to read in a new response header from the proxy thread
struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
void* recvOpId;
ncclProxyRpcResponseHeader resp = {0};
int offset = 0;
if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)) {
WARN("Socket recv failed while polling for opId=%p", opId);
return ncclInternalError;
}
@ -1147,42 +1176,38 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
if (offset == 0) {
return ncclInProgress;
// If we've returned a partial response, block to receive the rest of it
} else if (offset < sizeof(recvOpId)) {
while (offset < sizeof(recvOpId))
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
} else if (offset < sizeof(resp)) {
while (offset < sizeof(resp))
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset));
}
INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId);
// Now do a blocking recv of the response size
int respSize = 0;
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", resp.opId);
// If there's a respSize to recv
if (respSize > 0) {
if (recvOpId != opId) {
if (resp.respSize > 0) {
if (resp.opId != opId) {
// Unexpected response, need to buffer the socket data
respBuff = malloc(respSize);
respBuff = malloc(resp.respSize);
}
assert(respBuff != NULL);
NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize));
}
if (recvOpId == opId) {
INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
return ncclSuccess;
if (resp.opId == opId) {
INFO(NCCL_PROXY, "resp.opId=%p matches expected opId=%p", resp.opId, opId);
NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, resp.opId));
return resp.res;
} else {
INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", resp.opId, respBuff, resp.respSize);
// Store the result and mark response as completed
NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
NCCLCHECK(expectedProxyResponseStore(sharedProxyState, resp.opId, respBuff, resp.respSize, resp.res));
return ncclInProgress;
}
} else {
INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
}
return ncclSuccess;
return res;
}
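The ncclProxyRpcResponseHeader type used above is defined elsewhere in this commit; judging from its initializer {op->opId, res, op->respSize} and the fields read in ncclPollProxyResponse(), it presumably combines the operation id, the proxy-side result, and the payload size in one struct, roughly:

#include "nccl.h"   // assumed header providing ncclResult_t

// Sketch of the assumed layout: one header replaces the separate opId and respSize sends.
typedef struct {
  void* opId;        // identifies which async operation this response answers
  ncclResult_t res;  // proxy-side result, now propagated back to the requester
  int respSize;      // number of response payload bytes that follow on the socket
} ncclProxyRpcResponseHeader;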
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
@ -1284,38 +1309,52 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
}
// cuMem API support
static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) {
static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) {
#if CUDART_VERSION >= 11030
// cuMem API support
ncclResult_t ret = ncclSuccess;
struct ncclIpcSocket ipcSock = { 0 };
uint64_t hash = (uint64_t) opId;
INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash);
INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
int fd = -1;
CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0));
// Send back the converted fd using UDS
NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag));
NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error);
NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error);
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
return ncclSuccess;
// We can now safely close the exported fd
(void) close(fd);
return ret;
#else
return ncclInternalError;
#endif
}
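The importing side of this fd exchange does not appear in this hunk; once ncclProxyClientGetFdBlocking() has received the fd, the client would presumably convert it back into a cuMem allocation handle with the matching driver call, roughly as sketched here (error handling omitted, helper name illustrative):

#include <stdint.h>
#include <cuda.h>

// Sketch: map an fd received over the UDS socket back to a cuMem allocation handle.
static CUresult importHandleFromFd(int fd, CUmemGenericAllocationHandle* handle) {
  return cuMemImportFromShareableHandle(handle, (void*)(uintptr_t)fd,
                                        CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
}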
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) {
int done = 1;
ncclResult_t res = ncclInternalError;
if (op->type == ncclProxyMsgSetup) {
TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
res = op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
} else if (op->type == ncclProxyMsgConnect) {
TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
res = op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
} else if (op->type == ncclProxyMsgSharedInit) {
int nChannels = (int) *op->reqBuff;
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
if (op->connection->tcomm->proxySharedInit) NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels);
__atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
} else if (op->type == ncclProxyMsgConvertFd) {
int fd = *(int *)op->reqBuff;
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
NCCLCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
} else if (op->type == ncclProxyMsgGetFd) {
uint64_t handle = *(uint64_t*)op->reqBuff;
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle);
res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support
} else if (op->type == ncclProxyMsgInit) {
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection));
res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
} else return ncclInternalError;
if (done) {
@ -1329,11 +1368,10 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
* to abort and close the connection, it can cause segfault if the requester is using
* the respBuff. */
// Send the opId for referencing async operation
NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
ncclProxyRpcResponseHeader resp = {op->opId, res, op->respSize};
// Send the response size
NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
// Send the opId for referencing async operation
NCCLCHECK(ncclSocketSend(op->connection->sock, &resp, sizeof(resp)));
if (op->respSize) {
// Send the response
@ -1386,7 +1424,7 @@ static bool proxyMatchOpType(int type) {
case ncclProxyMsgSharedInit:
case ncclProxyMsgSetup:
case ncclProxyMsgConnect:
case ncclProxyMsgConvertFd:
case ncclProxyMsgGetFd:
return true;
default:
return false;
@ -1544,6 +1582,19 @@ void* ncclProxyService(void* _args) {
ncclSocketClose(proxyState->listenSock);
free(proxyState->listenSock);
proxyOpsFree(proxyState);
if (*proxyState->abortFlag) {
/* An abort happened; notify the main thread that the service thread is done. */
__atomic_store_n(&proxyState->stop, SERVICE_COMPLETE, __ATOMIC_RELEASE);
}
if (ncclAtomicRefCountDecrement(proxyState->abortFlagRefCount) == 0) {
ncclCudaHostFree((void *)proxyState->abortFlag);
free((void*)proxyState->abortFlagRefCount);
}
/* The proxy itself holds one internal ref count, so it needs to call ncclProxyDestroy here. */
ncclProxyDestroy(proxyState);
return NULL;
}
@ -1552,8 +1603,16 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
comm->proxyState = comm->sharedRes->proxyState;
comm->proxyState->refCount = 1;
/* ref count for communicator and proxy service thread. */
comm->proxyState->internalRefCount = 2;
comm->proxyState->listenSock = sock;
comm->proxyState->peerAddresses = peerAddresses;
// Seed the random number generator for UDS filename generation
struct timeval time;
gettimeofday(&time,NULL);
unsigned int seed = time.tv_sec*time.tv_usec;
seed ^= getpid();
srandom(seed);
return ncclSuccess;
}
@ -1568,6 +1627,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
proxyState->tpLocalnRanks = comm->localRanks;
proxyState->cudaDev = comm->cudaDev;
proxyState->abortFlag = comm->abortFlag;
proxyState->abortFlagRefCount = comm->abortFlagRefCount;
ncclAtomicRefCountIncrement(comm->abortFlagRefCount);
proxyState->p2pnChannels = comm->p2pnChannels;
proxyState->p2pChunkSize = comm->p2pChunkSize;
proxyState->nChannels = comm->nChannels;
@ -1584,8 +1645,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
}
ncclResult_t ncclProxyStop(struct ncclComm* comm) {
if (comm->sharedRes && comm->sharedRes->proxyState) {
struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
if (comm->proxyState) {
struct ncclProxyState* sharedProxyState = comm->proxyState;
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
if (sharedProxyState->peerAddresses) {
@ -1625,15 +1686,41 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
assert(sharedProxyState->refCount == 0);
free(sharedProxyState->peerAddresses);
free(sharedProxyState->peerSocks);
free(sharedProxyState->proxyOps);
free(sharedProxyState->sharedDevMems);
expectedProxyResponseFree(sharedProxyState);
free(sharedProxyState);
ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState) {
if (__atomic_sub_fetch(&proxyState->internalRefCount, 1, __ATOMIC_ACQ_REL) == 0) {
free(proxyState->peerAddresses);
free(proxyState->peerSocks);
free(proxyState->proxyOps);
free(proxyState->sharedDevMems);
expectedProxyResponseFree(proxyState);
free(proxyState);
}
return ncclSuccess;
}
/* detach all proxy threads in case of abort */
ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState) {
if (proxyState && proxyState->thread) {
/* The proxy service thread can call cudaFreeHost to free pinned host memory, but
 * that can hang if the main thread is issuing other CUDA calls. The proper solution
 * would be to allocate/free pinned host memory through the cuMem* driver API; this
 * 5-second wait is just a workaround for now. */
bool join = false;
struct timespec start, now;
clock_gettime(CLOCK_MONOTONIC, &start);
do {
clock_gettime(CLOCK_MONOTONIC, &now);
if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) == SERVICE_COMPLETE) {
/* proxy thread is done, join it. */
pthread_join(proxyState->thread, NULL);
join = true;
break;
}
} while(now.tv_sec - start.tv_sec < 5);
if (join == false) {
pthread_detach(proxyState->thread);
}
}
return ncclSuccess;
}

Some files were not shown because too many files have changed in this diff.