Add support for external network.

Dynamically load external network from libnccl-net.so.
Add init function in networks.
Move PCI scoring to net.cu, only ask transport to provide a path.
Simplify CUDA PCI path detection.
Add dummy external network.
This commit is contained in:
Sylvain Jeaugey 2018-11-13 10:37:20 -08:00
parent d7a58cfa58
commit 0d3a20f96d
29 changed files with 437 additions and 361 deletions

17
ext-net/dummy/Makefile Normal file
View File

@ -0,0 +1,17 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Location of the NCCL build tree (provides nccl.h / nccl_net.h) and of the
# CUDA toolkit. Both can be overridden on the make command line, e.g.
#   make NCCL_HOME=/opt/nccl CUDA_HOME=/opt/cuda
NCCL_HOME:=../../build/
CUDA_HOME:=/usr/local/cuda
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
PLUGIN_SO:=libnccl-net.so

# Neither of these targets produces a file of the same name; declaring them
# phony keeps them working even if a file called "default" or "clean" exists.
.PHONY: default clean

default: $(PLUGIN_SO)

# Build the plugin as a position-independent shared object whose soname is the
# library name itself.
$(PLUGIN_SO): plugin.c
	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

clean:
	rm -f $(PLUGIN_SO)

44
ext-net/dummy/plugin.c Normal file
View File

@ -0,0 +1,44 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include <nccl.h>
#include <nccl_net.h>
#define __hidden __attribute__ ((visibility("hidden")))
/*
 * Initialize the plugin.
 *
 * The signature matches the `init` member of ncclNet_t, which receives the
 * logging callback supplied by the caller. The dummy plugin has nothing to
 * set up and never logs, so the callback is accepted and ignored.
 *
 * Always returns ncclSuccess.
 */
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) {
  (void)logFunction; /* unused: the dummy plugin emits no log messages */
  return ncclSuccess;
}
/*
 * Report the number of network adapters provided by this plugin.
 *
 * The dummy plugin exposes no adapters: it stores 0 in *ndev and reports
 * success.
 */
__hidden ncclResult_t pluginDevices(int* ndev) {
  const int adapterCount = 0;
  *ndev = adapterCount;
  return ncclSuccess;
}
/*
 * PCI path query for adapter `dev`. Not implemented by the dummy plugin:
 * *path is left untouched and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginPciPath(int dev, char** path) {
  (void)dev;
  (void)path;
  return ncclInternalError;
}
/*
 * Supported pointer types query for adapter `dev`. Not implemented by the
 * dummy plugin: *supportedTypes is left untouched and ncclInternalError is
 * returned.
 */
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
  (void)dev;
  (void)supportedTypes;
  return ncclInternalError;
}
/*
 * Listen entry point. Not implemented by the dummy plugin: no output
 * argument is written and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) {
  (void)dev;
  (void)handle;
  (void)listenComm;
  return ncclInternalError;
}
/*
 * Connect entry point. Not implemented by the dummy plugin: no output
 * argument is written and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) {
  (void)dev;
  (void)handle;
  (void)sendComm;
  return ncclInternalError;
}
/*
 * Accept entry point. Not implemented by the dummy plugin: *recvComm is left
 * untouched and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) {
  (void)listenComm;
  (void)recvComm;
  return ncclInternalError;
}
/*
 * Send entry point. Not implemented by the dummy plugin: no request is
 * created and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int type, void** request) {
  (void)sendComm;
  (void)data;
  (void)size;
  (void)type;
  (void)request;
  return ncclInternalError;
}
/*
 * Receive entry point. Not implemented by the dummy plugin: no request is
 * created and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, int type, void** request) {
  (void)recvComm;
  (void)data;
  (void)size;
  (void)type;
  (void)request;
  return ncclInternalError;
}
/*
 * Flush entry point. Not implemented by the dummy plugin: the arguments are
 * ignored and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size) {
  (void)recvComm;
  (void)data;
  (void)size;
  return ncclInternalError;
}
/*
 * Request completion test entry point. Not implemented by the dummy plugin:
 * neither *done nor *size is written and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) {
  (void)request;
  (void)done;
  (void)size;
  return ncclInternalError;
}
/*
 * Close a send communicator. Not implemented by the dummy plugin: the
 * argument is ignored and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginCloseSend(void* sendComm) {
  (void)sendComm;
  return ncclInternalError;
}
/*
 * Close a receive communicator. Not implemented by the dummy plugin: the
 * argument is ignored and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginCloseRecv(void* recvComm) {
  (void)recvComm;
  return ncclInternalError;
}
/*
 * Close a listen communicator. Not implemented by the dummy plugin: the
 * argument is ignored and ncclInternalError is returned.
 */
__hidden ncclResult_t pluginCloseListen(void* listenComm) {
  (void)listenComm;
  return ncclInternalError;
}
/*
 * The plugin's only non-hidden symbol: the table of entry points describing
 * this network. NCCL_PLUGIN_SYMBOL is a macro provided by nccl_net.h, so the
 * actual symbol name is decided by that header.
 *
 * The initializer is positional, so the entries must stay in the same order
 * as the members of ncclNet_t in nccl_net.h.
 */
ncclNet_t NCCL_PLUGIN_SYMBOL = {
"Dummy", /* network name */
pluginInit,
pluginDevices,
pluginPciPath,
pluginPtrSupport,
pluginListen,
pluginConnect,
pluginAccept,
pluginIsend,
pluginIrecv,
pluginFlush,
pluginTest,
pluginCloseSend,
pluginCloseRecv,
pluginCloseListen
};

View File

@ -8,7 +8,7 @@ include ../makefiles/common.mk
include ../makefiles/version.mk include ../makefiles/version.mk
##### src files ##### src files
INCEXPORTS := nccl.h INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \ LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \ misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
@ -80,6 +80,11 @@ $(INCDIR)/%.h : %.h
mkdir -p $(INCDIR) mkdir -p $(INCDIR)
cp -f $< $@ cp -f $< $@
$(INCDIR)/nccl_%.h : include/nccl_%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cu $(OBJDIR)/%.o : %.cu
@printf "Compiling %-35s > %s\n" $< $@ @printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@` mkdir -p `dirname $@`

View File

@ -217,7 +217,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
int rank = state->rank; int rank = state->rank;
int nranks = state->nranks; int nranks = state->nranks;
TRACE(INIT, "rank %d nranks %d size %d", rank, nranks, size); TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
/* Simple ring based AllGather /* Simple ring based AllGather
* At each step i receive data from (rank-i-1) from left * At each step i receive data from (rank-i-1) from left
@ -233,7 +233,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size)); NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
} }
TRACE(INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
return ncclSuccess; return ncclSuccess;
} }

View File

@ -12,7 +12,7 @@
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count, ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype); size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) { if (comm->nRanks == 1) {
if (sendbuff != recvbuff) if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));

View File

@ -12,7 +12,7 @@
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count, ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype); size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) { if (comm->nRanks == 1) {
if (sendbuff != recvbuff) if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));

View File

@ -12,7 +12,7 @@
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count, ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype); size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) { if (comm->nRanks == 1) {
if (sendbuff != recvbuff) if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));

View File

@ -12,7 +12,7 @@
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count, ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype); size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) { if (comm->nRanks == 1) {
if (sendbuff != recvbuff) if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));

View File

@ -12,7 +12,7 @@
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count, ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype); size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) { if (comm->nRanks == 1) {
if (sendbuff != recvbuff) if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));

View File

@ -271,7 +271,7 @@ struct ncclComm {
while (ret == -1) { \ while (ret == -1) { \
SYSCHECKVAL(call, name, ret); \ SYSCHECKVAL(call, name, ret); \
if (ret == -1) { \ if (ret == -1) { \
INFO(ALL,"Got %s, retrying", strerror(errno)); \ INFO(NCCL_ALL,"Got %s, retrying", strerror(errno)); \
}\ }\
} \ } \
} while (0); } while (0);
@ -313,7 +313,7 @@ struct ncclComm {
ncclResult_t res = call; \ ncclResult_t res = call; \
if (res != ncclSuccess) { \ if (res != ncclSuccess) { \
/* Print the back trace*/ \ /* Print the back trace*/ \
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \ return res; \
} \ } \
} while (0); } while (0);
@ -322,7 +322,7 @@ struct ncclComm {
res = call; \ res = call; \
if (res != ncclSuccess) { \ if (res != ncclSuccess) { \
/* Print the back trace*/ \ /* Print the back trace*/ \
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \ goto label; \
} \ } \
} while (0); } while (0);

View File

@ -16,65 +16,24 @@
#include <limits.h> #include <limits.h>
#include <string.h> #include <string.h>
#include "nccl.h" #include "nccl.h"
#include "nccl_net.h"
#define gettid() (pid_t) syscall(SYS_gettid) #define gettid() (pid_t) syscall(SYS_gettid)
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel; extern int ncclDebugLevel;
typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
extern DebugLevel ncclDebugLevel;
extern uint64_t ncclDebugMask; extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock; extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile; extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen); extern ncclResult_t getHostName(char* hostname, int maxlen);
#define WARN(...) do { \ extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
if (ncclDebugLevel >= WARN) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
fprintf(ncclDebugFile,__VA_ARGS__); \
fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
} \
} while(0)
#define INFO(FLAGS, ...) do { \ #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) { \ #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
} \
} while(0)
#ifdef ENABLE_TRACE #ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) do { \ #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
} \
} while(0)
extern std::chrono::high_resolution_clock::time_point ncclEpoch; extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#else #else
#define TRACE(...) #define TRACE(...)
#endif #endif
@ -84,17 +43,17 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
static inline void initDebug() { static inline void initDebug() {
const char* nccl_debug = getenv("NCCL_DEBUG"); const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) { if (nccl_debug == NULL) {
ncclDebugLevel = NONE; ncclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) { } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = VERSION; ncclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) { } else if (strcasecmp(nccl_debug, "WARN") == 0) {
ncclDebugLevel = WARN; ncclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) { } else if (strcasecmp(nccl_debug, "INFO") == 0) {
ncclDebugLevel = INFO; ncclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) { } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
ncclDebugLevel = ABORT; ncclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) { } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
ncclDebugLevel = TRACE; ncclDebugLevel = NCCL_LOG_TRACE;
} }
/* Parse the NCCL_DEBUG_SUBSYS env var /* Parse the NCCL_DEBUG_SUBSYS env var
@ -109,17 +68,17 @@ static inline void initDebug() {
uint64_t mask = 0; uint64_t mask = 0;
if (subsys[0] == '^') { invert = 1; subsys++; } if (subsys[0] == '^') { invert = 1; subsys++; }
if (strcasecmp(subsys, "INIT") == 0) { if (strcasecmp(subsys, "INIT") == 0) {
mask = INIT; mask = NCCL_INIT;
} else if (strcasecmp(subsys, "COLL") == 0) { } else if (strcasecmp(subsys, "COLL") == 0) {
mask = COLL; mask = NCCL_COLL;
} else if (strcasecmp(subsys, "P2P") == 0) { } else if (strcasecmp(subsys, "P2P") == 0) {
mask = P2P; mask = NCCL_P2P;
} else if (strcasecmp(subsys, "SHM") == 0) { } else if (strcasecmp(subsys, "SHM") == 0) {
mask = SHM; mask = NCCL_SHM;
} else if (strcasecmp(subsys, "NET") == 0) { } else if (strcasecmp(subsys, "NET") == 0) {
mask = NET; mask = NCCL_NET;
} else if (strcasecmp(subsys, "ALL") == 0) { } else if (strcasecmp(subsys, "ALL") == 0) {
mask = ALL; mask = NCCL_ALL;
} }
if (mask) { if (mask) {
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
@ -133,7 +92,7 @@ static inline void initDebug() {
* NCCL_DEBUG level is > VERSION * NCCL_DEBUG level is > VERSION
*/ */
const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE"); const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) { if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
int c = 0; int c = 0;
char debug_fn[PATH_MAX+1] = ""; char debug_fn[PATH_MAX+1] = "";
char *dfn = debug_fn; char *dfn = debug_fn;
@ -164,7 +123,7 @@ static inline void initDebug() {
if (debug_fn[0] != '\0') { if (debug_fn[0] != '\0') {
FILE *file = fopen(debug_fn, "w"); FILE *file = fopen(debug_fn, "w");
if (file != NULL) { if (file != NULL) {
INFO(ALL,"DEBUG file is '%s'", debug_fn); INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
ncclDebugFile = file; ncclDebugFile = file;
} }
} }

View File

@ -9,25 +9,25 @@
#include "nccl.h" #include "nccl.h"
#define NCCL_NET_MAJOR 1
#define NCCL_NET_MINOR 0
#define NCCL_NET_HANDLE_MAXSIZE 64 #define NCCL_NET_HANDLE_MAXSIZE 64
#define NCCL_PTR_HOST 0x1 #define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2 #define NCCL_PTR_CUDA 0x2
#define NCCL_MAX_SCORE 0x7 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
typedef struct { typedef struct {
// Name of the network (mainly for logs) // Name of the network (mainly for logs)
const char* name; const char* name;
// Return the number of network devices along with their scores relative to the // Initialize the network.
// current CUDA device. The per device score should be a value from 1-7 with a ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// higher score representing a better choice for performance. // Return the number of adapters.
// This call should allocate the 'scores' array using malloc(3), and it ncclResult_t (*devices)(int* ndev);
// will then be freed automatically by NCCL. // Return the device path in /sys. NCCL will call free on this path.
ncclResult_t (*devices)(int* ndev, int** scores); ncclResult_t (*pciPath)(int dev, char** path);
// Return whether this device supports host pointers and/or CUDA pointers // Return whether this device supports host pointers and/or CUDA pointers
// as data from the current GPU. Supported types should be composed with // as data from the current GPU. Supported types should be composed with
// NCCL_PTR_HOST and NCCL_PTR_CUDA. // NCCL_PTR_HOST and NCCL_PTR_CUDA.
@ -53,12 +53,10 @@ typedef struct {
ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm); ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_t; } ncclNet_v1_t;
extern typedef ncclNet_v1_t ncclNet_t;
#ifdef __cplusplus
"C" #define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
#endif
ncclNet_t* ncclNet;
#endif // end include guard #endif // end include guard

View File

@ -10,6 +10,7 @@
#include "nccl.h" #include "nccl.h"
#include "nccl_net.h" #include "nccl_net.h"
extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
/* Socket Interface Selection type */ /* Socket Interface Selection type */
@ -19,7 +20,8 @@ typedef enum { findSubnetIf = -1,
// Translation to external API // Translation to external API
static const char* ncclNetName() { return ncclNet->name; } static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; } static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
@ -32,7 +34,6 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
extern bool ncclIbSupport();
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str); extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket; extern ncclNet_t ncclNetSocket;

View File

@ -67,10 +67,10 @@ int64_t ncclParam##name() { \
errno = 0; \ errno = 0; \
int64_t v = strtoll(str, NULL, 0); \ int64_t v = strtoll(str, NULL, 0); \
if (errno) { \ if (errno) { \
INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \ INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
} else { \ } else { \
value = v; \ value = v; \
INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value); \ INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value); \
} \ } \
} \ } \
} \ } \

View File

@ -76,7 +76,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
if (family != AF_INET && family != AF_INET6) if (family != AF_INET && family != AF_INET6)
continue; continue;
TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
/* Allow the caller to force the socket family type */ /* Allow the caller to force the socket family type */
if (sock_family != -1 && family != sock_family) if (sock_family != -1 && family != sock_family)
@ -106,7 +106,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
// Store the IP address // Store the IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(addrs+found, interface->ifa_addr, salen); memcpy(addrs+found, interface->ifa_addr, salen);
INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
found++; found++;
} }
} }
@ -183,7 +183,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
// Store the interface name // Store the interface name
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
found++; found++;
if (found == maxIfs) break; if (found == maxIfs) break;
} }
@ -333,7 +333,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
#ifdef ENABLE_TRACE #ifdef ENABLE_TRACE
char line[1024]; char line[1024];
TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif #endif
/* Put the socket in listen mode */ /* Put the socket in listen mode */
@ -363,7 +363,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#ifdef ENABLE_TRACE #ifdef ENABLE_TRACE
char line[1024]; char line[1024];
TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line)); TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
#endif #endif
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED); SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
@ -381,7 +381,7 @@ static ncclResult_t socketReceive(int fd, void* ptr, int size) {
return ncclSystemError; return ncclSystemError;
} }
if (recvsize == -1) { if (recvsize == -1) {
INFO(NET,"Recv : got retcode %d, retrying", errno); INFO(NCCL_NET,"Recv : got retcode %d, retrying", errno);
continue; continue;
} }
data += recvsize; data += recvsize;
@ -397,7 +397,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
int sendsize; int sendsize;
SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize); SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
if (sendsize == -1) { if (sendsize == -1) {
INFO(NET,"Send : got retcode %d, retrying", errno); INFO(NCCL_NET,"Send : got retcode %d, retrying", errno);
continue; continue;
} }
data += sendsize; data += sendsize;

View File

@ -8,53 +8,26 @@
#define NCCL_TOPO_H_ #define NCCL_TOPO_H_
#include "nccl.h" #include "nccl.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h> #include <ctype.h>
#define MAXPATHSIZE 1024
static ncclResult_t getCudaPath(int cudaDev, char** path) { static ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[16]; char busId[16];
CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev)); CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
for (int i=0; i<16; i++) busId[i] = tolower(busId[i]); for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/device"; char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1); memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
char* cudaRpath = realpath(busPath, NULL); memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, sizeof("0000:00:00.0")-1);
char pathname[MAXPATHSIZE]; *path = realpath(busPath, NULL);
strncpy(pathname, cudaRpath, MAXPATHSIZE);
strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
free(cudaRpath);
*path = realpath(pathname, NULL);
if (*path == NULL) { if (*path == NULL) {
WARN("Could not find real path of %s", pathname); WARN("Could not find real path of %s", busPath);
return ncclSystemError; return ncclSystemError;
} }
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t getMlxPath(char* ibName, char** path) { enum ncclPathDist {
char devicepath[MAXPATHSIZE];
snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
*path = realpath(devicepath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", devicepath);
return ncclSystemError;
}
return ncclSuccess;
}
static ncclResult_t getSockPath(char* ifName, char** path) {
char devicepath[MAXPATHSIZE];
snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
*path = realpath(devicepath, NULL);
if (*path == NULL) {
INFO(NET|INIT, "Could not find real path of %s", devicepath);
return ncclSystemError;
}
return ncclSuccess;
}
enum ncclIbPathDist {
PATH_PIX = 0, PATH_PIX = 0,
PATH_PXB = 1, PATH_PXB = 1,
PATH_PHB = 2, PATH_PHB = 2,
@ -74,7 +47,7 @@ static int pciDistance(char* path1, char* path2) {
if (same == 1) score++; if (same == 1) score++;
} }
} }
if (score == 3) return PATH_SOC; if (score <= 3) return PATH_SOC;
if (score == 4) return PATH_PHB; if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX; if (score == depth-1) return PATH_PIX;
return PATH_PXB; return PATH_PXB;

View File

@ -28,9 +28,13 @@
#include <string.h> #include <string.h>
#include <errno.h> #include <errno.h>
#include <assert.h> #include <assert.h>
#include <dlfcn.h>
DebugLevel ncclDebugLevel; #define STR2(v) #v
uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT #define STR(v) STR2(v)
int ncclDebugLevel;
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
pthread_mutex_t ncclDebugOutputLock; pthread_mutex_t ncclDebugOutputLock;
FILE *ncclDebugFile = stdout; FILE *ncclDebugFile = stdout;
@ -48,7 +52,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
extern "C" __attribute__ ((visibility("default")))
ncclNet_t* ncclNet = NULL; ncclNet_t* ncclNet = NULL;
// We define this as weak to let tests redefine their own // We define this as weak to let tests redefine their own
@ -69,13 +72,53 @@ int ncclCudaFullCompCap() {
return ccMajor*10+ccMinor; return ccMajor*10+ccMinor;
} }
void initNet() { ncclResult_t initNet(ncclNet_t* net) {
if (ncclNet != NULL) { int ndev;
INFO(INIT,"Using external Network %s", ncclNetName()); NCCLCHECK(net->init(ncclDebugLog));
} else { NCCLCHECK(net->devices(&ndev));
ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket; if (ndev <= 0) {
INFO(INIT,"Using internal Network %s", ncclNetName()); INFO(NCCL_INIT, "Net/%s: call to devices() returned 0 devices.", net->name);
return ncclSystemError;
} }
return ncclSuccess;
}
/* Try to load an external network implementation from libnccl-net.so.
 *
 * On success *net is set to the plugin's ncclNet_t and the library stays loaded.
 * If the library is missing, lacks the NCCL_PLUGIN_SYMBOL symbol, or fails
 * initNet(), *net is left untouched and the library (if opened) is closed.
 * The function always returns ncclSuccess: a missing or unusable plugin is not an error.
 */
ncclResult_t initNetPlugin(ncclNet_t** net) {
  void* pluginHandle = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
  if (pluginHandle == NULL) {
    INFO(NCCL_INIT, "Unable to load libnccl-net.so : %s", dlerror());
    return ncclSuccess;
  }

  ncclNet_t* candidate = (ncclNet_t*) dlsym(pluginHandle, STR(NCCL_PLUGIN_SYMBOL));
  if (candidate == NULL) {
    INFO(NCCL_INIT, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
  } else if (initNet(candidate) == ncclSuccess) {
    // Keep the library loaded: the returned struct lives inside it.
    *net = candidate;
    return ncclSuccess;
  }

  // Symbol missing or plugin failed to initialize: unload it.
  dlclose(pluginHandle);
  return ncclSuccess;
}
ncclResult_t initNet() {
// Always initialize sockets as we use it for bootstrap
NCCLCHECK(initNet(&ncclNetSocket));
NCCLCHECK(initNetPlugin(&ncclNet));
if (ncclNet != NULL) {
INFO(NCCL_INIT, "Using external Network %s", ncclNetName());
return ncclSuccess;
}
if (initNet(&ncclNetIb) == ncclSuccess) {
ncclNet = &ncclNetIb;
} else {
ncclNet = &ncclNetSocket;
}
INFO(NCCL_INIT,"Using internal Network %s", ncclNetName());
return ncclSuccess;
} }
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
@ -171,7 +214,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm; struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1)); NCCLCHECK(ncclCalloc(&comm, 1));
INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev); INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
comm->rank = rank; comm->rank = rank;
comm->nRanks = ndev; comm->nRanks = ndev;
cudaGetDevice(&comm->cudaDev); cudaGetDevice(&comm->cudaDev);
@ -204,16 +247,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
} }
// Pre-process the string so that running "strings" on the lib can quickly reveal the version. // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
#define STR2(v) #v
#define STR(v) STR2(v)
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
static void showVersion() { static void showVersion() {
static int shown = 0; static int shown = 0;
if (shown == 0 && ncclDebugLevel >= VERSION) { if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
printf("%s\n", VERSION_STRING); printf("%s\n", VERSION_STRING);
fflush(stdout); fflush(stdout);
if (ncclDebugFile != stdout) if (ncclDebugFile != stdout)
INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
shown = 1; shown = 1;
} }
} }
@ -294,12 +335,12 @@ void dumpMatrix(int* connectMatrix, int nranks) {
line[STRLENGTH] = '\0'; line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH); memset(line, ' ', STRLENGTH);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j); for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
INFO(INIT,"%s", line); INFO(NCCL_INIT,"%s", line);
for (int i=0; i<nranks; i++) { for (int i=0; i<nranks; i++) {
memset(line, ' ', STRLENGTH); memset(line, ' ', STRLENGTH);
sprintf(line, "%3d ", i); sprintf(line, "%3d ", i);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]); for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
INFO(INIT,"%s", line); INFO(NCCL_INIT,"%s", line);
} }
} }
@ -308,12 +349,12 @@ void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
line[STRLENGTH] = '\0'; line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH); memset(line, ' ', STRLENGTH);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j); for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
INFO(INIT,"%s", line); INFO(NCCL_INIT,"%s", line);
for (int i=0; i<nranks; i++) { for (int i=0; i<nranks; i++) {
memset(line, ' ', STRLENGTH); memset(line, ' ', STRLENGTH);
sprintf(line, "%3d ", i); sprintf(line, "%3d ", i);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]); for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
INFO(INIT,"%s", line); INFO(NCCL_INIT,"%s", line);
} }
} }
@ -325,7 +366,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
memset(line, ' ', STRLENGTH); memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN); strncpy(line, prefix, PREFIXLEN);
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]); for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
INFO(INIT,"%s", line); INFO(NCCL_INIT,"%s", line);
} }
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
@ -477,7 +518,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++) for (int i=0; i<nranks; i++)
comm->nThreads = std::max(allData[i], comm->nThreads); comm->nThreads = std::max(allData[i], comm->nThreads);
if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads); if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
// Determine the minimum CUDA Compute capability of all GPUs // Determine the minimum CUDA Compute capability of all GPUs
int myCompCap = ncclCudaCompCap(); int myCompCap = ncclCudaCompCap();
@ -486,7 +527,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++) for (int i=0; i<nranks; i++)
minCompCap = std::min(allData[i], minCompCap); minCompCap = std::min(allData[i], minCompCap);
if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap); if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
// Find min nrings across ranks // Find min nrings across ranks
allData[rank] = nrings; allData[rank] = nrings;
@ -547,7 +588,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
multiNode = 1; multiNode = 1;
} }
} }
TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0); rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) { if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
@ -596,7 +637,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
INFO(INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks); INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
return ncclSuccess; return ncclSuccess;
cleanup: cleanup:
@ -615,7 +656,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NCCLCHECK(ncclInit()); NCCLCHECK(ncclInit());
if (myrank == 0) showVersion(); if (myrank == 0) showVersion();
INFO(INIT,"rank %d nranks %d", myrank, nranks); INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
// Make sure the CUDA runtime is initialized. // Make sure the CUDA runtime is initialized.
CUDACHECK(cudaFree(NULL)); CUDACHECK(cudaFree(NULL));
@ -679,8 +720,8 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
free(prev); free(prev);
free(next); free(next);
INFO(INIT,"Using %d threads", nthreads); INFO(NCCL_INIT,"Using %d threads", nthreads);
INFO(INIT,"Min Comp Cap %d", minCompCap); INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
int* rings; int* rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS)); NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
@ -733,7 +774,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECK(wrapNvmlInit()); NCCLCHECK(wrapNvmlInit());
showVersion(); showVersion();
INFO(INIT,"nranks %d", ndev); INFO(NCCL_INIT,"nranks %d", ndev);
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
if (ndev < 1) { if (ndev < 1) {
@ -793,7 +834,7 @@ cleanup:
final: final:
if(wrapNvmlShutdown() != ncclSuccess) if(wrapNvmlShutdown() != ncclSuccess)
INFO(INIT,"NCCL did not shutdown nvml properly"); INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice); cudaSetDevice(savedDevice);
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res; return res;

View File

@ -175,7 +175,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
// We can't print the CG mode before the first barrier happened. // We can't print the CG mode before the first barrier happened.
if (comm->rank == 0 && *comm->intraCGMode & 0x10) { if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
*comm->intraCGMode ^= 0x10; *comm->intraCGMode ^= 0x10;
INFO(INIT,"Launch mode %s%s%s", INFO(NCCL_INIT,"Launch mode %s%s%s",
comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
*comm->intraCGMode ? "/CGMD" : "", *comm->intraCGMode ? "/CGMD" : "",
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");

View File

@ -58,7 +58,7 @@ ncclResult_t ncclSetDevice(int cudaDev) {
#define CHECK(a) do { \ #define CHECK(a) do { \
if ((args->ret = (a)) != ncclSuccess) { \ if ((args->ret = (a)) != ncclSuccess) { \
INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
return args; \ return args; \
} \ } \
} while(0) } while(0)

View File

@ -61,7 +61,7 @@ ncclResult_t wrapNvmlSymbols(void) {
cast = (void**)&funcptr; \ cast = (void**)&funcptr; \
tmp = dlsym(handle, symbol); \ tmp = dlsym(handle, symbol); \
if (tmp == NULL) { \ if (tmp == NULL) { \
INFO(INIT,"dlsym failed on %s, ignoring", symbol); \ INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
} \ } \
*cast = tmp; \ *cast = tmp; \
} while (0) } while (0)
@ -208,7 +208,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
} }
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive); nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ", INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
nvmlInternalErrorString(ret)); nvmlInternalErrorString(ret));
return ncclSystemError; return ncclSystemError;
} }
@ -223,7 +223,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci); nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED) if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
nvmlInternalErrorString(ret)); nvmlInternalErrorString(ret));
return ncclSystemError; return ncclSystemError;
} }
@ -239,7 +239,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult); nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED) if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
nvmlInternalErrorString(ret)); nvmlInternalErrorString(ret));
return ncclSystemError; return ncclSystemError;
} }

View File

@ -5,9 +5,10 @@
************************************************************************/ ************************************************************************/
#include "core.h" #include "core.h"
#include "net.h"
#include "param.h" #include "param.h"
#define NCCL_MAX_SCORE 7
/* Parse user defined rings. Format is like : /* Parse user defined rings. Format is like :
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
* Rings with a non-matching number of ranks are ignored so we can provide * Rings with a non-matching number of ranks are ignored so we can provide
@ -188,11 +189,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (str && strlen(str)>0) { if (str && strlen(str)>0) {
int ret = parseRings(str, nrings, nranks, prev, next); int ret = parseRings(str, nrings, nranks, prev, next);
if (ret == ncclSuccess && *nrings > 0) { if (ret == ncclSuccess && *nrings > 0) {
if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings); if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
NCCLCHECK(getEnvThreads(nthreads)); NCCLCHECK(getEnvThreads(nthreads));
return ncclSuccess; return ncclSuccess;
} }
if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring"); if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
*nrings = 0; *nrings = 0;
} }
@ -333,13 +334,13 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
minNrings = MAXRINGS; minNrings = MAXRINGS;
} }
if (maxNrings > 0 && maxNrings <= *nrings) { if (maxNrings > 0 && maxNrings <= *nrings) {
if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings); if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
*nrings = maxNrings; *nrings = maxNrings;
} else { } else {
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1; int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
if (minNrings > 0 && minNrings > *nrings) { if (minNrings > 0 && minNrings > *nrings) {
if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings); if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) { for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
for (int i=0; i<nranks; i++) { for (int i=0; i<nranks; i++) {
prev[r*nranks+i] = prev[(r-*nrings)*nranks+i]; prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];

View File

@ -6,8 +6,10 @@
#include "utils.h" #include "utils.h"
#include "debug.h" #include "debug.h"
#include "nccl_net.h"
#include <unistd.h> #include <unistd.h>
#include <string.h> #include <string.h>
#include <stdarg.h>
ncclResult_t getHostName(char* hostname, int maxlen) { ncclResult_t getHostName(char* hostname, int maxlen) {
if (gethostname(hostname, maxlen) != 0) { if (gethostname(hostname, maxlen) != 0) {
@ -20,6 +22,53 @@ ncclResult_t getHostName(char* hostname, int maxlen) {
return ncclSuccess; return ncclSuccess;
} }
/* Common logging function used by the INFO, WARN and TRACE macros.
 * Also exported to the dynamically loadable Net transport modules so
 * they can share the debugging mechanisms and output files.
 *
 * level    : severity of the message; WARN, INFO and (with ENABLE_TRACE) TRACE are printed.
 * flags    : subsystem bits, tested against ncclDebugMask for INFO and TRACE messages.
 * filefunc : source location label printed on WARN, TRACE and ABORT lines.
 * line     : source line printed next to filefunc.
 * fmt, ... : printf-style message, appended after the generated prefix.
 *
 * Output goes to ncclDebugFile under ncclDebugOutputLock. Messages longer than the
 * internal 1024-byte buffer are truncated.
 */
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel <= NCCL_LOG_NONE) return;

  char hostname[1024];
  getHostName(hostname, 1024);
  // Start from a defined value so the prefix never prints an uninitialized int
  // if cudaGetDevice() fails.
  int cudaDev = -1;
  cudaGetDevice(&cudaDev);

  char buffer[1024];
  int prefixLen = 0;
  pthread_mutex_lock(&ncclDebugOutputLock);
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    prefixLen = snprintf(buffer, sizeof(buffer),
        "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
    prefixLen = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
#ifdef ENABLE_TRACE
  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    prefixLen = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
  }
#endif

  // snprintf returns the length the output would have had without truncation, or a
  // negative value on error. Clamp it so that buffer+len stays inside the buffer and
  // sizeof(buffer)-len cannot wrap around.
  size_t len = 0;
  if (prefixLen > 0) {
    len = ((size_t)prefixLen < sizeof(buffer)) ? (size_t)prefixLen : sizeof(buffer)-1;
  }

  if (len) {
    va_list vargs;
    va_start(vargs, fmt);
    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
    va_end(vargs);
    fprintf(ncclDebugFile,"%s\n", buffer);
    fflush(ncclDebugFile);
  }
  pthread_mutex_unlock(&ncclDebugOutputLock);

  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
        hostname, getpid(), gettid(), cudaDev, filefunc, line);
    abort();
  }
}
uint64_t getHash(const char* string) { uint64_t getHash(const char* string) {
// Based on DJB2, result = result * 33 + char // Based on DJB2, result = result * 33 + char
uint64_t result = 5381; uint64_t result = 5381;
@ -51,7 +100,7 @@ uint64_t getHostHash(void) {
offset += len; offset += len;
// Trailing '\0' // Trailing '\0'
uname[offset]='\0'; uname[offset]='\0';
TRACE(INIT,"unique hostname '%s'", uname); TRACE(NCCL_INIT,"unique hostname '%s'", uname);
return getHash(uname); return getHash(uname);
} }
@ -71,7 +120,7 @@ uint64_t getPidHash(void) {
if (len < 0) len = 0; if (len < 0) len = 0;
pname[plen+len]='\0'; pname[plen+len]='\0';
TRACE(INIT,"unique PID '%s'", pname); TRACE(NCCL_INIT,"unique PID '%s'", pname);
return getHash(pname); return getHash(pname);
} }

View File

@ -26,7 +26,7 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize)); NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
ring->devMemRecv = recvMem; ring->devMemRecv = recvMem;
TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize); TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
// Pre-configure send/recv pointers. Those are the default, they may change later. // Pre-configure send/recv pointers. Those are the default, they may change later.
ring->recv.conn.buff = recvMem->buff; ring->recv.conn.buff = recvMem->buff;

View File

@ -113,8 +113,8 @@ ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRoun
int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
int nsteps = nstepsPerRound * nrounds * substeps; int nsteps = nstepsPerRound * nrounds * substeps;
TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm); TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm); TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
for (int r=0; r<nrings; r++) { for (int r=0; r<nrings; r++) {
struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings); struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 }; struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
@ -159,7 +159,7 @@ ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclCo
struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send; struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy); threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
if (proxyfunc) { if (proxyfunc) {
TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm); TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
struct transportProxyInfo* info; struct transportProxyInfo* info;
NCCLCHECK(ncclCalloc(&info, 1)); NCCLCHECK(ncclCalloc(&info, 1));
connector->proxyInfo = info; connector->proxyInfo = info;

View File

@ -19,11 +19,21 @@
#define NET_BITS_PER_IF 3 #define NET_BITS_PER_IF 3
#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1) #define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t"); static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
/* Pack one score per network device into a single ncclTvalue_t.
 *
 * distances : per-device distance values (PATH_* scale), ndev entries.
 * ndev      : number of devices to encode.
 *
 * Device d occupies NET_BITS_PER_IF bits starting at bit NET_BITS_PER_IF*d.
 * The score is 1 + PATH_SOC - distances[d], so smaller distances yield larger scores.
 */
static ncclTvalue_t getTvalue(short* distances, int ndev) {
  ncclTvalue_t tvalue = 0;
  for (int d=0; d<ndev; d++) {
    int score = 1 + PATH_SOC - distances[d];
    // Keep 3 bits of score info per dev.
    // Widen to ncclTvalue_t before shifting: the shift amount grows with d and the
    // field is sized against ncclTvalue_t, so shifting a plain int could overflow.
    tvalue |= ((ncclTvalue_t)(score & NET_BITS_PER_IF_MASK))<<(NET_BITS_PER_IF*d);
  }
  return tvalue;
}
struct netInfo { struct netInfo {
int rank; int rank;
int ndev; int ndev;
short scores[NET_MAX_IFS]; ncclTvalue_t tValue;
short distances[NET_MAX_IFS];
}; };
struct netConnectInfo { struct netConnectInfo {
@ -38,7 +48,7 @@ struct netSendResources {
struct ncclRecvMem* devHostRecvMem; struct ncclRecvMem* devHostRecvMem;
struct ncclSendMem* hostDevMem; struct ncclSendMem* hostDevMem;
int netDev; int netDev;
bool cudaSupport; int useGdr;
struct ncclRecvMem* devNetMem; struct ncclRecvMem* devNetMem;
uint64_t llStep; uint64_t llStep;
uint64_t llLastCleaning; uint64_t llLastCleaning;
@ -53,7 +63,7 @@ struct netRecvResources {
struct ncclRecvMem* devHostRecvMem; struct ncclRecvMem* devHostRecvMem;
struct ncclRecvMem* hostDevMem; struct ncclRecvMem* hostDevMem;
int netDev; int netDev;
bool cudaSupport; int useGdr;
uint64_t llStep; uint64_t llStep;
uint64_t llLastCleaning; uint64_t llLastCleaning;
}; };
@ -64,26 +74,37 @@ ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
struct netInfo* info = (struct netInfo*)opaqueInfo; struct netInfo* info = (struct netInfo*)opaqueInfo;
static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large"); static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
info->rank = rank; info->rank = rank;
int *scores; NCCLCHECK(ncclNetDevices(&info->ndev));
NCCLCHECK(ncclNetDevices(&info->ndev, &scores));
if (info->ndev == 0) { if (info->ndev == 0) {
WARN("Error : Network returned 0 device"); WARN("Error : Network returned 0 device");
return ncclSystemError; return ncclSystemError;
} }
if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS; if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
for (int d=0; d<info->ndev; d++) info->scores[d] = scores[d];
free(scores); // Find distance with current GPU
int cudaDev;
cudaGetDevice(&cudaDev);
char* cudaPath;
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
char line[1024];
sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
for (int d=0; d<info->ndev; d++) {
char* nicPath;
ncclResult_t err = ncclNetPciPath(d, &nicPath);
info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
if (err == ncclSuccess) free(nicPath);
}
INFO(NCCL_INIT|NCCL_NET, "%s", line);
free(cudaPath);
return ncclSuccess; return ncclSuccess;
} }
/* Determine if we can communicate with the peer */ /* Determine if we can communicate with the peer */
ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
ret[0] = 0;
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
for (int d=0; d<myInfo->ndev; d++) { ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
// Keep 3 bits of score info per dev
ret[0] |= ((myInfo->scores[d] & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
}
return ncclSuccess; return ncclSuccess;
} }
@ -175,13 +196,13 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
return ncclSuccess; return ncclSuccess;
} }
int getDev(int ringId, int nDev, short* scores) { int getDev(int ringId, int nDev, short* distances) {
int maxScore = 0; int minDistance = PATH_SOC;
for (int d=0; d<nDev; d++) if (scores[d] > maxScore) maxScore = scores[d]; for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
int skip = ringId+1; int skip = ringId+1;
while (skip) { while (skip) {
for (int d=0; d<nDev; d++) { for (int d=0; d<nDev; d++) {
if (scores[d] == maxScore) { if (distances[d] == minDistance) {
skip--; skip--;
if (skip == 0) return d; if (skip == 0) return d;
} }
@ -191,6 +212,40 @@ int getDev(int ringId, int nDev, short* scores) {
} }
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
/* Decide whether GPU Direct RDMA should be used between the current CUDA device
 * and network device `dev`.
 *
 * dev      : network device index, passed to ncclNetPtrSupport().
 * distance : distance between the GPU and the NIC (PATH_* scale); GDR is refused
 *            when it is >= the NET_GDR_LEVEL parameter.
 * read     : non-zero when the NIC would read GPU memory (send side); this path is
 *            additionally gated by the NET_GDR_READ parameter.
 * useGdr   : out, set to 1 when GDR is enabled and 0 otherwise.
 *
 * Returns ncclSuccess whether or not GDR ends up enabled; failures only come from
 * the CUDA and network calls made along the way.
 */
static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
  *useGdr = 0;

  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));

  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
    else if (gdrReadParam < 0) { // default : enable only on DGX2
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
      int nvlinks = getNumNvlinks(busId);
      if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
    }
  }

  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = ncclParamNetGdrLevel();
  if (distance >= netGdrLevel) {
    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
    return ncclSuccess;
  }

  // Finally, check if the NIC supports it.
  // The parentheses are required: == binds tighter than &, so without them the
  // test is always false and GDR gets enabled on NICs without CUDA pointer support.
  int flags;
  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;

  *useGdr = 1;
  // Reaching this point implies distance < netGdrLevel.
  INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
  return ncclSuccess;
}
/* Determine if we will use this transport for this peer and return connect /* Determine if we will use this transport for this peer and return connect
* information for this peer */ * information for this peer */
@ -200,34 +255,11 @@ ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
ring->send.transportResources = resources; ring->send.transportResources = resources;
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores); resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
resources->cudaSupport = false; NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
// Get user's GDR READ setting
int gdrReadParam = ncclParamNetGdrRead();
// Determine whether the GPU has NVLink
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
int nvlinks = getNumNvlinks(busId);
// Enable GDR read when:
// 1) user sets it, or
// 2) we are on a NVSwitch platform (i.e. no P2P traffic over PCI-E switch) AND the GPU is Volta
bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0);
if (enableGdrRead) {
int flags;
NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
if (flags & NCCL_PTR_CUDA)
resources->cudaSupport = true;
}
if (resources->cudaSupport)
INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank);
int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize; int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
if (resources->cudaSupport) { if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size)); NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
} }
@ -243,10 +275,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
ring->recv.transportResources = resources; ring->recv.transportResources = resources;
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores); resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
int flags; NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
resources->cudaSupport = (flags & NCCL_PTR_CUDA) ? true : false;
int sendSize = sizeof(struct ncclSendMem); int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@ -255,8 +285,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo; struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
resources->cudaSupport ? "/GDRDMA" : "", resources->useGdr ? "/GDRDMA" : "",
(resources->hostDevMem != NULL) ? "/GDCopy" : ""); (resources->hostDevMem != NULL) ? "/GDCopy" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@ -267,7 +297,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
// Setup device pointers // Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources; struct netSendResources* resources = (struct netSendResources*)send->transportResources;
if (resources->cudaSupport) { if (resources->useGdr) {
send->conn.buff = resources->devNetMem->buff; send->conn.buff = resources->devNetMem->buff;
// We don't use devMem for llMode because the CPU has to read the data // We don't use devMem for llMode because the CPU has to read the data
send->conn.llBuff = resources->devHostRecvMem->llBuff; send->conn.llBuff = resources->devHostRecvMem->llBuff;
@ -299,7 +329,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
recv->conn.head = &resources->devHostSendMem->head; recv->conn.head = &resources->devHostSendMem->head;
recv->conn.llHead = &resources->devHostSendMem->llHead; recv->conn.llHead = &resources->devHostSendMem->llHead;
if (resources->cudaSupport == false) { if (resources->useGdr == 0) {
recv->conn.buff = resources->devHostRecvMem->buff; recv->conn.buff = resources->devHostRecvMem->buff;
recv->conn.llBuff = resources->devHostRecvMem->llBuff; recv->conn.llBuff = resources->devHostRecvMem->llBuff;
} }
@ -320,7 +350,7 @@ ncclResult_t netSendFree(void* transportResources) {
struct netSendResources* resources = (struct netSendResources*)transportResources; struct netSendResources* resources = (struct netSendResources*)transportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->cudaSupport) if (resources->useGdr)
CUDACHECK(cudaFree(resources->devNetMem)); CUDACHECK(cudaFree(resources->devNetMem));
NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources); free(resources);
@ -344,9 +374,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
volatile uint64_t* prevTail = &resources->hostRecvMem->tail; volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem; struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head; uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem; struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff; char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST; int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo; volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
int sliceSize = buffSize / args->substeps; int sliceSize = buffSize / args->substeps;
@ -362,8 +392,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (!args->needProxy) goto nextColl; if (!args->needProxy) goto nextColl;
TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
// Update in case we skipped some collectives // Update in case we skipped some collectives
if (llMode == 0) resources->hostRecvMem->opCount = args->opCount; if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
@ -440,10 +470,10 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
int llMode = args->llMode; int llMode = args->llMode;
volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head; volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem; struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
char* localBuff = llMode ? localMem->llBuff : localMem->buff; char* localBuff = llMode ? localMem->llBuff : localMem->buff;
char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL; char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST; int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail; uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
@ -458,8 +488,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (!args->needProxy) goto nextColl; if (!args->needProxy) goto nextColl;
TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
if (llMode == 0) { if (llMode == 0) {
// Waiting for next opCount is only needed before writing nextTail. // Waiting for next opCount is only needed before writing nextTail.

View File

@ -82,8 +82,12 @@ static void* ncclIbAsyncThreadMain(void* args) {
return NULL; return NULL;
} }
static void initDevices() { NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
if(wrap_ibv_symbols() != ncclSuccess) { return; }
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
if (ncclParamIbDisable()) return ncclInternalError;
if (ncclNIbDevs == -1) { if (ncclNIbDevs == -1) {
pthread_mutex_lock(&ncclIbLock); pthread_mutex_lock(&ncclIbLock);
wrap_ibv_fork_init(); wrap_ibv_fork_init();
@ -91,9 +95,9 @@ static void initDevices() {
ncclNIbDevs = 0; ncclNIbDevs = 0;
if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
WARN("NET/IB : No IP interface found."); WARN("NET/IB : No IP interface found.");
return; return ncclInternalError;
} }
INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
// Detect IB cards // Detect IB cards
int nIbDevs; int nIbDevs;
@ -105,7 +109,7 @@ static void initDevices() {
bool searchNot = userIbEnv && userIbEnv[0] == '^'; bool searchNot = userIbEnv && userIbEnv[0] == '^';
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return; if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
for (int d=0; d<nIbDevs; d++) { for (int d=0; d<nIbDevs; d++) {
struct ibv_context * context; struct ibv_context * context;
@ -134,7 +138,7 @@ static void initDevices() {
if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) { if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
continue; continue;
} }
INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].device = d;
ncclIbDevs[ncclNIbDevs].port = port; ncclIbDevs[ncclNIbDevs].port = port;
@ -145,38 +149,29 @@ static void initDevices() {
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
} }
if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } } if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
} }
} }
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; }; if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
} }
pthread_mutex_unlock(&ncclIbLock); pthread_mutex_unlock(&ncclIbLock);
} }
return ncclSuccess;
} }
ncclResult_t ncclIbDevices(int* ndev, int** scores) { ncclResult_t ncclIbDevices(int* ndev) {
initDevices();
*ndev = ncclNIbDevs; *ndev = ncclNIbDevs;
int cudaDev; return ncclSuccess;
cudaGetDevice(&cudaDev); }
char* cudaPath;
ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath); ncclResult_t ncclIbPciPath(int dev, char** path) {
int* sc; char devicepath[PATH_MAX];
NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs)); snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
char line[1024]; *path = realpath(devicepath, NULL);
sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev); if (*path == NULL) {
for (int d=0; d<ncclNIbDevs; d++) { WARN("Could not find real path of %s", devicepath);
char* mlxPath; return ncclSystemError;
ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath);
int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
sprintf(line+strlen(line), "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]);
sc[d] = 1+PATH_SOC-distance;
if (err2 == ncclSuccess) free(mlxPath);
} }
INFO(INIT|NET,"%s", line);
if (err1 == ncclSuccess) free(cudaPath);
*scores = sc;
return ncclSuccess; return ncclSuccess;
} }
@ -207,45 +202,21 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
return ret; return ret;
} }
NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2);
NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2);
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) { ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
initDevices();
*supportedTypes = NCCL_PTR_HOST; *supportedTypes = NCCL_PTR_HOST;
int cudaDev; int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess; CUDACHECK(cudaGetDevice(&cudaDev));
int ibGdrLevel = PATH_PHB; if (ncclIbGdrSupport(dev) != ncclSuccess) {
if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0; INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel(); return ncclSuccess;
if (ibGdrLevel > 0) {
int gdrSupport = ncclIbGdrSupport(dev);
if (gdrSupport > 0) {
INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU");
ibGdrLevel = 0;
}
}
if (ibGdrLevel <= 0) return ncclSuccess;
char* cudaPath;
if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess;
char* mlxPath;
if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; }
int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
free(mlxPath); free(cudaPath);
if (distance < ibGdrLevel) {
*supportedTypes |= NCCL_PTR_CUDA;
} else {
INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel);
} }
*supportedTypes |= NCCL_PTR_CUDA;
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t GetSocketAddr(union socketAddress* addr) { static ncclResult_t GetSocketAddr(union socketAddress* addr) {
if (ncclNIbDevs == -1) initDevices();
memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
return ncclSuccess; return ncclSuccess;
} }
@ -442,7 +413,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
*sendComm = comm; *sendComm = comm;
// IB Setup // IB Setup
initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/
ibv_context* ctx = ncclIbDevs[dev].context; ibv_context* ctx = ncclIbDevs[dev].context;
NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs)); NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
uint8_t ib_port = ncclIbDevs[dev].port; uint8_t ib_port = ncclIbDevs[dev].port;
@ -464,13 +434,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
// RoCE support // RoCE support
qpInfo.lid = portAttr.lid; qpInfo.lid = portAttr.lid;
if (qpInfo.lid) { // IB if (qpInfo.lid) { // IB
INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid); INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
} else { // RoCE } else { // RoCE
union ibv_gid gid; union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
qpInfo.spn = gid.global.subnet_prefix; qpInfo.spn = gid.global.subnet_prefix;
qpInfo.iid = gid.global.interface_id; qpInfo.iid = gid.global.interface_id;
INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
} }
NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo))); NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@ -649,7 +619,7 @@ ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct
NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
*mrRet = verbs->mrPool+elem; *mrRet = verbs->mrPool+elem;
verbs->mrPool[elem].refcnt++; verbs->mrPool[elem].refcnt++;
TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
return ncclSuccess; return ncclSuccess;
} }
@ -903,7 +873,9 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
ncclNet_t ncclNetIb = { ncclNet_t ncclNetIb = {
"IB", "IB",
ncclIbInit,
ncclIbDevices, ncclIbDevices,
ncclIbPciPath,
ncclIbPtrSupport, ncclIbPtrSupport,
ncclIbListen, ncclIbListen,
ncclIbConnect, ncclIbConnect,
@ -917,10 +889,3 @@ ncclNet_t ncclNetIb = {
ncclIbCloseListen ncclIbCloseListen
}; };
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
bool ncclIbSupport() {
if (ncclParamIbDisable()) return 0;
initDevices();
return ncclNIbDevs > 0;
}

View File

@ -8,67 +8,58 @@
#include "core.h" #include "core.h"
#include "socket.h" #include "socket.h"
#include "net.h" #include "net.h"
#include "topo.h"
#include <assert.h> #include <assert.h>
#include <pthread.h> #include <pthread.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <poll.h> #include <poll.h>
#include <limits.h>
/* Init functions */ /* Init functions */
// Names of the discovered IP interfaces; entry `dev` starts at offset dev*MAX_IF_NAME_SIZE.
static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
// Socket address of each discovered interface, indexed by device number.
static union socketAddress ncclNetIfAddrs[MAX_IFS];
// Interface count; starts at -1 and is overwritten with findInterfaces' return value.
static int ncclNetIfs = -1;
// Taken around the interface discovery so it is not run concurrently.
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
/* Initialize the socket network transport by discovering usable IP interfaces.
 *
 * Discovery runs only while ncclNetIfs is still -1; the value is re-checked
 * after taking ncclSocketLock so that a thread which waited on the lock does
 * not repeat work another thread already completed.
 *
 * logFunction : logger handed in by the caller; not referenced in this function.
 *
 * Returns ncclInternalError when the discovery performed by this call finds no
 * interface (findInterfaces returned <= 0), ncclSuccess otherwise.
 */
ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
  ncclResult_t ret = ncclSuccess;
  if (ncclNetIfs == -1) {
    pthread_mutex_lock(&ncclSocketLock);
    if (ncclNetIfs == -1) {
      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
      if (ncclNetIfs <= 0) {
        WARN("NET/Socket : no interface found");
        // Record the failure instead of returning here: an early return would
        // leave ncclSocketLock held and block every later locker.
        ret = ncclInternalError;
      }
    }
    pthread_mutex_unlock(&ncclSocketLock);
  }
  return ret;
}
ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) { ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
*supportedTypes = NCCL_PTR_HOST; *supportedTypes = NCCL_PTR_HOST;
return ncclSuccess; return ncclSuccess;
} }
static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; ncclResult_t ncclSocketDevices(int* ndev) {
static union socketAddress ncclNetIfAddrs[MAX_IFS]; *ndev = ncclNetIfs;
static int ncclNetIfs = -1; return ncclSuccess;
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
static void initDevices() {
if (ncclNetIfs == -1) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
}
}
pthread_mutex_unlock(&ncclSocketLock);
}
} }
ncclResult_t ncclSocketDevices(int* ndev, int** scores) { ncclResult_t ncclSocketPciPath(int dev, char** path) {
initDevices(); char devicepath[PATH_MAX];
*ndev = ncclNetIfs; snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
int cudaDev; *path = realpath(devicepath, NULL);
cudaGetDevice(&cudaDev); if (*path == NULL) {
char* cudaPath; INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath); return ncclSystemError;
int* sc;
NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
char line[1024];
sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
for (int i=0; i<ncclNetIfs; i++) {
char* sockPath;
ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
sc[i] = 1+PATH_SOC-distance;
if (err2 == ncclSuccess) free(sockPath);
} }
INFO(INIT|NET,"%s", line);
if (err1 == ncclSuccess) free(cudaPath);
*scores = sc;
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
if (ncclNetIfs == -1) initDevices();
if (dev >= ncclNetIfs) return ncclInternalError; if (dev >= ncclNetIfs) return ncclInternalError;
memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
return ncclSuccess; return ncclSuccess;
@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
ncclNet_t ncclNetSocket = { ncclNet_t ncclNetSocket = {
"Socket", "Socket",
ncclSocketInit,
ncclSocketDevices, ncclSocketDevices,
ncclSocketPciPath,
ncclSocketPtrSupport, ncclSocketPtrSupport,
ncclSocketListen, ncclSocketListen,
ncclSocketConnect, ncclSocketConnect,

View File

@ -85,7 +85,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
// See if CUDA can do P2P // See if CUDA can do P2P
int p2p; int p2p;
if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) { if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
INFO(INIT|P2P,"peer query failed between dev %d and dev %d", INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
myInfo->cudaDev, peerInfo->cudaDev); myInfo->cudaDev, peerInfo->cudaDev);
return ncclSuccess; return ncclSuccess;
} }
@ -454,7 +454,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
info.direct = 1; info.direct = 1;
info.directPtr = ring->devMemSend; info.directPtr = ring->devMemSend;
if (myInfo->cudaDev == peerInfo->cudaDev) { if (myInfo->cudaDev == peerInfo->cudaDev) {
INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank); INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
} else { } else {
// Enable P2P access // Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
@ -465,7 +465,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
peerInfo->cudaDev, err, cudaGetErrorString(err)); peerInfo->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError; return ncclInternalError;
} }
INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
} }
} else { } else {
@ -477,7 +477,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err)); myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError; return ncclInternalError;
} }
INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
//TRACE_DUMP_IPC(&info.devIpc); //TRACE_DUMP_IPC(&info.devIpc);
} }
@ -495,7 +495,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
info.direct = 1; info.direct = 1;
info.directPtr = ring->devMemRecv; info.directPtr = ring->devMemRecv;
if (myInfo->cudaDev == peerInfo->cudaDev) { if (myInfo->cudaDev == peerInfo->cudaDev) {
TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
} else { } else {
// Enable P2P access // Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
@ -506,7 +506,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
peerInfo->cudaDev, err, cudaGetErrorString(err)); peerInfo->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError; return ncclInternalError;
} }
TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
} }
} else { } else {
info.direct = 0; info.direct = 0;
@ -517,7 +517,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err)); myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError; return ncclInternalError;
} }
TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
//TRACE_DUMP_IPC(&info.devIpc); //TRACE_DUMP_IPC(&info.devIpc);
} }
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");

View File

@ -168,10 +168,10 @@ ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
char shmName[MAX_SHM_NAME_LEN]; char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo)); memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
@ -189,7 +189,7 @@ ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
char shmName[MAX_SHM_NAME_LEN]; char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
@ -207,7 +207,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
char shmName[MAX_SHM_NAME_LEN]; char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank); sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
resources->remShmSize = info->shmSize; resources->remShmSize = info->shmSize;
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
// Remove the file to ensure proper clean-up // Remove the file to ensure proper clean-up
NCCLCHECK(shmUnlink(shmName)); NCCLCHECK(shmUnlink(shmName));
@ -231,7 +231,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
char shmName[MAX_SHM_NAME_LEN]; char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank); sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
resources->remShmSize = info->shmSize; resources->remShmSize = info->shmSize;
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
NCCLCHECK(shmUnlink(shmName)); NCCLCHECK(shmUnlink(shmName));
recv->conn.head = &resources->devRemHostMem->head; recv->conn.head = &resources->devRemHostMem->head;