Add support for external network.
Dynamically load external network from libnccl-net.so. Add init function in networks. Move PCI scoring to net.cu, only ask transport to provide a path. Simplify CUDA PCI path detection. Add dummy external network
This commit is contained in:
parent
d7a58cfa58
commit
0d3a20f96d
17
ext-net/dummy/Makefile
Normal file
17
ext-net/dummy/Makefile
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# See LICENSE.txt for license information
|
||||||
|
#
|
||||||
|
NCCL_HOME:=../../build/
|
||||||
|
CUDA_HOME:=/usr/local/cuda
|
||||||
|
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
|
||||||
|
PLUGIN_SO:=libnccl-net.so
|
||||||
|
|
||||||
|
default: $(PLUGIN_SO)
|
||||||
|
|
||||||
|
$(PLUGIN_SO): plugin.c
|
||||||
|
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f $(PLUGIN_SO)
|
44
ext-net/dummy/plugin.c
Normal file
44
ext-net/dummy/plugin.c
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <nccl.h>
|
||||||
|
#include <nccl_net.h>
|
||||||
|
|
||||||
|
#define __hidden __attribute__ ((visibility("hidden")))
|
||||||
|
|
||||||
|
__hidden ncclResult_t pluginInit() { return ncclSuccess; }
|
||||||
|
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
|
||||||
|
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int type, void** request) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, int type, void** request) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
|
||||||
|
__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
|
||||||
|
|
||||||
|
ncclNet_t NCCL_PLUGIN_SYMBOL = {
|
||||||
|
"Dummy",
|
||||||
|
pluginInit,
|
||||||
|
pluginDevices,
|
||||||
|
pluginPciPath,
|
||||||
|
pluginPtrSupport,
|
||||||
|
pluginListen,
|
||||||
|
pluginConnect,
|
||||||
|
pluginAccept,
|
||||||
|
pluginIsend,
|
||||||
|
pluginIrecv,
|
||||||
|
pluginFlush,
|
||||||
|
pluginTest,
|
||||||
|
pluginCloseSend,
|
||||||
|
pluginCloseRecv,
|
||||||
|
pluginCloseListen
|
||||||
|
};
|
@ -8,7 +8,7 @@ include ../makefiles/common.mk
|
|||||||
include ../makefiles/version.mk
|
include ../makefiles/version.mk
|
||||||
|
|
||||||
##### src files
|
##### src files
|
||||||
INCEXPORTS := nccl.h
|
INCEXPORTS := nccl.h nccl_net.h
|
||||||
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
|
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
|
||||||
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
|
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
|
||||||
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
|
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
|
||||||
@ -80,6 +80,11 @@ $(INCDIR)/%.h : %.h
|
|||||||
mkdir -p $(INCDIR)
|
mkdir -p $(INCDIR)
|
||||||
cp -f $< $@
|
cp -f $< $@
|
||||||
|
|
||||||
|
$(INCDIR)/nccl_%.h : include/nccl_%.h
|
||||||
|
@printf "Grabbing %-35s > %s\n" $< $@
|
||||||
|
mkdir -p $(INCDIR)
|
||||||
|
cp -f $< $@
|
||||||
|
|
||||||
$(OBJDIR)/%.o : %.cu
|
$(OBJDIR)/%.o : %.cu
|
||||||
@printf "Compiling %-35s > %s\n" $< $@
|
@printf "Compiling %-35s > %s\n" $< $@
|
||||||
mkdir -p `dirname $@`
|
mkdir -p `dirname $@`
|
||||||
|
@ -217,7 +217,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
|||||||
int rank = state->rank;
|
int rank = state->rank;
|
||||||
int nranks = state->nranks;
|
int nranks = state->nranks;
|
||||||
|
|
||||||
TRACE(INIT, "rank %d nranks %d size %d", rank, nranks, size);
|
TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
|
||||||
|
|
||||||
/* Simple ring based AllGather
|
/* Simple ring based AllGather
|
||||||
* At each step i receive data from (rank-i-1) from left
|
* At each step i receive data from (rank-i-1) from left
|
||||||
@ -233,7 +233,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
|||||||
NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
|
NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
|
||||||
}
|
}
|
||||||
|
|
||||||
TRACE(INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
|
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||||
size_t nbytes = count*ncclTypeSize(datatype);
|
size_t nbytes = count*ncclTypeSize(datatype);
|
||||||
INFO(COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||||
if (comm->nRanks == 1) {
|
if (comm->nRanks == 1) {
|
||||||
if (sendbuff != recvbuff)
|
if (sendbuff != recvbuff)
|
||||||
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
|
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||||
size_t nbytes = count*ncclTypeSize(datatype);
|
size_t nbytes = count*ncclTypeSize(datatype);
|
||||||
INFO(COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||||
if (comm->nRanks == 1) {
|
if (comm->nRanks == 1) {
|
||||||
if (sendbuff != recvbuff)
|
if (sendbuff != recvbuff)
|
||||||
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
|
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||||
size_t nbytes = count*ncclTypeSize(datatype);
|
size_t nbytes = count*ncclTypeSize(datatype);
|
||||||
INFO(COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||||
if (comm->nRanks == 1) {
|
if (comm->nRanks == 1) {
|
||||||
if (sendbuff != recvbuff)
|
if (sendbuff != recvbuff)
|
||||||
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
|
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||||
size_t nbytes = count*ncclTypeSize(datatype);
|
size_t nbytes = count*ncclTypeSize(datatype);
|
||||||
INFO(COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||||
if (comm->nRanks == 1) {
|
if (comm->nRanks == 1) {
|
||||||
if (sendbuff != recvbuff)
|
if (sendbuff != recvbuff)
|
||||||
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
|
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||||
size_t nbytes = count*ncclTypeSize(datatype);
|
size_t nbytes = count*ncclTypeSize(datatype);
|
||||||
INFO(COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||||
if (comm->nRanks == 1) {
|
if (comm->nRanks == 1) {
|
||||||
if (sendbuff != recvbuff)
|
if (sendbuff != recvbuff)
|
||||||
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
|
||||||
|
@ -271,7 +271,7 @@ struct ncclComm {
|
|||||||
while (ret == -1) { \
|
while (ret == -1) { \
|
||||||
SYSCHECKVAL(call, name, ret); \
|
SYSCHECKVAL(call, name, ret); \
|
||||||
if (ret == -1) { \
|
if (ret == -1) { \
|
||||||
INFO(ALL,"Got %s, retrying", strerror(errno)); \
|
INFO(NCCL_ALL,"Got %s, retrying", strerror(errno)); \
|
||||||
}\
|
}\
|
||||||
} \
|
} \
|
||||||
} while (0);
|
} while (0);
|
||||||
@ -313,7 +313,7 @@ struct ncclComm {
|
|||||||
ncclResult_t res = call; \
|
ncclResult_t res = call; \
|
||||||
if (res != ncclSuccess) { \
|
if (res != ncclSuccess) { \
|
||||||
/* Print the back trace*/ \
|
/* Print the back trace*/ \
|
||||||
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||||
return res; \
|
return res; \
|
||||||
} \
|
} \
|
||||||
} while (0);
|
} while (0);
|
||||||
@ -322,7 +322,7 @@ struct ncclComm {
|
|||||||
res = call; \
|
res = call; \
|
||||||
if (res != ncclSuccess) { \
|
if (res != ncclSuccess) { \
|
||||||
/* Print the back trace*/ \
|
/* Print the back trace*/ \
|
||||||
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||||
goto label; \
|
goto label; \
|
||||||
} \
|
} \
|
||||||
} while (0);
|
} while (0);
|
||||||
|
@ -16,65 +16,24 @@
|
|||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
#include "nccl_net.h"
|
||||||
|
|
||||||
#define gettid() (pid_t) syscall(SYS_gettid)
|
#define gettid() (pid_t) syscall(SYS_gettid)
|
||||||
|
|
||||||
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
|
extern int ncclDebugLevel;
|
||||||
typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
|
|
||||||
extern DebugLevel ncclDebugLevel;
|
|
||||||
extern uint64_t ncclDebugMask;
|
extern uint64_t ncclDebugMask;
|
||||||
extern pthread_mutex_t ncclDebugOutputLock;
|
extern pthread_mutex_t ncclDebugOutputLock;
|
||||||
extern FILE *ncclDebugFile;
|
extern FILE *ncclDebugFile;
|
||||||
extern ncclResult_t getHostName(char* hostname, int maxlen);
|
extern ncclResult_t getHostName(char* hostname, int maxlen);
|
||||||
|
|
||||||
#define WARN(...) do { \
|
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||||
if (ncclDebugLevel >= WARN) { \
|
|
||||||
char hostname[1024]; \
|
|
||||||
getHostName(hostname, 1024); \
|
|
||||||
int cudaDev; \
|
|
||||||
cudaGetDevice(&cudaDev); \
|
|
||||||
pthread_mutex_lock(&ncclDebugOutputLock); \
|
|
||||||
fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
|
|
||||||
fprintf(ncclDebugFile,__VA_ARGS__); \
|
|
||||||
fprintf(ncclDebugFile,"\n"); \
|
|
||||||
fflush(ncclDebugFile); \
|
|
||||||
pthread_mutex_unlock(&ncclDebugOutputLock); \
|
|
||||||
if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
|
|
||||||
} \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
#define INFO(FLAGS, ...) do { \
|
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||||
if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) { \
|
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||||
char hostname[1024]; \
|
|
||||||
getHostName(hostname, 1024); \
|
|
||||||
int cudaDev; \
|
|
||||||
cudaGetDevice(&cudaDev); \
|
|
||||||
pthread_mutex_lock(&ncclDebugOutputLock); \
|
|
||||||
fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
|
|
||||||
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
|
|
||||||
fflush(ncclDebugFile); \
|
|
||||||
pthread_mutex_unlock(&ncclDebugOutputLock); \
|
|
||||||
} \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
#ifdef ENABLE_TRACE
|
#ifdef ENABLE_TRACE
|
||||||
#define TRACE(FLAGS, ...) do { \
|
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||||
if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) { \
|
|
||||||
char hostname[1024]; \
|
|
||||||
getHostName(hostname, 1024); \
|
|
||||||
int cudaDev; \
|
|
||||||
cudaGetDevice(&cudaDev); \
|
|
||||||
pthread_mutex_lock(&ncclDebugOutputLock); \
|
|
||||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
|
|
||||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
|
|
||||||
fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
|
|
||||||
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
|
|
||||||
fflush(ncclDebugFile); \
|
|
||||||
pthread_mutex_unlock(&ncclDebugOutputLock); \
|
|
||||||
} \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
|
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#define TRACE(...)
|
#define TRACE(...)
|
||||||
#endif
|
#endif
|
||||||
@ -84,17 +43,17 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
|
|||||||
static inline void initDebug() {
|
static inline void initDebug() {
|
||||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||||
if (nccl_debug == NULL) {
|
if (nccl_debug == NULL) {
|
||||||
ncclDebugLevel = NONE;
|
ncclDebugLevel = NCCL_LOG_NONE;
|
||||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||||
ncclDebugLevel = VERSION;
|
ncclDebugLevel = NCCL_LOG_VERSION;
|
||||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||||
ncclDebugLevel = WARN;
|
ncclDebugLevel = NCCL_LOG_WARN;
|
||||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||||
ncclDebugLevel = INFO;
|
ncclDebugLevel = NCCL_LOG_INFO;
|
||||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||||
ncclDebugLevel = ABORT;
|
ncclDebugLevel = NCCL_LOG_ABORT;
|
||||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||||
ncclDebugLevel = TRACE;
|
ncclDebugLevel = NCCL_LOG_TRACE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Parse the NCCL_DEBUG_SUBSYS env var
|
/* Parse the NCCL_DEBUG_SUBSYS env var
|
||||||
@ -109,17 +68,17 @@ static inline void initDebug() {
|
|||||||
uint64_t mask = 0;
|
uint64_t mask = 0;
|
||||||
if (subsys[0] == '^') { invert = 1; subsys++; }
|
if (subsys[0] == '^') { invert = 1; subsys++; }
|
||||||
if (strcasecmp(subsys, "INIT") == 0) {
|
if (strcasecmp(subsys, "INIT") == 0) {
|
||||||
mask = INIT;
|
mask = NCCL_INIT;
|
||||||
} else if (strcasecmp(subsys, "COLL") == 0) {
|
} else if (strcasecmp(subsys, "COLL") == 0) {
|
||||||
mask = COLL;
|
mask = NCCL_COLL;
|
||||||
} else if (strcasecmp(subsys, "P2P") == 0) {
|
} else if (strcasecmp(subsys, "P2P") == 0) {
|
||||||
mask = P2P;
|
mask = NCCL_P2P;
|
||||||
} else if (strcasecmp(subsys, "SHM") == 0) {
|
} else if (strcasecmp(subsys, "SHM") == 0) {
|
||||||
mask = SHM;
|
mask = NCCL_SHM;
|
||||||
} else if (strcasecmp(subsys, "NET") == 0) {
|
} else if (strcasecmp(subsys, "NET") == 0) {
|
||||||
mask = NET;
|
mask = NCCL_NET;
|
||||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||||
mask = ALL;
|
mask = NCCL_ALL;
|
||||||
}
|
}
|
||||||
if (mask) {
|
if (mask) {
|
||||||
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
|
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
|
||||||
@ -133,7 +92,7 @@ static inline void initDebug() {
|
|||||||
* NCCL_DEBUG level is > VERSION
|
* NCCL_DEBUG level is > VERSION
|
||||||
*/
|
*/
|
||||||
const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
|
const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
|
||||||
if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
|
if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
|
||||||
int c = 0;
|
int c = 0;
|
||||||
char debug_fn[PATH_MAX+1] = "";
|
char debug_fn[PATH_MAX+1] = "";
|
||||||
char *dfn = debug_fn;
|
char *dfn = debug_fn;
|
||||||
@ -164,7 +123,7 @@ static inline void initDebug() {
|
|||||||
if (debug_fn[0] != '\0') {
|
if (debug_fn[0] != '\0') {
|
||||||
FILE *file = fopen(debug_fn, "w");
|
FILE *file = fopen(debug_fn, "w");
|
||||||
if (file != NULL) {
|
if (file != NULL) {
|
||||||
INFO(ALL,"DEBUG file is '%s'", debug_fn);
|
INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
|
||||||
ncclDebugFile = file;
|
ncclDebugFile = file;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,25 +9,25 @@
|
|||||||
|
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
|
||||||
#define NCCL_NET_MAJOR 1
|
|
||||||
#define NCCL_NET_MINOR 0
|
|
||||||
|
|
||||||
#define NCCL_NET_HANDLE_MAXSIZE 64
|
#define NCCL_NET_HANDLE_MAXSIZE 64
|
||||||
|
|
||||||
#define NCCL_PTR_HOST 0x1
|
#define NCCL_PTR_HOST 0x1
|
||||||
#define NCCL_PTR_CUDA 0x2
|
#define NCCL_PTR_CUDA 0x2
|
||||||
|
|
||||||
#define NCCL_MAX_SCORE 0x7
|
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||||
|
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||||
|
|
||||||
|
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
// Name of the network (mainly for logs)
|
// Name of the network (mainly for logs)
|
||||||
const char* name;
|
const char* name;
|
||||||
// Return the number of network devices along with their scores relative to the
|
// Initialize the network.
|
||||||
// current CUDA device. The per device score should be a value from 1-7 with a
|
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||||
// higher score representing a better choice for performance.
|
// Return the number of adapters.
|
||||||
// This call should allocate the 'scores' array using malloc(3), and it
|
ncclResult_t (*devices)(int* ndev);
|
||||||
// will then be freed automatically by NCCL.
|
// Return the device path in /sys. NCCL will call free on this path.
|
||||||
ncclResult_t (*devices)(int* ndev, int** scores);
|
ncclResult_t (*pciPath)(int dev, char** path);
|
||||||
// Return whether this device supports host pointers and/or CUDA pointers
|
// Return whether this device supports host pointers and/or CUDA pointers
|
||||||
// as data from the current GPU. Supported types should be composed with
|
// as data from the current GPU. Supported types should be composed with
|
||||||
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
|
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
|
||||||
@ -53,12 +53,10 @@ typedef struct {
|
|||||||
ncclResult_t (*closeSend)(void* sendComm);
|
ncclResult_t (*closeSend)(void* sendComm);
|
||||||
ncclResult_t (*closeRecv)(void* recvComm);
|
ncclResult_t (*closeRecv)(void* recvComm);
|
||||||
ncclResult_t (*closeListen)(void* listenComm);
|
ncclResult_t (*closeListen)(void* listenComm);
|
||||||
} ncclNet_t;
|
} ncclNet_v1_t;
|
||||||
|
|
||||||
extern
|
typedef ncclNet_v1_t ncclNet_t;
|
||||||
#ifdef __cplusplus
|
|
||||||
"C"
|
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
|
||||||
#endif
|
|
||||||
ncclNet_t* ncclNet;
|
|
||||||
|
|
||||||
#endif // end include guard
|
#endif // end include guard
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
#include "nccl_net.h"
|
#include "nccl_net.h"
|
||||||
|
|
||||||
|
extern ncclNet_t* ncclNet;
|
||||||
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||||
|
|
||||||
/* Socket Interface Selection type */
|
/* Socket Interface Selection type */
|
||||||
@ -19,7 +20,8 @@ typedef enum { findSubnetIf = -1,
|
|||||||
|
|
||||||
// Translation to external API
|
// Translation to external API
|
||||||
static const char* ncclNetName() { return ncclNet->name; }
|
static const char* ncclNetName() { return ncclNet->name; }
|
||||||
static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
|
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
|
||||||
|
static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
|
||||||
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
|
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
|
||||||
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||||
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
||||||
@ -32,7 +34,6 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
|
|||||||
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||||
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
||||||
|
|
||||||
extern bool ncclIbSupport();
|
|
||||||
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
|
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
|
||||||
extern ncclNet_t ncclNetIb;
|
extern ncclNet_t ncclNetIb;
|
||||||
extern ncclNet_t ncclNetSocket;
|
extern ncclNet_t ncclNetSocket;
|
||||||
|
@ -67,10 +67,10 @@ int64_t ncclParam##name() { \
|
|||||||
errno = 0; \
|
errno = 0; \
|
||||||
int64_t v = strtoll(str, NULL, 0); \
|
int64_t v = strtoll(str, NULL, 0); \
|
||||||
if (errno) { \
|
if (errno) { \
|
||||||
INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
|
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
|
||||||
} else { \
|
} else { \
|
||||||
value = v; \
|
value = v; \
|
||||||
INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value); \
|
INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
@ -76,7 +76,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
|
|||||||
if (family != AF_INET && family != AF_INET6)
|
if (family != AF_INET && family != AF_INET6)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
|
TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
|
||||||
|
|
||||||
/* Allow the caller to force the socket family type */
|
/* Allow the caller to force the socket family type */
|
||||||
if (sock_family != -1 && family != sock_family)
|
if (sock_family != -1 && family != sock_family)
|
||||||
@ -106,7 +106,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
|
|||||||
// Store the IP address
|
// Store the IP address
|
||||||
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
|
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
|
||||||
memcpy(addrs+found, interface->ifa_addr, salen);
|
memcpy(addrs+found, interface->ifa_addr, salen);
|
||||||
INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
|
INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
|
||||||
found++;
|
found++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -183,7 +183,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
|
|||||||
// Store the interface name
|
// Store the interface name
|
||||||
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
||||||
|
|
||||||
INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
|
INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
|
||||||
found++;
|
found++;
|
||||||
if (found == maxIfs) break;
|
if (found == maxIfs) break;
|
||||||
}
|
}
|
||||||
@ -333,7 +333,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
|
|||||||
|
|
||||||
#ifdef ENABLE_TRACE
|
#ifdef ENABLE_TRACE
|
||||||
char line[1024];
|
char line[1024];
|
||||||
TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
|
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Put the socket in listen mode */
|
/* Put the socket in listen mode */
|
||||||
@ -363,7 +363,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
|||||||
|
|
||||||
#ifdef ENABLE_TRACE
|
#ifdef ENABLE_TRACE
|
||||||
char line[1024];
|
char line[1024];
|
||||||
TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
|
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
|
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
|
||||||
@ -381,7 +381,7 @@ static ncclResult_t socketReceive(int fd, void* ptr, int size) {
|
|||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
if (recvsize == -1) {
|
if (recvsize == -1) {
|
||||||
INFO(NET,"Recv : got retcode %d, retrying", errno);
|
INFO(NCCL_NET,"Recv : got retcode %d, retrying", errno);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
data += recvsize;
|
data += recvsize;
|
||||||
@ -397,7 +397,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
|
|||||||
int sendsize;
|
int sendsize;
|
||||||
SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
|
SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
|
||||||
if (sendsize == -1) {
|
if (sendsize == -1) {
|
||||||
INFO(NET,"Send : got retcode %d, retrying", errno);
|
INFO(NCCL_NET,"Send : got retcode %d, retrying", errno);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
data += sendsize;
|
data += sendsize;
|
||||||
|
@ -8,53 +8,26 @@
|
|||||||
#define NCCL_TOPO_H_
|
#define NCCL_TOPO_H_
|
||||||
|
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
#include <limits.h>
|
||||||
|
#include <stdlib.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
|
||||||
#define MAXPATHSIZE 1024
|
|
||||||
|
|
||||||
static ncclResult_t getCudaPath(int cudaDev, char** path) {
|
static ncclResult_t getCudaPath(int cudaDev, char** path) {
|
||||||
char busId[16];
|
char busId[16];
|
||||||
CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
|
CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
|
||||||
for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
|
for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
|
||||||
char busPath[] = "/sys/class/pci_bus/0000:00/device";
|
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
|
||||||
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
|
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
|
||||||
char* cudaRpath = realpath(busPath, NULL);
|
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, sizeof("0000:00:00.0")-1);
|
||||||
char pathname[MAXPATHSIZE];
|
*path = realpath(busPath, NULL);
|
||||||
strncpy(pathname, cudaRpath, MAXPATHSIZE);
|
|
||||||
strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
|
|
||||||
strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
|
|
||||||
free(cudaRpath);
|
|
||||||
*path = realpath(pathname, NULL);
|
|
||||||
if (*path == NULL) {
|
if (*path == NULL) {
|
||||||
WARN("Could not find real path of %s", pathname);
|
WARN("Could not find real path of %s", busPath);
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t getMlxPath(char* ibName, char** path) {
|
enum ncclPathDist {
|
||||||
char devicepath[MAXPATHSIZE];
|
|
||||||
snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
|
|
||||||
*path = realpath(devicepath, NULL);
|
|
||||||
if (*path == NULL) {
|
|
||||||
WARN("Could not find real path of %s", devicepath);
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
static ncclResult_t getSockPath(char* ifName, char** path) {
|
|
||||||
char devicepath[MAXPATHSIZE];
|
|
||||||
snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
|
|
||||||
*path = realpath(devicepath, NULL);
|
|
||||||
if (*path == NULL) {
|
|
||||||
INFO(NET|INIT, "Could not find real path of %s", devicepath);
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
enum ncclIbPathDist {
|
|
||||||
PATH_PIX = 0,
|
PATH_PIX = 0,
|
||||||
PATH_PXB = 1,
|
PATH_PXB = 1,
|
||||||
PATH_PHB = 2,
|
PATH_PHB = 2,
|
||||||
@ -74,7 +47,7 @@ static int pciDistance(char* path1, char* path2) {
|
|||||||
if (same == 1) score++;
|
if (same == 1) score++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (score == 3) return PATH_SOC;
|
if (score <= 3) return PATH_SOC;
|
||||||
if (score == 4) return PATH_PHB;
|
if (score == 4) return PATH_PHB;
|
||||||
if (score == depth-1) return PATH_PIX;
|
if (score == depth-1) return PATH_PIX;
|
||||||
return PATH_PXB;
|
return PATH_PXB;
|
||||||
|
97
src/init.cu
97
src/init.cu
@ -28,9 +28,13 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <dlfcn.h>
|
||||||
|
|
||||||
DebugLevel ncclDebugLevel;
|
#define STR2(v) #v
|
||||||
uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
|
#define STR(v) STR2(v)
|
||||||
|
|
||||||
|
int ncclDebugLevel;
|
||||||
|
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
|
||||||
pthread_mutex_t ncclDebugOutputLock;
|
pthread_mutex_t ncclDebugOutputLock;
|
||||||
FILE *ncclDebugFile = stdout;
|
FILE *ncclDebugFile = stdout;
|
||||||
|
|
||||||
@ -48,7 +52,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
|||||||
|
|
||||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||||
|
|
||||||
extern "C" __attribute__ ((visibility("default")))
|
|
||||||
ncclNet_t* ncclNet = NULL;
|
ncclNet_t* ncclNet = NULL;
|
||||||
|
|
||||||
// We define this as weak to let tests redefine their own
|
// We define this as weak to let tests redefine their own
|
||||||
@ -69,13 +72,53 @@ int ncclCudaFullCompCap() {
|
|||||||
return ccMajor*10+ccMinor;
|
return ccMajor*10+ccMinor;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initNet() {
|
ncclResult_t initNet(ncclNet_t* net) {
|
||||||
if (ncclNet != NULL) {
|
int ndev;
|
||||||
INFO(INIT,"Using external Network %s", ncclNetName());
|
NCCLCHECK(net->init(ncclDebugLog));
|
||||||
} else {
|
NCCLCHECK(net->devices(&ndev));
|
||||||
ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
|
if (ndev <= 0) {
|
||||||
INFO(INIT,"Using internal Network %s", ncclNetName());
|
INFO(NCCL_INIT, "Net/%s: call to devices() returned 0 devices.", net->name);
|
||||||
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t initNetPlugin(ncclNet_t** net) {
|
||||||
|
void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
|
||||||
|
if (netPluginLib == NULL) {
|
||||||
|
INFO(NCCL_INIT, "Unable to load libnccl-net.so : %s", dlerror());
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
|
||||||
|
if (extNet == NULL) {
|
||||||
|
INFO(NCCL_INIT, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
if (initNet(extNet) == ncclSuccess) {
|
||||||
|
*net = extNet;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
cleanup:
|
||||||
|
if (netPluginLib != NULL) dlclose(netPluginLib);
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t initNet() {
|
||||||
|
// Always initialize sockets as we use it for bootstrap
|
||||||
|
NCCLCHECK(initNet(&ncclNetSocket));
|
||||||
|
|
||||||
|
NCCLCHECK(initNetPlugin(&ncclNet));
|
||||||
|
if (ncclNet != NULL) {
|
||||||
|
INFO(NCCL_INIT, "Using external Network %s", ncclNetName());
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
if (initNet(&ncclNetIb) == ncclSuccess) {
|
||||||
|
ncclNet = &ncclNetIb;
|
||||||
|
} else {
|
||||||
|
ncclNet = &ncclNetSocket;
|
||||||
|
}
|
||||||
|
INFO(NCCL_INIT,"Using internal Network %s", ncclNetName());
|
||||||
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
|
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
|
||||||
@ -171,7 +214,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
struct ncclComm* comm;
|
struct ncclComm* comm;
|
||||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||||
|
|
||||||
INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
|
INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
|
||||||
comm->rank = rank;
|
comm->rank = rank;
|
||||||
comm->nRanks = ndev;
|
comm->nRanks = ndev;
|
||||||
cudaGetDevice(&comm->cudaDev);
|
cudaGetDevice(&comm->cudaDev);
|
||||||
@ -204,16 +247,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
|
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
|
||||||
#define STR2(v) #v
|
|
||||||
#define STR(v) STR2(v)
|
|
||||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
|
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
|
||||||
static void showVersion() {
|
static void showVersion() {
|
||||||
static int shown = 0;
|
static int shown = 0;
|
||||||
if (shown == 0 && ncclDebugLevel >= VERSION) {
|
if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
|
||||||
printf("%s\n", VERSION_STRING);
|
printf("%s\n", VERSION_STRING);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
if (ncclDebugFile != stdout)
|
if (ncclDebugFile != stdout)
|
||||||
INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
|
INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
|
||||||
shown = 1;
|
shown = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -294,12 +335,12 @@ void dumpMatrix(int* connectMatrix, int nranks) {
|
|||||||
line[STRLENGTH] = '\0';
|
line[STRLENGTH] = '\0';
|
||||||
memset(line, ' ', STRLENGTH);
|
memset(line, ' ', STRLENGTH);
|
||||||
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
|
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
|
||||||
INFO(INIT,"%s", line);
|
INFO(NCCL_INIT,"%s", line);
|
||||||
for (int i=0; i<nranks; i++) {
|
for (int i=0; i<nranks; i++) {
|
||||||
memset(line, ' ', STRLENGTH);
|
memset(line, ' ', STRLENGTH);
|
||||||
sprintf(line, "%3d ", i);
|
sprintf(line, "%3d ", i);
|
||||||
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
|
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
|
||||||
INFO(INIT,"%s", line);
|
INFO(NCCL_INIT,"%s", line);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -308,12 +349,12 @@ void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
|
|||||||
line[STRLENGTH] = '\0';
|
line[STRLENGTH] = '\0';
|
||||||
memset(line, ' ', STRLENGTH);
|
memset(line, ' ', STRLENGTH);
|
||||||
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
|
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
|
||||||
INFO(INIT,"%s", line);
|
INFO(NCCL_INIT,"%s", line);
|
||||||
for (int i=0; i<nranks; i++) {
|
for (int i=0; i<nranks; i++) {
|
||||||
memset(line, ' ', STRLENGTH);
|
memset(line, ' ', STRLENGTH);
|
||||||
sprintf(line, "%3d ", i);
|
sprintf(line, "%3d ", i);
|
||||||
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
|
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
|
||||||
INFO(INIT,"%s", line);
|
INFO(NCCL_INIT,"%s", line);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -325,7 +366,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
|
|||||||
memset(line, ' ', STRLENGTH);
|
memset(line, ' ', STRLENGTH);
|
||||||
strncpy(line, prefix, PREFIXLEN);
|
strncpy(line, prefix, PREFIXLEN);
|
||||||
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
|
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
|
||||||
INFO(INIT,"%s", line);
|
INFO(NCCL_INIT,"%s", line);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
|
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
|
||||||
@ -477,7 +518,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
|
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
|
||||||
for (int i=0; i<nranks; i++)
|
for (int i=0; i<nranks; i++)
|
||||||
comm->nThreads = std::max(allData[i], comm->nThreads);
|
comm->nThreads = std::max(allData[i], comm->nThreads);
|
||||||
if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
|
if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
|
||||||
|
|
||||||
// Determine the minimum CUDA Compute capability of all GPUs
|
// Determine the minimum CUDA Compute capability of all GPUs
|
||||||
int myCompCap = ncclCudaCompCap();
|
int myCompCap = ncclCudaCompCap();
|
||||||
@ -486,7 +527,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
|
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
|
||||||
for (int i=0; i<nranks; i++)
|
for (int i=0; i<nranks; i++)
|
||||||
minCompCap = std::min(allData[i], minCompCap);
|
minCompCap = std::min(allData[i], minCompCap);
|
||||||
if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
|
if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
|
||||||
|
|
||||||
// Find min nrings across ranks
|
// Find min nrings across ranks
|
||||||
allData[rank] = nrings;
|
allData[rank] = nrings;
|
||||||
@ -547,7 +588,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
multiNode = 1;
|
multiNode = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
||||||
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
|
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
|
||||||
if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
|
if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
|
||||||
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
||||||
@ -596,7 +637,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
|||||||
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||||
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
|
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
|
||||||
|
|
||||||
INFO(INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
|
INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
|
||||||
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
cleanup:
|
cleanup:
|
||||||
@ -615,7 +656,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
|
|||||||
NCCLCHECK(ncclInit());
|
NCCLCHECK(ncclInit());
|
||||||
if (myrank == 0) showVersion();
|
if (myrank == 0) showVersion();
|
||||||
|
|
||||||
INFO(INIT,"rank %d nranks %d", myrank, nranks);
|
INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
|
||||||
|
|
||||||
// Make sure the CUDA runtime is initialized.
|
// Make sure the CUDA runtime is initialized.
|
||||||
CUDACHECK(cudaFree(NULL));
|
CUDACHECK(cudaFree(NULL));
|
||||||
@ -679,8 +720,8 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
|
|||||||
free(prev);
|
free(prev);
|
||||||
free(next);
|
free(next);
|
||||||
|
|
||||||
INFO(INIT,"Using %d threads", nthreads);
|
INFO(NCCL_INIT,"Using %d threads", nthreads);
|
||||||
INFO(INIT,"Min Comp Cap %d", minCompCap);
|
INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
|
||||||
|
|
||||||
int* rings;
|
int* rings;
|
||||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
|
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
|
||||||
@ -733,7 +774,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
|||||||
NCCLCHECK(wrapNvmlInit());
|
NCCLCHECK(wrapNvmlInit());
|
||||||
showVersion();
|
showVersion();
|
||||||
|
|
||||||
INFO(INIT,"nranks %d", ndev);
|
INFO(NCCL_INIT,"nranks %d", ndev);
|
||||||
|
|
||||||
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
|
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
|
||||||
if (ndev < 1) {
|
if (ndev < 1) {
|
||||||
@ -793,7 +834,7 @@ cleanup:
|
|||||||
|
|
||||||
final:
|
final:
|
||||||
if(wrapNvmlShutdown() != ncclSuccess)
|
if(wrapNvmlShutdown() != ncclSuccess)
|
||||||
INFO(INIT,"NCCL did not shutdown nvml properly");
|
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
|
||||||
cudaSetDevice(savedDevice);
|
cudaSetDevice(savedDevice);
|
||||||
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||||
return res;
|
return res;
|
||||||
|
@ -175,7 +175,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
|||||||
// We can't print the CG mode before the first barrier happened.
|
// We can't print the CG mode before the first barrier happened.
|
||||||
if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
|
if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
|
||||||
*comm->intraCGMode ^= 0x10;
|
*comm->intraCGMode ^= 0x10;
|
||||||
INFO(INIT,"Launch mode %s%s%s",
|
INFO(NCCL_INIT,"Launch mode %s%s%s",
|
||||||
comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
|
comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
|
||||||
*comm->intraCGMode ? "/CGMD" : "",
|
*comm->intraCGMode ? "/CGMD" : "",
|
||||||
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
|
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
|
||||||
|
@ -58,7 +58,7 @@ ncclResult_t ncclSetDevice(int cudaDev) {
|
|||||||
|
|
||||||
#define CHECK(a) do { \
|
#define CHECK(a) do { \
|
||||||
if ((args->ret = (a)) != ncclSuccess) { \
|
if ((args->ret = (a)) != ncclSuccess) { \
|
||||||
INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
|
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
|
||||||
return args; \
|
return args; \
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
@ -61,7 +61,7 @@ ncclResult_t wrapNvmlSymbols(void) {
|
|||||||
cast = (void**)&funcptr; \
|
cast = (void**)&funcptr; \
|
||||||
tmp = dlsym(handle, symbol); \
|
tmp = dlsym(handle, symbol); \
|
||||||
if (tmp == NULL) { \
|
if (tmp == NULL) { \
|
||||||
INFO(INIT,"dlsym failed on %s, ignoring", symbol); \
|
INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
|
||||||
} \
|
} \
|
||||||
*cast = tmp; \
|
*cast = tmp; \
|
||||||
} while (0)
|
} while (0)
|
||||||
@ -208,7 +208,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
|
|||||||
}
|
}
|
||||||
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
|
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
|
||||||
if (ret != NVML_SUCCESS) {
|
if (ret != NVML_SUCCESS) {
|
||||||
INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
|
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
|
||||||
nvmlInternalErrorString(ret));
|
nvmlInternalErrorString(ret));
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
@ -223,7 +223,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
|
|||||||
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
|
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
|
||||||
if (ret != NVML_SUCCESS) {
|
if (ret != NVML_SUCCESS) {
|
||||||
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
||||||
INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
|
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
|
||||||
nvmlInternalErrorString(ret));
|
nvmlInternalErrorString(ret));
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
@ -239,7 +239,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
|
|||||||
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
|
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
|
||||||
if (ret != NVML_SUCCESS) {
|
if (ret != NVML_SUCCESS) {
|
||||||
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
||||||
INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
|
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
|
||||||
nvmlInternalErrorString(ret));
|
nvmlInternalErrorString(ret));
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
|
@ -5,9 +5,10 @@
|
|||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "core.h"
|
||||||
#include "net.h"
|
|
||||||
#include "param.h"
|
#include "param.h"
|
||||||
|
|
||||||
|
#define NCCL_MAX_SCORE 7
|
||||||
|
|
||||||
/* Parse user defined rings. Format is like :
|
/* Parse user defined rings. Format is like :
|
||||||
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
|
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
|
||||||
* Rings with a non-matching number of ranks are ignored so we can provide
|
* Rings with a non-matching number of ranks are ignored so we can provide
|
||||||
@ -188,11 +189,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
|
|||||||
if (str && strlen(str)>0) {
|
if (str && strlen(str)>0) {
|
||||||
int ret = parseRings(str, nrings, nranks, prev, next);
|
int ret = parseRings(str, nrings, nranks, prev, next);
|
||||||
if (ret == ncclSuccess && *nrings > 0) {
|
if (ret == ncclSuccess && *nrings > 0) {
|
||||||
if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings);
|
if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
|
||||||
NCCLCHECK(getEnvThreads(nthreads));
|
NCCLCHECK(getEnvThreads(nthreads));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring");
|
if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
|
||||||
*nrings = 0;
|
*nrings = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -333,13 +334,13 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
|
|||||||
minNrings = MAXRINGS;
|
minNrings = MAXRINGS;
|
||||||
}
|
}
|
||||||
if (maxNrings > 0 && maxNrings <= *nrings) {
|
if (maxNrings > 0 && maxNrings <= *nrings) {
|
||||||
if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings);
|
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
|
||||||
*nrings = maxNrings;
|
*nrings = maxNrings;
|
||||||
} else {
|
} else {
|
||||||
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
|
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
|
||||||
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
|
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
|
||||||
if (minNrings > 0 && minNrings > *nrings) {
|
if (minNrings > 0 && minNrings > *nrings) {
|
||||||
if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings);
|
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
|
||||||
for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
|
for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
|
||||||
for (int i=0; i<nranks; i++) {
|
for (int i=0; i<nranks; i++) {
|
||||||
prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
|
prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
|
||||||
|
@ -6,8 +6,10 @@
|
|||||||
|
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "debug.h"
|
#include "debug.h"
|
||||||
|
#include "nccl_net.h"
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
|
||||||
ncclResult_t getHostName(char* hostname, int maxlen) {
|
ncclResult_t getHostName(char* hostname, int maxlen) {
|
||||||
if (gethostname(hostname, maxlen) != 0) {
|
if (gethostname(hostname, maxlen) != 0) {
|
||||||
@ -20,6 +22,53 @@ ncclResult_t getHostName(char* hostname, int maxlen) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Common logging function used by the INFO, WARN and TRACE macros
|
||||||
|
* Also exported to the dynamically loadable Net transport modules so
|
||||||
|
* they can share the debugging mechanisms and output files
|
||||||
|
*/
|
||||||
|
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||||
|
if (ncclDebugLevel <= NCCL_LOG_NONE) return;
|
||||||
|
|
||||||
|
char hostname[1024];
|
||||||
|
getHostName(hostname, 1024);
|
||||||
|
int cudaDev;
|
||||||
|
cudaGetDevice(&cudaDev);
|
||||||
|
|
||||||
|
char buffer[1024];
|
||||||
|
size_t len = 0;
|
||||||
|
pthread_mutex_lock(&ncclDebugOutputLock);
|
||||||
|
if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
|
||||||
|
len = snprintf(buffer, sizeof(buffer),
|
||||||
|
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
|
||||||
|
else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||||
|
len = snprintf(buffer, sizeof(buffer),
|
||||||
|
"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
|
||||||
|
#ifdef ENABLE_TRACE
|
||||||
|
else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||||
|
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||||
|
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||||
|
len = snprintf(buffer, sizeof(buffer),
|
||||||
|
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (len) {
|
||||||
|
va_list vargs;
|
||||||
|
va_start(vargs, fmt);
|
||||||
|
(void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||||
|
va_end(vargs);
|
||||||
|
fprintf(ncclDebugFile,"%s\n", buffer);
|
||||||
|
fflush(ncclDebugFile);
|
||||||
|
}
|
||||||
|
pthread_mutex_unlock(&ncclDebugOutputLock);
|
||||||
|
|
||||||
|
// If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
|
||||||
|
if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
|
||||||
|
fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
|
||||||
|
hostname, getpid(), gettid(), cudaDev, filefunc, line);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t getHash(const char* string) {
|
uint64_t getHash(const char* string) {
|
||||||
// Based on DJB2, result = result * 33 + char
|
// Based on DJB2, result = result * 33 + char
|
||||||
uint64_t result = 5381;
|
uint64_t result = 5381;
|
||||||
@ -51,7 +100,7 @@ uint64_t getHostHash(void) {
|
|||||||
offset += len;
|
offset += len;
|
||||||
// Trailing '\0'
|
// Trailing '\0'
|
||||||
uname[offset]='\0';
|
uname[offset]='\0';
|
||||||
TRACE(INIT,"unique hostname '%s'", uname);
|
TRACE(NCCL_INIT,"unique hostname '%s'", uname);
|
||||||
|
|
||||||
return getHash(uname);
|
return getHash(uname);
|
||||||
}
|
}
|
||||||
@ -71,7 +120,7 @@ uint64_t getPidHash(void) {
|
|||||||
if (len < 0) len = 0;
|
if (len < 0) len = 0;
|
||||||
|
|
||||||
pname[plen+len]='\0';
|
pname[plen+len]='\0';
|
||||||
TRACE(INIT,"unique PID '%s'", pname);
|
TRACE(NCCL_INIT,"unique PID '%s'", pname);
|
||||||
|
|
||||||
return getHash(pname);
|
return getHash(pname);
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,7 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
|
|||||||
NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
|
NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
|
||||||
ring->devMemRecv = recvMem;
|
ring->devMemRecv = recvMem;
|
||||||
|
|
||||||
TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
|
TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
|
||||||
|
|
||||||
// Pre-configure send/recv pointers. Those are the default, they may change later.
|
// Pre-configure send/recv pointers. Those are the default, they may change later.
|
||||||
ring->recv.conn.buff = recvMem->buff;
|
ring->recv.conn.buff = recvMem->buff;
|
||||||
|
@ -113,8 +113,8 @@ ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRoun
|
|||||||
|
|
||||||
int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
|
int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
|
||||||
int nsteps = nstepsPerRound * nrounds * substeps;
|
int nsteps = nstepsPerRound * nrounds * substeps;
|
||||||
TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
|
TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
|
||||||
TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
|
TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
|
||||||
for (int r=0; r<nrings; r++) {
|
for (int r=0; r<nrings; r++) {
|
||||||
struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
|
struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
|
||||||
struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
|
struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
|
||||||
@ -159,7 +159,7 @@ ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclCo
|
|||||||
struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
|
struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
|
||||||
threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
|
threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
|
||||||
if (proxyfunc) {
|
if (proxyfunc) {
|
||||||
TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
|
TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
|
||||||
struct transportProxyInfo* info;
|
struct transportProxyInfo* info;
|
||||||
NCCLCHECK(ncclCalloc(&info, 1));
|
NCCLCHECK(ncclCalloc(&info, 1));
|
||||||
connector->proxyInfo = info;
|
connector->proxyInfo = info;
|
||||||
|
@ -19,11 +19,21 @@
|
|||||||
#define NET_BITS_PER_IF 3
|
#define NET_BITS_PER_IF 3
|
||||||
#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
|
#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
|
||||||
static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
|
static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
|
||||||
|
static ncclTvalue_t getTvalue(short* distances, int ndev) {
|
||||||
|
ncclTvalue_t tvalue = 0;
|
||||||
|
for (int d=0; d<ndev; d++) {
|
||||||
|
int score = 1 + PATH_SOC - distances[d];
|
||||||
|
// Keep 3 bits of score info per dev
|
||||||
|
tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
|
||||||
|
}
|
||||||
|
return tvalue;
|
||||||
|
}
|
||||||
|
|
||||||
struct netInfo {
|
struct netInfo {
|
||||||
int rank;
|
int rank;
|
||||||
int ndev;
|
int ndev;
|
||||||
short scores[NET_MAX_IFS];
|
ncclTvalue_t tValue;
|
||||||
|
short distances[NET_MAX_IFS];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct netConnectInfo {
|
struct netConnectInfo {
|
||||||
@ -38,7 +48,7 @@ struct netSendResources {
|
|||||||
struct ncclRecvMem* devHostRecvMem;
|
struct ncclRecvMem* devHostRecvMem;
|
||||||
struct ncclSendMem* hostDevMem;
|
struct ncclSendMem* hostDevMem;
|
||||||
int netDev;
|
int netDev;
|
||||||
bool cudaSupport;
|
int useGdr;
|
||||||
struct ncclRecvMem* devNetMem;
|
struct ncclRecvMem* devNetMem;
|
||||||
uint64_t llStep;
|
uint64_t llStep;
|
||||||
uint64_t llLastCleaning;
|
uint64_t llLastCleaning;
|
||||||
@ -53,7 +63,7 @@ struct netRecvResources {
|
|||||||
struct ncclRecvMem* devHostRecvMem;
|
struct ncclRecvMem* devHostRecvMem;
|
||||||
struct ncclRecvMem* hostDevMem;
|
struct ncclRecvMem* hostDevMem;
|
||||||
int netDev;
|
int netDev;
|
||||||
bool cudaSupport;
|
int useGdr;
|
||||||
uint64_t llStep;
|
uint64_t llStep;
|
||||||
uint64_t llLastCleaning;
|
uint64_t llLastCleaning;
|
||||||
};
|
};
|
||||||
@ -64,26 +74,37 @@ ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
|
|||||||
struct netInfo* info = (struct netInfo*)opaqueInfo;
|
struct netInfo* info = (struct netInfo*)opaqueInfo;
|
||||||
static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
|
static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
|
||||||
info->rank = rank;
|
info->rank = rank;
|
||||||
int *scores;
|
NCCLCHECK(ncclNetDevices(&info->ndev));
|
||||||
NCCLCHECK(ncclNetDevices(&info->ndev, &scores));
|
|
||||||
if (info->ndev == 0) {
|
if (info->ndev == 0) {
|
||||||
WARN("Error : Network returned 0 device");
|
WARN("Error : Network returned 0 device");
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
|
if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
|
||||||
for (int d=0; d<info->ndev; d++) info->scores[d] = scores[d];
|
|
||||||
free(scores);
|
// Find distance with current GPU
|
||||||
|
int cudaDev;
|
||||||
|
cudaGetDevice(&cudaDev);
|
||||||
|
char* cudaPath;
|
||||||
|
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
|
||||||
|
|
||||||
|
char line[1024];
|
||||||
|
sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
|
||||||
|
for (int d=0; d<info->ndev; d++) {
|
||||||
|
char* nicPath;
|
||||||
|
ncclResult_t err = ncclNetPciPath(d, &nicPath);
|
||||||
|
info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
|
||||||
|
sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
|
||||||
|
if (err == ncclSuccess) free(nicPath);
|
||||||
|
}
|
||||||
|
INFO(NCCL_INIT|NCCL_NET, "%s", line);
|
||||||
|
free(cudaPath);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Determine if we can communicate with the peer */
|
/* Determine if we can communicate with the peer */
|
||||||
ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
|
ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
|
||||||
ret[0] = 0;
|
|
||||||
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
|
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
|
||||||
for (int d=0; d<myInfo->ndev; d++) {
|
ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
|
||||||
// Keep 3 bits of score info per dev
|
|
||||||
ret[0] |= ((myInfo->scores[d] & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -175,13 +196,13 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
int getDev(int ringId, int nDev, short* scores) {
|
int getDev(int ringId, int nDev, short* distances) {
|
||||||
int maxScore = 0;
|
int minDistance = PATH_SOC;
|
||||||
for (int d=0; d<nDev; d++) if (scores[d] > maxScore) maxScore = scores[d];
|
for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
|
||||||
int skip = ringId+1;
|
int skip = ringId+1;
|
||||||
while (skip) {
|
while (skip) {
|
||||||
for (int d=0; d<nDev; d++) {
|
for (int d=0; d<nDev; d++) {
|
||||||
if (scores[d] == maxScore) {
|
if (distances[d] == minDistance) {
|
||||||
skip--;
|
skip--;
|
||||||
if (skip == 0) return d;
|
if (skip == 0) return d;
|
||||||
}
|
}
|
||||||
@ -191,6 +212,40 @@ int getDev(int ringId, int nDev, short* scores) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||||
|
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
|
||||||
|
|
||||||
|
static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
|
||||||
|
*useGdr = 0;
|
||||||
|
|
||||||
|
int cudaDev;
|
||||||
|
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||||
|
|
||||||
|
if (read) { // For reads (sends) only enable under certain conditions
|
||||||
|
int gdrReadParam = ncclParamNetGdrRead();
|
||||||
|
if (gdrReadParam == 0) return ncclSuccess;
|
||||||
|
else if (gdrReadParam < 0) { // default : enable only on DGX2
|
||||||
|
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||||
|
CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
|
||||||
|
int nvlinks = getNumNvlinks(busId);
|
||||||
|
if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we are close enough that it makes sense to enable GDR
|
||||||
|
int netGdrLevel = ncclParamNetGdrLevel();
|
||||||
|
if (distance >= netGdrLevel) {
|
||||||
|
INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finally, check if the NIC supports it
|
||||||
|
int flags;
|
||||||
|
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
|
||||||
|
if (flags & NCCL_PTR_CUDA == 0) return ncclSuccess;
|
||||||
|
*useGdr = 1;
|
||||||
|
INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
/* Determine if we will use this transport for this peer and return connect
|
/* Determine if we will use this transport for this peer and return connect
|
||||||
* information for this peer */
|
* information for this peer */
|
||||||
@ -200,34 +255,11 @@ ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
ring->send.transportResources = resources;
|
ring->send.transportResources = resources;
|
||||||
|
|
||||||
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
|
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
|
||||||
resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
|
resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
|
||||||
resources->cudaSupport = false;
|
NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
|
||||||
|
|
||||||
// Get user's GDR READ setting
|
|
||||||
int gdrReadParam = ncclParamNetGdrRead();
|
|
||||||
|
|
||||||
// Determine whether the GPU has NVLink
|
|
||||||
int cudaDev;
|
|
||||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
|
||||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
|
||||||
CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
|
|
||||||
int nvlinks = getNumNvlinks(busId);
|
|
||||||
|
|
||||||
// Enable GDR read when:
|
|
||||||
// 1) user sets it, or
|
|
||||||
// 2) we are on a NVSwitch platform (i.e. no P2P traffic over PCI-E switch) AND the GPU is Volta
|
|
||||||
bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0);
|
|
||||||
if (enableGdrRead) {
|
|
||||||
int flags;
|
|
||||||
NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
|
|
||||||
if (flags & NCCL_PTR_CUDA)
|
|
||||||
resources->cudaSupport = true;
|
|
||||||
}
|
|
||||||
if (resources->cudaSupport)
|
|
||||||
INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank);
|
|
||||||
|
|
||||||
int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
|
int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
|
||||||
if (resources->cudaSupport) {
|
if (resources->useGdr) {
|
||||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
|
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -243,10 +275,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
ring->recv.transportResources = resources;
|
ring->recv.transportResources = resources;
|
||||||
|
|
||||||
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
|
struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
|
||||||
resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
|
resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
|
||||||
int flags;
|
NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
|
||||||
NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
|
|
||||||
resources->cudaSupport = (flags & NCCL_PTR_CUDA) ? true : false;
|
|
||||||
|
|
||||||
int sendSize = sizeof(struct ncclSendMem);
|
int sendSize = sizeof(struct ncclSendMem);
|
||||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
|
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
|
||||||
@ -255,8 +285,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
|
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
|
||||||
|
|
||||||
struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
|
struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
|
||||||
INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
|
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
|
||||||
resources->cudaSupport ? "/GDRDMA" : "",
|
resources->useGdr ? "/GDRDMA" : "",
|
||||||
(resources->hostDevMem != NULL) ? "/GDCopy" : "");
|
(resources->hostDevMem != NULL) ? "/GDCopy" : "");
|
||||||
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
|
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
|
||||||
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
|
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
|
||||||
@ -267,7 +297,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
|||||||
// Setup device pointers
|
// Setup device pointers
|
||||||
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
|
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
|
||||||
|
|
||||||
if (resources->cudaSupport) {
|
if (resources->useGdr) {
|
||||||
send->conn.buff = resources->devNetMem->buff;
|
send->conn.buff = resources->devNetMem->buff;
|
||||||
// We don't use devMem for llMode because the CPU has to read the data
|
// We don't use devMem for llMode because the CPU has to read the data
|
||||||
send->conn.llBuff = resources->devHostRecvMem->llBuff;
|
send->conn.llBuff = resources->devHostRecvMem->llBuff;
|
||||||
@ -299,7 +329,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
|||||||
recv->conn.head = &resources->devHostSendMem->head;
|
recv->conn.head = &resources->devHostSendMem->head;
|
||||||
recv->conn.llHead = &resources->devHostSendMem->llHead;
|
recv->conn.llHead = &resources->devHostSendMem->llHead;
|
||||||
|
|
||||||
if (resources->cudaSupport == false) {
|
if (resources->useGdr == 0) {
|
||||||
recv->conn.buff = resources->devHostRecvMem->buff;
|
recv->conn.buff = resources->devHostRecvMem->buff;
|
||||||
recv->conn.llBuff = resources->devHostRecvMem->llBuff;
|
recv->conn.llBuff = resources->devHostRecvMem->llBuff;
|
||||||
}
|
}
|
||||||
@ -320,7 +350,7 @@ ncclResult_t netSendFree(void* transportResources) {
|
|||||||
struct netSendResources* resources = (struct netSendResources*)transportResources;
|
struct netSendResources* resources = (struct netSendResources*)transportResources;
|
||||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||||
if (resources->cudaSupport)
|
if (resources->useGdr)
|
||||||
CUDACHECK(cudaFree(resources->devNetMem));
|
CUDACHECK(cudaFree(resources->devNetMem));
|
||||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||||
free(resources);
|
free(resources);
|
||||||
@ -344,9 +374,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
|
volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
|
||||||
struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
|
struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
|
||||||
uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
|
uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
|
||||||
struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem;
|
struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
|
||||||
char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
|
char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
|
||||||
int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||||
volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
|
volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
|
||||||
int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
|
int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
|
||||||
int sliceSize = buffSize / args->substeps;
|
int sliceSize = buffSize / args->substeps;
|
||||||
@ -362,8 +392,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
|
|
||||||
if (!args->needProxy) goto nextColl;
|
if (!args->needProxy) goto nextColl;
|
||||||
|
|
||||||
TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
|
TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
|
||||||
TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
|
TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
|
||||||
|
|
||||||
// Update in case we skipped some collectives
|
// Update in case we skipped some collectives
|
||||||
if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
|
if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
|
||||||
@ -440,10 +470,10 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
|||||||
int llMode = args->llMode;
|
int llMode = args->llMode;
|
||||||
|
|
||||||
volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
|
volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
|
||||||
struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem;
|
struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
|
||||||
char* localBuff = llMode ? localMem->llBuff : localMem->buff;
|
char* localBuff = llMode ? localMem->llBuff : localMem->buff;
|
||||||
char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
|
char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
|
||||||
int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||||
uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
|
uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
|
||||||
|
|
||||||
int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
|
int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
|
||||||
@ -458,8 +488,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
|||||||
|
|
||||||
if (!args->needProxy) goto nextColl;
|
if (!args->needProxy) goto nextColl;
|
||||||
|
|
||||||
TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
|
TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
|
||||||
TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
|
TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
|
||||||
|
|
||||||
if (llMode == 0) {
|
if (llMode == 0) {
|
||||||
// Waiting for next opCount is only needed before writing nextTail.
|
// Waiting for next opCount is only needed before writing nextTail.
|
||||||
|
@ -82,8 +82,12 @@ static void* ncclIbAsyncThreadMain(void* args) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void initDevices() {
|
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
|
||||||
if(wrap_ibv_symbols() != ncclSuccess) { return; }
|
|
||||||
|
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||||
|
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||||
|
if (ncclParamIbDisable()) return ncclInternalError;
|
||||||
|
|
||||||
if (ncclNIbDevs == -1) {
|
if (ncclNIbDevs == -1) {
|
||||||
pthread_mutex_lock(&ncclIbLock);
|
pthread_mutex_lock(&ncclIbLock);
|
||||||
wrap_ibv_fork_init();
|
wrap_ibv_fork_init();
|
||||||
@ -91,9 +95,9 @@ static void initDevices() {
|
|||||||
ncclNIbDevs = 0;
|
ncclNIbDevs = 0;
|
||||||
if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
|
if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
|
||||||
WARN("NET/IB : No IP interface found.");
|
WARN("NET/IB : No IP interface found.");
|
||||||
return;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
|
INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
|
||||||
|
|
||||||
// Detect IB cards
|
// Detect IB cards
|
||||||
int nIbDevs;
|
int nIbDevs;
|
||||||
@ -105,7 +109,7 @@ static void initDevices() {
|
|||||||
bool searchNot = userIbEnv && userIbEnv[0] == '^';
|
bool searchNot = userIbEnv && userIbEnv[0] == '^';
|
||||||
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
|
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
|
||||||
|
|
||||||
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return;
|
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
|
||||||
|
|
||||||
for (int d=0; d<nIbDevs; d++) {
|
for (int d=0; d<nIbDevs; d++) {
|
||||||
struct ibv_context * context;
|
struct ibv_context * context;
|
||||||
@ -134,7 +138,7 @@ static void initDevices() {
|
|||||||
if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
|
if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
|
INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
|
||||||
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
|
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
|
||||||
ncclIbDevs[ncclNIbDevs].device = d;
|
ncclIbDevs[ncclNIbDevs].device = d;
|
||||||
ncclIbDevs[ncclNIbDevs].port = port;
|
ncclIbDevs[ncclNIbDevs].port = port;
|
||||||
@ -145,38 +149,29 @@ static void initDevices() {
|
|||||||
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
|
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } }
|
if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; };
|
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
|
||||||
}
|
}
|
||||||
|
|
||||||
pthread_mutex_unlock(&ncclIbLock);
|
pthread_mutex_unlock(&ncclIbLock);
|
||||||
}
|
}
|
||||||
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclIbDevices(int* ndev, int** scores) {
|
ncclResult_t ncclIbDevices(int* ndev) {
|
||||||
initDevices();
|
|
||||||
*ndev = ncclNIbDevs;
|
*ndev = ncclNIbDevs;
|
||||||
int cudaDev;
|
return ncclSuccess;
|
||||||
cudaGetDevice(&cudaDev);
|
}
|
||||||
char* cudaPath;
|
|
||||||
ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
|
ncclResult_t ncclIbPciPath(int dev, char** path) {
|
||||||
int* sc;
|
char devicepath[PATH_MAX];
|
||||||
NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs));
|
snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
|
||||||
char line[1024];
|
*path = realpath(devicepath, NULL);
|
||||||
sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev);
|
if (*path == NULL) {
|
||||||
for (int d=0; d<ncclNIbDevs; d++) {
|
WARN("Could not find real path of %s", devicepath);
|
||||||
char* mlxPath;
|
return ncclSystemError;
|
||||||
ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath);
|
|
||||||
int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
|
|
||||||
sprintf(line+strlen(line), "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]);
|
|
||||||
sc[d] = 1+PATH_SOC-distance;
|
|
||||||
if (err2 == ncclSuccess) free(mlxPath);
|
|
||||||
}
|
}
|
||||||
INFO(INIT|NET,"%s", line);
|
|
||||||
if (err1 == ncclSuccess) free(cudaPath);
|
|
||||||
*scores = sc;
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -207,45 +202,21 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2);
|
|
||||||
NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2);
|
|
||||||
|
|
||||||
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
|
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
|
||||||
initDevices();
|
|
||||||
*supportedTypes = NCCL_PTR_HOST;
|
*supportedTypes = NCCL_PTR_HOST;
|
||||||
|
|
||||||
int cudaDev;
|
int cudaDev;
|
||||||
if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess;
|
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||||
|
|
||||||
int ibGdrLevel = PATH_PHB;
|
if (ncclIbGdrSupport(dev) != ncclSuccess) {
|
||||||
if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0;
|
INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
|
||||||
if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel();
|
return ncclSuccess;
|
||||||
if (ibGdrLevel > 0) {
|
|
||||||
int gdrSupport = ncclIbGdrSupport(dev);
|
|
||||||
if (gdrSupport > 0) {
|
|
||||||
INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU");
|
|
||||||
ibGdrLevel = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ibGdrLevel <= 0) return ncclSuccess;
|
|
||||||
|
|
||||||
char* cudaPath;
|
|
||||||
if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess;
|
|
||||||
char* mlxPath;
|
|
||||||
if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; }
|
|
||||||
int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
|
|
||||||
free(mlxPath); free(cudaPath);
|
|
||||||
if (distance < ibGdrLevel) {
|
|
||||||
*supportedTypes |= NCCL_PTR_CUDA;
|
|
||||||
} else {
|
|
||||||
INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel);
|
|
||||||
}
|
}
|
||||||
|
*supportedTypes |= NCCL_PTR_CUDA;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t GetSocketAddr(union socketAddress* addr) {
|
static ncclResult_t GetSocketAddr(union socketAddress* addr) {
|
||||||
if (ncclNIbDevs == -1) initDevices();
|
|
||||||
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
|
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -442,7 +413,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
|
|||||||
*sendComm = comm;
|
*sendComm = comm;
|
||||||
|
|
||||||
// IB Setup
|
// IB Setup
|
||||||
initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/
|
|
||||||
ibv_context* ctx = ncclIbDevs[dev].context;
|
ibv_context* ctx = ncclIbDevs[dev].context;
|
||||||
NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
|
NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
|
||||||
uint8_t ib_port = ncclIbDevs[dev].port;
|
uint8_t ib_port = ncclIbDevs[dev].port;
|
||||||
@ -464,13 +434,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
|
|||||||
// RoCE support
|
// RoCE support
|
||||||
qpInfo.lid = portAttr.lid;
|
qpInfo.lid = portAttr.lid;
|
||||||
if (qpInfo.lid) { // IB
|
if (qpInfo.lid) { // IB
|
||||||
INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
|
INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
|
||||||
} else { // RoCE
|
} else { // RoCE
|
||||||
union ibv_gid gid;
|
union ibv_gid gid;
|
||||||
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
|
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
|
||||||
qpInfo.spn = gid.global.subnet_prefix;
|
qpInfo.spn = gid.global.subnet_prefix;
|
||||||
qpInfo.iid = gid.global.interface_id;
|
qpInfo.iid = gid.global.interface_id;
|
||||||
INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
|
INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
|
||||||
}
|
}
|
||||||
|
|
||||||
NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
|
NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
|
||||||
@ -649,7 +619,7 @@ ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct
|
|||||||
NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
|
NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
|
||||||
*mrRet = verbs->mrPool+elem;
|
*mrRet = verbs->mrPool+elem;
|
||||||
verbs->mrPool[elem].refcnt++;
|
verbs->mrPool[elem].refcnt++;
|
||||||
TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
|
TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -903,7 +873,9 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
|
|||||||
|
|
||||||
ncclNet_t ncclNetIb = {
|
ncclNet_t ncclNetIb = {
|
||||||
"IB",
|
"IB",
|
||||||
|
ncclIbInit,
|
||||||
ncclIbDevices,
|
ncclIbDevices,
|
||||||
|
ncclIbPciPath,
|
||||||
ncclIbPtrSupport,
|
ncclIbPtrSupport,
|
||||||
ncclIbListen,
|
ncclIbListen,
|
||||||
ncclIbConnect,
|
ncclIbConnect,
|
||||||
@ -917,10 +889,3 @@ ncclNet_t ncclNetIb = {
|
|||||||
ncclIbCloseListen
|
ncclIbCloseListen
|
||||||
};
|
};
|
||||||
|
|
||||||
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
|
|
||||||
|
|
||||||
bool ncclIbSupport() {
|
|
||||||
if (ncclParamIbDisable()) return 0;
|
|
||||||
initDevices();
|
|
||||||
return ncclNIbDevs > 0;
|
|
||||||
}
|
|
||||||
|
@ -8,67 +8,58 @@
|
|||||||
#include "core.h"
|
#include "core.h"
|
||||||
#include "socket.h"
|
#include "socket.h"
|
||||||
#include "net.h"
|
#include "net.h"
|
||||||
#include "topo.h"
|
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <poll.h>
|
#include <poll.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
/* Init functions */
|
/* Init functions */
|
||||||
|
static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
|
||||||
|
static union socketAddress ncclNetIfAddrs[MAX_IFS];
|
||||||
|
static int ncclNetIfs = -1;
|
||||||
|
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
|
||||||
|
if (ncclNetIfs == -1) {
|
||||||
|
pthread_mutex_lock(&ncclSocketLock);
|
||||||
|
if (ncclNetIfs == -1) {
|
||||||
|
ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
|
||||||
|
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
|
||||||
|
if (ncclNetIfs <= 0) {
|
||||||
|
WARN("NET/Socket : no interface found");
|
||||||
|
return ncclInternalError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pthread_mutex_unlock(&ncclSocketLock);
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
|
ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
|
||||||
*supportedTypes = NCCL_PTR_HOST;
|
*supportedTypes = NCCL_PTR_HOST;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
|
ncclResult_t ncclSocketDevices(int* ndev) {
|
||||||
static union socketAddress ncclNetIfAddrs[MAX_IFS];
|
*ndev = ncclNetIfs;
|
||||||
static int ncclNetIfs = -1;
|
return ncclSuccess;
|
||||||
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
|
|
||||||
|
|
||||||
static void initDevices() {
|
|
||||||
if (ncclNetIfs == -1) {
|
|
||||||
pthread_mutex_lock(&ncclSocketLock);
|
|
||||||
if (ncclNetIfs == -1) {
|
|
||||||
ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
|
|
||||||
INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
|
|
||||||
if (ncclNetIfs <= 0) {
|
|
||||||
WARN("NET/Socket : no interface found");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pthread_mutex_unlock(&ncclSocketLock);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketDevices(int* ndev, int** scores) {
|
ncclResult_t ncclSocketPciPath(int dev, char** path) {
|
||||||
initDevices();
|
char devicepath[PATH_MAX];
|
||||||
*ndev = ncclNetIfs;
|
snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
|
||||||
int cudaDev;
|
*path = realpath(devicepath, NULL);
|
||||||
cudaGetDevice(&cudaDev);
|
if (*path == NULL) {
|
||||||
char* cudaPath;
|
INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
|
||||||
ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
|
return ncclSystemError;
|
||||||
int* sc;
|
|
||||||
NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
|
|
||||||
char line[1024];
|
|
||||||
sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
|
|
||||||
for (int i=0; i<ncclNetIfs; i++) {
|
|
||||||
char* sockPath;
|
|
||||||
ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
|
|
||||||
int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
|
|
||||||
sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
|
|
||||||
sc[i] = 1+PATH_SOC-distance;
|
|
||||||
if (err2 == ncclSuccess) free(sockPath);
|
|
||||||
}
|
}
|
||||||
INFO(INIT|NET,"%s", line);
|
|
||||||
if (err1 == ncclSuccess) free(cudaPath);
|
|
||||||
*scores = sc;
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
|
static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
|
||||||
if (ncclNetIfs == -1) initDevices();
|
|
||||||
if (dev >= ncclNetIfs) return ncclInternalError;
|
if (dev >= ncclNetIfs) return ncclInternalError;
|
||||||
memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
|
memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
|
|||||||
|
|
||||||
ncclNet_t ncclNetSocket = {
|
ncclNet_t ncclNetSocket = {
|
||||||
"Socket",
|
"Socket",
|
||||||
|
ncclSocketInit,
|
||||||
ncclSocketDevices,
|
ncclSocketDevices,
|
||||||
|
ncclSocketPciPath,
|
||||||
ncclSocketPtrSupport,
|
ncclSocketPtrSupport,
|
||||||
ncclSocketListen,
|
ncclSocketListen,
|
||||||
ncclSocketConnect,
|
ncclSocketConnect,
|
||||||
|
@ -85,7 +85,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
|
|||||||
// See if CUDA can do P2P
|
// See if CUDA can do P2P
|
||||||
int p2p;
|
int p2p;
|
||||||
if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
|
if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
|
||||||
INFO(INIT|P2P,"peer query failed between dev %d and dev %d",
|
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
|
||||||
myInfo->cudaDev, peerInfo->cudaDev);
|
myInfo->cudaDev, peerInfo->cudaDev);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -454,7 +454,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
info.direct = 1;
|
info.direct = 1;
|
||||||
info.directPtr = ring->devMemSend;
|
info.directPtr = ring->devMemSend;
|
||||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||||
INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
|
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
|
||||||
} else {
|
} else {
|
||||||
// Enable P2P access
|
// Enable P2P access
|
||||||
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
||||||
@ -465,7 +465,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
peerInfo->cudaDev, err, cudaGetErrorString(err));
|
peerInfo->cudaDev, err, cudaGetErrorString(err));
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
|
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
|
||||||
ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -477,7 +477,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
|
myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
|
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
|
||||||
ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||||
//TRACE_DUMP_IPC(&info.devIpc);
|
//TRACE_DUMP_IPC(&info.devIpc);
|
||||||
}
|
}
|
||||||
@ -495,7 +495,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
info.direct = 1;
|
info.direct = 1;
|
||||||
info.directPtr = ring->devMemRecv;
|
info.directPtr = ring->devMemRecv;
|
||||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||||
TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
|
TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
|
||||||
} else {
|
} else {
|
||||||
// Enable P2P access
|
// Enable P2P access
|
||||||
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
||||||
@ -506,7 +506,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
peerInfo->cudaDev, err, cudaGetErrorString(err));
|
peerInfo->cudaDev, err, cudaGetErrorString(err));
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
info.direct = 0;
|
info.direct = 0;
|
||||||
@ -517,7 +517,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
|
myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||||
//TRACE_DUMP_IPC(&info.devIpc);
|
//TRACE_DUMP_IPC(&info.devIpc);
|
||||||
}
|
}
|
||||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||||
|
@ -168,10 +168,10 @@ ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
char shmName[MAX_SHM_NAME_LEN];
|
char shmName[MAX_SHM_NAME_LEN];
|
||||||
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
|
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
|
||||||
info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
|
info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
|
||||||
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
||||||
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||||
|
|
||||||
INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||||
info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
|
info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
|
||||||
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
|
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
|
||||||
memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
|
memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
|
||||||
@ -189,7 +189,7 @@ ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
|
|||||||
char shmName[MAX_SHM_NAME_LEN];
|
char shmName[MAX_SHM_NAME_LEN];
|
||||||
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
|
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
|
||||||
info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
|
info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
|
||||||
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
||||||
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||||
|
|
||||||
info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
|
info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
|
||||||
@ -207,7 +207,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
|||||||
char shmName[MAX_SHM_NAME_LEN];
|
char shmName[MAX_SHM_NAME_LEN];
|
||||||
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
|
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
|
||||||
resources->remShmSize = info->shmSize;
|
resources->remShmSize = info->shmSize;
|
||||||
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
|
||||||
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
||||||
// Remove the file to ensure proper clean-up
|
// Remove the file to ensure proper clean-up
|
||||||
NCCLCHECK(shmUnlink(shmName));
|
NCCLCHECK(shmUnlink(shmName));
|
||||||
@ -231,7 +231,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
|||||||
char shmName[MAX_SHM_NAME_LEN];
|
char shmName[MAX_SHM_NAME_LEN];
|
||||||
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
|
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
|
||||||
resources->remShmSize = info->shmSize;
|
resources->remShmSize = info->shmSize;
|
||||||
TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
|
||||||
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
||||||
NCCLCHECK(shmUnlink(shmName));
|
NCCLCHECK(shmUnlink(shmName));
|
||||||
recv->conn.head = &resources->devRemHostMem->head;
|
recv->conn.head = &resources->devRemHostMem->head;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user