Add a debug level to print the NCCL and CUDA versions at init
This commit is contained in:
parent
9fcc523485
commit
e51e922924
27
Makefile
27
Makefile
@ -62,23 +62,30 @@ else
|
||||
endif
|
||||
|
||||
|
||||
NCCL_MAJOR := 1
|
||||
NCCL_MINOR := 5
|
||||
NCCL_PATCH := 3
|
||||
CXXFLAGS += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
|
||||
|
||||
# CUDA_VERSION: taken from the first libcudart.so.* file that `ls` lists under
# $(CUDA_LIB), keeping only the last two dot-separated fields of its name (the
# rev | cut -f -2 | rev pipeline). Assigned with ?= so a value already set by
# the caller is kept and the shell probe is skipped.
CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
|
||||
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
|
||||
|
||||
.PHONY : lib clean debclean test mpitest install
|
||||
.DEFAULT : lib
|
||||
|
||||
INCEXPORTS := nccl.h
|
||||
LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
|
||||
LIBNAME := libnccl.so
|
||||
VER_MAJOR := 1
|
||||
VER_MINOR := 2
|
||||
VER_PATCH := 3
|
||||
|
||||
INCDIR := $(BUILDDIR)/include
|
||||
LIBDIR := $(BUILDDIR)/lib
|
||||
OBJDIR := $(BUILDDIR)/obj
|
||||
|
||||
INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
|
||||
LIBSONAME := $(patsubst %,%.$(VER_MAJOR),$(LIBNAME))
|
||||
LIBTARGET := $(patsubst %,%.$(VER_MAJOR).$(VER_MINOR).$(VER_PATCH),$(LIBNAME))
|
||||
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
|
||||
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
|
||||
LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME))
|
||||
LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
|
||||
DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
|
||||
@ -171,10 +178,6 @@ $(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
|
||||
|
||||
#### PACKAGING ####
|
||||
|
||||
CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
|
||||
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
|
||||
DEB_GEN_IN := $(shell ls debian/*.in)
|
||||
DEB_GEN := $(DEB_GEN_IN:.in=)
|
||||
|
||||
@ -192,9 +195,9 @@ debclean :
|
||||
|
||||
debian/% : debian/%.in
|
||||
@printf "Generating %-25s > %-24s\n" $< $@
|
||||
sed -e "s/\$${nccl:Major}/$(VER_MAJOR)/g" \
|
||||
-e "s/\$${nccl:Minor}/$(VER_MINOR)/g" \
|
||||
-e "s/\$${nccl:Patch}/$(VER_PATCH)/g" \
|
||||
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
|
||||
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
|
||||
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
|
||||
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
|
||||
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
|
||||
-e "s/\$${nccl:Debian}/$(DEB_REVISION)/g" \
|
||||
|
15
src/core.cu
15
src/core.cu
@ -501,6 +501,8 @@ static void initDebug() {
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
if (nccl_debug == NULL) {
|
||||
ncclDebugLevel = NONE;
|
||||
} else if (strcmp(nccl_debug, "VERSION") == 0) {
|
||||
ncclDebugLevel = VERSION;
|
||||
} else if (strcmp(nccl_debug, "WARN") == 0) {
|
||||
ncclDebugLevel = WARN;
|
||||
} else if (strcmp(nccl_debug, "INFO") == 0) {
|
||||
@ -654,8 +656,19 @@ static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int
|
||||
return shmUnlink(rankname);
|
||||
}
|
||||
|
||||
static void showVersion() {
|
||||
static int shown = 0;
|
||||
if (shown == 0 && ncclDebugLevel >= VERSION) {
|
||||
printf("NCCL version %d.%d.%d compiled with CUDA %d.%d\n", NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, CUDA_MAJOR, CUDA_MINOR);
|
||||
fflush(stdout); \
|
||||
shown = 1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" DSOGLOBAL
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
|
||||
if (myrank == 0) showVersion();
|
||||
|
||||
if (strlen(commId.internal) < 1 ||
|
||||
strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) {
|
||||
WARN("rank %d invalid commId", myrank);
|
||||
@ -735,6 +748,8 @@ extern "C" DSOGLOBAL
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
|
||||
initDebug();
|
||||
|
||||
showVersion();
|
||||
|
||||
ncclResult_t res;
|
||||
int savedDevice;
|
||||
RankEntry* ranks = NULL;
|
||||
|
@ -110,7 +110,7 @@ struct ncclComm {
|
||||
ncclNodeRef ptrs[1];
|
||||
};
|
||||
|
||||
typedef enum {NONE=0, WARN=1, INFO=2, ABORT=3} DebugLevel;
|
||||
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;
|
||||
extern DebugLevel ncclDebugLevel;
|
||||
|
||||
#define WARN(...) do { \
|
||||
|
Loading…
x
Reference in New Issue
Block a user