Add a debug level to print the NCCL and CUDA versions at init

This commit is contained in:
Sylvain Jeaugey 2016-06-16 16:50:14 -07:00
parent 9fcc523485
commit e51e922924
3 changed files with 31 additions and 13 deletions

View File

@ -62,23 +62,30 @@ else
endif endif
NCCL_MAJOR := 1
NCCL_MINOR := 5
NCCL_PATCH := 3
CXXFLAGS += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
.PHONY : lib clean debclean test mpitest install .PHONY : lib clean debclean test mpitest install
.DEFAULT : lib .DEFAULT : lib
INCEXPORTS := nccl.h INCEXPORTS := nccl.h
LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
LIBNAME := libnccl.so LIBNAME := libnccl.so
VER_MAJOR := 1
VER_MINOR := 2
VER_PATCH := 3
INCDIR := $(BUILDDIR)/include INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj OBJDIR := $(BUILDDIR)/obj
INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS)) INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
LIBSONAME := $(patsubst %,%.$(VER_MAJOR),$(LIBNAME)) LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
LIBTARGET := $(patsubst %,%.$(VER_MAJOR).$(VER_MINOR).$(VER_PATCH),$(LIBNAME)) LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME)) LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME))
LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES))) LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS)) DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
@ -171,10 +178,6 @@ $(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
#### PACKAGING #### #### PACKAGING ####
CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
DEB_GEN_IN := $(shell ls debian/*.in) DEB_GEN_IN := $(shell ls debian/*.in)
DEB_GEN := $(DEB_GEN_IN:.in=) DEB_GEN := $(DEB_GEN_IN:.in=)
@ -192,9 +195,9 @@ debclean :
debian/% : debian/%.in debian/% : debian/%.in
@printf "Generating %-25s > %-24s\n" $< $@ @printf "Generating %-25s > %-24s\n" $< $@
sed -e "s/\$${nccl:Major}/$(VER_MAJOR)/g" \ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(VER_MINOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(VER_PATCH)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${nccl:Debian}/$(DEB_REVISION)/g" \ -e "s/\$${nccl:Debian}/$(DEB_REVISION)/g" \

View File

@ -501,6 +501,8 @@ static void initDebug() {
const char* nccl_debug = getenv("NCCL_DEBUG"); const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) { if (nccl_debug == NULL) {
ncclDebugLevel = NONE; ncclDebugLevel = NONE;
} else if (strcmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = VERSION;
} else if (strcmp(nccl_debug, "WARN") == 0) { } else if (strcmp(nccl_debug, "WARN") == 0) {
ncclDebugLevel = WARN; ncclDebugLevel = WARN;
} else if (strcmp(nccl_debug, "INFO") == 0) { } else if (strcmp(nccl_debug, "INFO") == 0) {
@ -654,8 +656,19 @@ static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int
return shmUnlink(rankname); return shmUnlink(rankname);
} }
/*
 * Print a one-line banner with the NCCL version and the CUDA version this
 * library was compiled against.
 *
 * The banner is written to stdout (and flushed immediately) only when
 * ncclDebugLevel is VERSION or higher. A function-local static flag records
 * that it has been printed, so subsequent calls do nothing.
 *
 * The numbers come from the NCCL_MAJOR / NCCL_MINOR / NCCL_PATCH and
 * CUDA_MAJOR / CUDA_MINOR preprocessor macros, which are expected to be
 * supplied by the build.
 *
 * NOTE(review): this reads ncclDebugLevel, so it presumably has to run after
 * initDebug(); ncclCommInitRank appears to call it before any initDebug()
 * call -- confirm the level is already initialized on that path.
 * NOTE(review): `shown` is a plain static int with no synchronization; if
 * several threads can reach this concurrently the banner may print more than
 * once -- confirm whether that matters.
 */
static void showVersion() {
  static int shown = 0;
  if (shown == 0 && ncclDebugLevel >= VERSION) {
    printf("NCCL version %d.%d.%d compiled with CUDA %d.%d\n", NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, CUDA_MAJOR, CUDA_MINOR);
    /* Flush right away so the banner is not held back in the stdio buffer. */
    fflush(stdout);
    shown = 1;
  }
}
extern "C" DSOGLOBAL extern "C" DSOGLOBAL
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) { ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
if (myrank == 0) showVersion();
if (strlen(commId.internal) < 1 || if (strlen(commId.internal) < 1 ||
strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) { strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) {
WARN("rank %d invalid commId", myrank); WARN("rank %d invalid commId", myrank);
@ -735,6 +748,8 @@ extern "C" DSOGLOBAL
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) { ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
initDebug(); initDebug();
showVersion();
ncclResult_t res; ncclResult_t res;
int savedDevice; int savedDevice;
RankEntry* ranks = NULL; RankEntry* ranks = NULL;

View File

@ -110,7 +110,7 @@ struct ncclComm {
ncclNodeRef ptrs[1]; ncclNodeRef ptrs[1];
}; };
typedef enum {NONE=0, WARN=1, INFO=2, ABORT=3} DebugLevel; typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;
extern DebugLevel ncclDebugLevel; extern DebugLevel ncclDebugLevel;
#define WARN(...) do { \ #define WARN(...) do { \