diff --git a/Makefile b/Makefile index 5f93ecb..9afde64 100644 --- a/Makefile +++ b/Makefile @@ -78,14 +78,14 @@ TESTBINS := $(patsubst %, $(TSTDIR)/%, $(TESTS)) MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS)) DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS)) -lib : $(INCTARGETS) $(LIBTARGET) +lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) -include $(DEPFILES) -$(LIBTARGET) : $(LIBOBJ) +$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ) @printf "Linking %-25s\n" $@ @mkdir -p $(LIBDIR) - @$(GPP) $(CPPFLAGS) $(CXXFLAGS) -shared -Wl,-soname,$(LIBSONAME) -o $(LIBDIR)/$@ $(LDFLAGS) $(LIBOBJ) + @$(GPP) $(CPPFLAGS) $(CXXFLAGS) -shared -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ) @ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) @ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) @@ -109,7 +109,7 @@ clean : test : lib $(TESTBINS) -$(TSTDIR)/% : test/single/%.cu lib +$(TSTDIR)/% : test/single/%.cu $(LIBDIR)/$(LIBTARGET) @printf "Building %-25s > %-24s\n" $< $@ @mkdir -p $(TSTDIR) @$(NVCC) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt @@ -121,11 +121,11 @@ $(TSTDIR)/% : test/single/%.cu lib mpitest : lib $(MPITESTBINS) -$(MPITSTDIR)/% : test/mpi/%.cu lib +$(MPITSTDIR)/% : test/mpi/%.cu $(LIBDIR)/$(LIBTARGET) @printf "Building %-25s > %-24s\n" $< $@ @mkdir -p $(MPITSTDIR) - @$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) - @$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp) + @$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcurand + @$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcurand > $(@:%=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d) diff --git a/test/include/test_utilities.h b/test/include/test_utilities.h index c929a9e..fb34d19 100644 --- a/test/include/test_utilities.h +++ b/test/include/test_utilities.h @@ -32,14 +32,23 @@ #include -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(false) +#define CUDACHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + printf("Cuda failure %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t r = cmd; \ + if (r!= ncclSuccess) { \ + printf("NCCL failure %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) template void Randomize(T* const dest, const int N, const int randomSeed); diff --git a/test/mpi/mpi_test.cu b/test/mpi/mpi_test.cu index 87465e5..54ebbce 100644 --- a/test/mpi/mpi_test.cu +++ b/test/mpi/mpi_test.cu @@ -32,15 +32,7 @@ #include "nccl.h" #include "mpi.h" - -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(false) +#include "test_utilities.h" #define SIZE 128 #define NITERS 1 @@ -48,7 +40,7 @@ int main(int argc, char *argv[]) { ncclUniqueId commId; int size, rank; - int ret; + ncclResult_t ret; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); @@ -66,11 +58,11 @@ int main(int argc, char *argv[]) { // NCCL Communicator creation ncclComm_t comm; - ncclGetUniqueId(&commId); + NCCLCHECK(ncclGetUniqueId(&commId)); MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD); ret = ncclCommInitRank(&comm, size, commId, rank); if (ret != ncclSuccess) { - printf("NCCL Init failed : %d\n", ret); + printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret)); exit(1); } @@ -93,7 +85,7 @@ int main(int argc, char *argv[]) { // Run allreduce int errors = 0; for (int i=0; i& dList) { // initialize data int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); T* buffer = (T*)malloc(nDev * N * sizeof(T)); T* result = (T*)malloc(nDev * N * sizeof(T)); @@ -61,8 +61,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, // warm up GPU for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type, - (void*)recvbuff[i], comms[i], s[i]); + NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type, + (void*)recvbuff[i], comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -79,8 +79,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i], - s[i]); + NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i], + s[i])); } for (int i = 0; i < nDev; ++i) { @@ -121,7 +121,7 @@ template void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, const std::vector& dList) { int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); T** sendbuff = (T**)malloc(nDev * sizeof(T*)); T** recvbuff = (T**)malloc(nDev * sizeof(T*)); @@ -199,15 +199,15 @@ int main(int argc, char* argv[]) { } ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - ncclCommInitAll(comms, nDev, dList.data()); + NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); printf("# Using devices\n"); for (int g=0; g void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms, const std::vector& dList) { int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); T** sendbuff = (T**)malloc(nDev * sizeof(T*)); T** recvbuff = (T**)malloc(nDev * sizeof(T*)); @@ -256,7 +256,7 @@ int main(int argc, char* argv[]) { } ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - ncclCommInitAll(comms, nDev, dList.data()); + NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); if (!csv) { printf("# Using devices\n"); @@ -264,8 +264,8 @@ int main(int argc, char* argv[]) { int cudaDev; int rank; cudaDeviceProp prop; - ncclCommCuDevice(comms[g], &cudaDev); - ncclCommUserRank(comms[g], &rank); + NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); + NCCLCHECK(ncclCommUserRank(comms[g], &rank)); CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, prop.pciBusID, prop.name); diff --git a/test/single/broadcast_test.cu b/test/single/broadcast_test.cu index 9c85a1f..9801d04 100644 --- a/test/single/broadcast_test.cu +++ b/test/single/broadcast_test.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -41,7 +41,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root, ncclComm_t* const comms, const std::vector& dList) { // initialize data int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); T* buffer = (T*)malloc(N * sizeof(T)); T* result = (T*)malloc(N * sizeof(T)); @@ -65,7 +65,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root, // warm up GPU for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]); + NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -83,7 +83,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root, for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]); + NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -123,7 +123,7 @@ template void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, const std::vector& dList) { int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); T** buff = (T**)malloc(nDev * sizeof(T*)); for (int i = 0; i < nDev; ++i) { @@ -199,15 +199,15 @@ int main(int argc, char* argv[]) { } ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);; - ncclCommInitAll(comms, nDev, dList.data()); + NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); printf("# Using devices\n"); for (int g = 0; g < nDev; ++g) { int cudaDev; int rank; cudaDeviceProp prop; - ncclCommCuDevice(comms[g], &cudaDev); - ncclCommUserRank(comms[g], &rank); + NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); + NCCLCHECK(ncclCommUserRank(comms[g], &rank)); CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, prop.pciBusID, prop.name); diff --git a/test/single/reduce_scatter_test.cu b/test/single/reduce_scatter_test.cu index da205d5..4fc3292 100644 --- a/test/single/reduce_scatter_test.cu +++ b/test/single/reduce_scatter_test.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, const ncclRedOp_t op, ncclComm_t* const comms, const std::vector& dList) { // initialize data int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); T* buffer = (T*)malloc(N * nDev * sizeof(T)); @@ -66,8 +66,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, // warm up GPU for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], - std::min(N, 1024 * 1024), type, op, comms[i], s[i]); + NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], + std::min(N, 1024 * 1024), type, op, comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -86,8 +86,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type, - op, comms[i], s[i]); + NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type, + op, comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -122,8 +122,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type, - op, comms[i], s[i]); + NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type, + op, comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -163,7 +163,7 @@ template void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, const std::vector& dList) { int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); T** sendbuff = (T**)malloc(nDev * sizeof(T*)); T** recvbuff = (T**)malloc(nDev * sizeof(T*)); @@ -243,15 +243,15 @@ int main(int argc, char* argv[]) { } ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - ncclCommInitAll(comms, nDev, dList.data()); + NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); printf("# Using devices\n"); for (int g = 0; g < nDev; ++g) { int cudaDev; int rank; cudaDeviceProp prop; - ncclCommCuDevice(comms[g], &cudaDev); - ncclCommUserRank(comms[g], &rank); + NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); + NCCLCHECK(ncclCommUserRank(comms[g], &rank)); CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, prop.pciBusID, prop.name); diff --git a/test/single/reduce_test.cu b/test/single/reduce_test.cu index 42b1e9b..9500c18 100644 --- a/test/single/reduce_test.cu +++ b/test/single/reduce_test.cu @@ -50,7 +50,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, memset(result, 0, N * sizeof(T)); int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); for (int i = 0; i < nDev; ++i) { @@ -68,8 +68,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, // warm up GPU for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), - type, op, root, comms[i], s[i]); + NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), + type, op, root, comms[i], s[i])); } for (int i = 0; i < nDev; ++i) { @@ -90,8 +90,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, //for (int i=0; i<100; i++) { for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op, - root, comms[i], s[i]); + NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op, + root, comms[i], s[i])); } //} @@ -129,8 +129,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, //for (int i=0; i<100; i++) { for (int i = 0; i < nDev; ++i) { CUDACHECK(cudaSetDevice(dList[i])); - ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op, - root, comms[i], s[i]); + NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op, + root, comms[i], s[i])); } //} @@ -171,7 +171,7 @@ template void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, const std::vector& dList) { int nDev = 0; - ncclCommCount(comms[0], &nDev); + NCCLCHECK(ncclCommCount(comms[0], &nDev)); T** sendbuff = (T**)malloc(nDev * sizeof(T*)); T** recvbuff = (T**)malloc(nDev * sizeof(T*)); @@ -253,7 +253,7 @@ int main(int argc, char* argv[]) { } ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - ncclCommInitAll(comms, nDev, dList.data()); + NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); if (!csv) { printf("# Using devices\n"); @@ -261,8 +261,8 @@ int main(int argc, char* argv[]) { int cudaDev; int rank; cudaDeviceProp prop; - ncclCommCuDevice(comms[g], &cudaDev); - ncclCommUserRank(comms[g], &rank); + NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); + NCCLCHECK(ncclCommUserRank(comms[g], &rank)); CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, prop.pciBusID, prop.name);