Added NCCL error checking to tests.
Also cleaned up makefile so that tests and lib are not built unnecessarily.

Change-Id: Ia0c596cc2213628de2f066be97615c09bb1bb262
Reviewed-on: http://git-master/r/999627
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
parent fe1a956715
commit 2758353380

Makefile | 14 (+7, -7)
@@ -78,14 +78,14 @@ TESTBINS := $(patsubst %, $(TSTDIR)/%, $(TESTS))
 MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
 DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
 
-lib : $(INCTARGETS) $(LIBTARGET)
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
 
 -include $(DEPFILES)
 
-$(LIBTARGET) : $(LIBOBJ)
+$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
 	@printf "Linking %-25s\n" $@
 	@mkdir -p $(LIBDIR)
-	@$(GPP) $(CPPFLAGS) $(CXXFLAGS) -shared -Wl,-soname,$(LIBSONAME) -o $(LIBDIR)/$@ $(LDFLAGS) $(LIBOBJ)
+	@$(GPP) $(CPPFLAGS) $(CXXFLAGS) -shared -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
 	@ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	@ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
@@ -109,7 +109,7 @@ clean :
 
 test : lib $(TESTBINS)
 
-$(TSTDIR)/% : test/single/%.cu lib
+$(TSTDIR)/% : test/single/%.cu $(LIBDIR)/$(LIBTARGET)
 	@printf "Building %-25s > %-24s\n" $< $@
 	@mkdir -p $(TSTDIR)
 	@$(NVCC) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt
@@ -121,11 +121,11 @@ $(TSTDIR)/% : test/single/%.cu lib
 
 mpitest : lib $(MPITESTBINS)
 
-$(MPITSTDIR)/% : test/mpi/%.cu lib
+$(MPITSTDIR)/% : test/mpi/%.cu $(LIBDIR)/$(LIBTARGET)
 	@printf "Building %-25s > %-24s\n" $< $@
 	@mkdir -p $(MPITSTDIR)
-	@$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS)
-	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp)
+	@$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcurand
+	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcurand > $(@:%=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
 		sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@@ -39,7 +39,16 @@
         __FILE__,__LINE__,cudaGetErrorString(e)); \
     exit(EXIT_FAILURE); \
   } \
-} while(false)
+} while(0)
 
+#define NCCLCHECK(cmd) do { \
+    ncclResult_t r = cmd; \
+    if (r!= ncclSuccess) { \
+      printf("NCCL failure %s:%d '%s'\n", \
+          __FILE__,__LINE__,ncclGetErrorString(r)); \
+      exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
 template<typename T>
 void Randomize(T* const dest, const int N, const int randomSeed);
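For context, a minimal, self-contained sketch of how these checking macros are meant to be used. This is an illustrative example, not a file from this commit; the file name `check_example.cu` is hypothetical, and it assumes an NCCL 1.x-era API (as used by these tests: ncclCommInitAll, ncclAllReduce), an installed CUDA runtime, and at least one visible GPU.

```c
// check_example.cu (hypothetical): wrap every CUDA/NCCL call so a failure
// aborts immediately with the file, line, and the library's error string.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "nccl.h"

#define CUDACHECK(cmd) do {                             \
    cudaError_t e = cmd;                                \
    if (e != cudaSuccess) {                             \
      printf("Cuda failure %s:%d '%s'\n",               \
          __FILE__, __LINE__, cudaGetErrorString(e));   \
      exit(EXIT_FAILURE);                               \
    }                                                   \
  } while(0)

#define NCCLCHECK(cmd) do {                             \
    ncclResult_t r = cmd;                               \
    if (r != ncclSuccess) {                             \
      printf("NCCL failure %s:%d '%s'\n",               \
          __FILE__, __LINE__, ncclGetErrorString(r));   \
      exit(EXIT_FAILURE);                               \
    }                                                   \
  } while(0)

int main(void) {
  const int nDev = 1;              // single-process, single-GPU example
  const int count = 1024;
  int devList[1] = {0};

  ncclComm_t comm;
  NCCLCHECK(ncclCommInitAll(&comm, nDev, devList));

  float* buf;
  cudaStream_t stream;
  CUDACHECK(cudaSetDevice(devList[0]));
  CUDACHECK(cudaStreamCreate(&stream));
  CUDACHECK(cudaMalloc((void**)&buf, count * sizeof(float)));
  CUDACHECK(cudaMemset(buf, 0, count * sizeof(float)));

  // In-place allreduce on the one-member communicator.
  NCCLCHECK(ncclAllReduce((const void*)buf, (void*)buf, count,
      ncclFloat, ncclSum, comm, stream));
  CUDACHECK(cudaStreamSynchronize(stream));

  CUDACHECK(cudaFree(buf));
  CUDACHECK(cudaStreamDestroy(stream));
  ncclCommDestroy(comm);
  printf("OK\n");
  return 0;
}
```

The do { ... } while(0) wrapper makes each macro expand to a single statement, so it composes safely with if/else and always requires a trailing semicolon at the call site.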
@@ -32,15 +32,7 @@
 
 #include "nccl.h"
 #include "mpi.h"
-
-#define CUDACHECK(cmd) do { \
-    cudaError_t e = cmd; \
-    if( e != cudaSuccess ) { \
-        printf("Cuda failure %s:%d '%s'\n", \
-            __FILE__,__LINE__,cudaGetErrorString(e)); \
-        exit(EXIT_FAILURE); \
-    } \
-} while(false)
+#include "test_utilities.h"
 
 #define SIZE 128
 #define NITERS 1
@@ -48,7 +40,7 @@
 int main(int argc, char *argv[]) {
   ncclUniqueId commId;
   int size, rank;
-  int ret;
+  ncclResult_t ret;
 
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -66,11 +58,11 @@ int main(int argc, char *argv[]) {
 
   // NCCL Communicator creation
   ncclComm_t comm;
-  ncclGetUniqueId(&commId);
+  NCCLCHECK(ncclGetUniqueId(&commId));
   MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD);
   ret = ncclCommInitRank(&comm, size, commId, rank);
   if (ret != ncclSuccess) {
-    printf("NCCL Init failed : %d\n", ret);
+    printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret));
     exit(1);
   }
 
@@ -93,7 +85,7 @@ int main(int argc, char *argv[]) {
   // Run allreduce
   int errors = 0;
   for (int i=0; i<NITERS; i++) {
-    ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream);
+    NCCLCHECK(ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream));
   }
 
   // Check results
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(nDev * N * sizeof(T));
   T* result = (T*)malloc(nDev * N * sizeof(T));
@@ -61,8 +61,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
-        (void*)recvbuff[i], comms[i], s[i]);
+    NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
+        (void*)recvbuff[i], comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -79,8 +79,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
 
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
-        s[i]);
+    NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
+        s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -121,7 +121,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -199,15 +199,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g=0; g<nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
@@ -48,7 +48,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   memset(result, 0, N * sizeof(T));
 
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   for (int i = 0; i < nDev; ++i) {
@@ -66,7 +66,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]);
+    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -87,8 +87,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   //for (int i=0; i<100; i++) {
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-        comms[i], s[i]);
+    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
+        comms[i], s[i]));
   }
   //}
 
@@ -130,8 +130,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   //for (int i=0; i<100; i++) {
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-        comms[i], s[i]);
+    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
+        comms[i], s[i]));
   }
   //}
 
@@ -176,7 +176,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -256,7 +256,7 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   if (!csv) {
     printf("# Using devices\n");
@@ -264,8 +264,8 @@ int main(int argc, char* argv[]) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
     ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(N * sizeof(T));
   T* result = (T*)malloc(N * sizeof(T));
@@ -65,7 +65,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]);
+    NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -83,7 +83,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
 
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]);
+    NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -123,7 +123,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** buff = (T**)malloc(nDev * sizeof(T*));
 
   for (int i = 0; i < nDev; ++i) {
@@ -199,15 +199,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);;
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g = 0; g < nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     const ncclRedOp_t op, ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   T* buffer = (T*)malloc(N * nDev * sizeof(T));
@@ -66,8 +66,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
-        std::min(N, 1024 * 1024), type, op, comms[i], s[i]);
+    NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
+        std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -86,8 +86,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
 
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
-        op, comms[i], s[i]);
+    NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
+        op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -122,8 +122,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
 
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
-        op, comms[i], s[i]);
+    NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
+        op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -163,7 +163,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -243,15 +243,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g = 0; g < nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
@@ -50,7 +50,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   memset(result, 0, N * sizeof(T));
 
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   for (int i = 0; i < nDev; ++i) {
@@ -68,8 +68,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
-        type, op, root, comms[i], s[i]);
+    NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
+        type, op, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -90,8 +90,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   //for (int i=0; i<100; i++) {
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-        root, comms[i], s[i]);
+    NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
+        root, comms[i], s[i]));
   }
   //}
 
@@ -129,8 +129,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   //for (int i=0; i<100; i++) {
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-        root, comms[i], s[i]);
+    NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
+        root, comms[i], s[i]));
   }
   //}
 
@@ -171,7 +171,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   if (!csv) {
     printf("# Using devices\n");
@@ -261,8 +261,8 @@ int main(int argc, char* argv[]) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);