diff --git a/Makefile b/Makefile
index 5f93ecb..9afde64 100644
--- a/Makefile
+++ b/Makefile
@@ -78,14 +78,14 @@ TESTBINS   := $(patsubst %, $(TSTDIR)/%, $(TESTS))
 MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
 DEPFILES   := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
 
-lib : $(INCTARGETS) $(LIBTARGET)
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
 
 -include $(DEPFILES)
 
-$(LIBTARGET) : $(LIBOBJ)
+$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
 	@printf "Linking   %-25s\n" $@
 	@mkdir -p $(LIBDIR)
-	@$(GPP) $(CPPFLAGS) $(CXXFLAGS) -shared -Wl,-soname,$(LIBSONAME) -o $(LIBDIR)/$@ $(LDFLAGS) $(LIBOBJ)
+	@$(GPP) $(CPPFLAGS) $(CXXFLAGS) -shared -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
 	@ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	@ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
@@ -109,7 +109,7 @@ clean :
 
 test : lib $(TESTBINS)
 
-$(TSTDIR)/% : test/single/%.cu lib
+$(TSTDIR)/% : test/single/%.cu $(LIBDIR)/$(LIBTARGET)
 	@printf "Building  %-25s > %-24s\n" $< $@
 	@mkdir -p $(TSTDIR)
 	@$(NVCC) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt
@@ -121,11 +121,11 @@ $(TSTDIR)/% : test/single/%.cu lib
 
 mpitest : lib $(MPITESTBINS)
 
-$(MPITSTDIR)/% : test/mpi/%.cu lib
+$(MPITSTDIR)/% : test/mpi/%.cu $(LIBDIR)/$(LIBTARGET)
 	@printf "Building  %-25s > %-24s\n" $< $@
 	@mkdir -p $(MPITSTDIR)
-	@$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS)
-	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp)
+	@$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcurand
+	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcurand > $(@:%=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
diff --git a/test/include/test_utilities.h b/test/include/test_utilities.h
index c929a9e..fb34d19 100644
--- a/test/include/test_utilities.h
+++ b/test/include/test_utilities.h
@@ -32,14 +32,23 @@
 
 #include <curand.h>
 
-#define CUDACHECK(cmd) do {                              \
-    cudaError_t e = cmd;                                 \
-    if( e != cudaSuccess ) {                             \
-        printf("Cuda failure %s:%d '%s'\n",              \
-               __FILE__,__LINE__,cudaGetErrorString(e)); \
-        exit(EXIT_FAILURE);                              \
-    }                                                    \
-} while(false)
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t e = cmd;                              \
+  if( e != cudaSuccess ) {                          \
+    printf("Cuda failure %s:%d '%s'\n",             \
+        __FILE__,__LINE__,cudaGetErrorString(e));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t r = cmd;                             \
+  if (r!= ncclSuccess) {                            \
+    printf("NCCL failure %s:%d '%s'\n",             \
+        __FILE__,__LINE__,ncclGetErrorString(r));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
 
 template<typename T>
 void Randomize(T* const dest, const int N, const int randomSeed);
diff --git a/test/mpi/mpi_test.cu b/test/mpi/mpi_test.cu
index 87465e5..54ebbce 100644
--- a/test/mpi/mpi_test.cu
+++ b/test/mpi/mpi_test.cu
@@ -32,15 +32,7 @@
 
 #include "nccl.h"
 #include "mpi.h"
-
-#define CUDACHECK(cmd) do {                              \
-  cudaError_t e = cmd;                                 \
-  if( e != cudaSuccess ) {                             \
-    printf("Cuda failure %s:%d '%s'\n",              \
-        __FILE__,__LINE__,cudaGetErrorString(e)); \
-    exit(EXIT_FAILURE);                              \
-  }                                                    \
-} while(false)
+#include "test_utilities.h"
 
 #define SIZE 128
 #define NITERS 1
@@ -48,7 +40,7 @@
 int main(int argc, char *argv[]) {
   ncclUniqueId commId;
   int size, rank;
-  int ret;
+  ncclResult_t ret;
 
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -66,11 +58,11 @@ int main(int argc, char *argv[]) {
 
   // NCCL Communicator creation
   ncclComm_t comm;
-  ncclGetUniqueId(&commId);
+  NCCLCHECK(ncclGetUniqueId(&commId));
   MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD);
   ret = ncclCommInitRank(&comm, size, commId, rank);
   if (ret != ncclSuccess) {
-    printf("NCCL Init failed : %d\n", ret);
+    printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret));
     exit(1);
   }
 
@@ -93,7 +85,7 @@ int main(int argc, char *argv[]) {
   // Run allreduce
   int errors = 0;
   for (int i=0; i<NITERS; i++) {
-    ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream);
+    NCCLCHECK(ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream));
   }
 
   // Check results
diff --git a/test/single/all_gather_test.cu b/test/single/all_gather_test.cu
index a9e1c1e..0925b7e 100644
--- a/test/single/all_gather_test.cu
+++ b/test/single/all_gather_test.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(nDev * N * sizeof(T));
   T* result = (T*)malloc(nDev * N * sizeof(T));
@@ -61,8 +61,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
-        (void*)recvbuff[i], comms[i], s[i]);
+    NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
+        (void*)recvbuff[i], comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -79,8 +79,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
 
     for (int i = 0; i < nDev; ++i) {
       CUDACHECK(cudaSetDevice(dList[i]));
-      ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
-          s[i]);
+      NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
+          s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -121,7 +121,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -199,15 +199,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g=0; g<nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
diff --git a/test/single/all_reduce_test.cu b/test/single/all_reduce_test.cu
index a2fcb3d..2e14335 100644
--- a/test/single/all_reduce_test.cu
+++ b/test/single/all_reduce_test.cu
@@ -48,7 +48,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   memset(result, 0, N * sizeof(T));
 
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   for (int i = 0; i < nDev; ++i) {
@@ -66,7 +66,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]);
+    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -87,8 +87,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
       for (int i = 0; i < nDev; ++i) {
         CUDACHECK(cudaSetDevice(dList[i]));
-        ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-            comms[i], s[i]);
+        NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
+            comms[i], s[i]));
       }
     //}
 
@@ -130,8 +130,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
       for (int i = 0; i < nDev; ++i) {
         CUDACHECK(cudaSetDevice(dList[i]));
-        ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-            comms[i], s[i]);
+        NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
+            comms[i], s[i]));
       }
     //}
 
@@ -176,7 +176,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -256,7 +256,7 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   if (!csv) {
     printf("# Using devices\n");
@@ -264,8 +264,8 @@ int main(int argc, char* argv[]) {
       int cudaDev;
       int rank;
       cudaDeviceProp prop;
-      ncclCommCuDevice(comms[g], &cudaDev);
-      ncclCommUserRank(comms[g], &rank);
+      NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+      NCCLCHECK(ncclCommUserRank(comms[g], &rank));
       CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
       printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
           prop.pciBusID, prop.name);
diff --git a/test/single/broadcast_test.cu b/test/single/broadcast_test.cu
index 9c85a1f..9801d04 100644
--- a/test/single/broadcast_test.cu
+++ b/test/single/broadcast_test.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
     ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(N * sizeof(T));
   T* result = (T*)malloc(N * sizeof(T));
@@ -65,7 +65,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]);
+    NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -83,7 +83,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
 
     for (int i = 0; i < nDev; ++i) {
       CUDACHECK(cudaSetDevice(dList[i]));
-      ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]);
+      NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -123,7 +123,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** buff = (T**)malloc(nDev * sizeof(T*));
 
   for (int i = 0; i < nDev; ++i) {
@@ -199,15 +199,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);;
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g = 0; g < nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
diff --git a/test/single/reduce_scatter_test.cu b/test/single/reduce_scatter_test.cu
index da205d5..4fc3292 100644
--- a/test/single/reduce_scatter_test.cu
+++ b/test/single/reduce_scatter_test.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     const ncclRedOp_t op, ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   T* buffer = (T*)malloc(N * nDev * sizeof(T));
@@ -66,8 +66,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
-        std::min(N, 1024 * 1024), type, op, comms[i], s[i]);
+    NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
+        std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -86,8 +86,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
 
     for (int i = 0; i < nDev; ++i) {
       CUDACHECK(cudaSetDevice(dList[i]));
-      ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
-          op, comms[i], s[i]);
+      NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
+          op, comms[i], s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -122,8 +122,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
 
     for (int i = 0; i < nDev; ++i) {
       CUDACHECK(cudaSetDevice(dList[i]));
-      ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
-          op, comms[i], s[i]);
+      NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
+          op, comms[i], s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -163,7 +163,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -243,15 +243,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g = 0; g < nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
diff --git a/test/single/reduce_test.cu b/test/single/reduce_test.cu
index 42b1e9b..9500c18 100644
--- a/test/single/reduce_test.cu
+++ b/test/single/reduce_test.cu
@@ -50,7 +50,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   memset(result, 0, N * sizeof(T));
 
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   for (int i = 0; i < nDev; ++i) {
@@ -68,8 +68,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
-        type, op, root, comms[i], s[i]);
+    NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
+        type, op, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -90,8 +90,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
       for (int i = 0; i < nDev; ++i) {
         CUDACHECK(cudaSetDevice(dList[i]));
-        ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-            root, comms[i], s[i]);
+        NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
+            root, comms[i], s[i]));
       }
     //}
 
@@ -129,8 +129,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
       for (int i = 0; i < nDev; ++i) {
         CUDACHECK(cudaSetDevice(dList[i]));
-        ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-            root, comms[i], s[i]);
+        NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
+            root, comms[i], s[i]));
       }
     //}
 
@@ -171,7 +171,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
 
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   if (!csv) {
     printf("# Using devices\n");
@@ -261,8 +261,8 @@ int main(int argc, char* argv[]) {
       int cudaDev;
       int rank;
       cudaDeviceProp prop;
-      ncclCommCuDevice(comms[g], &cudaDev);
-      ncclCommUserRank(comms[g], &rank);
+      NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+      NCCLCHECK(ncclCommUserRank(comms[g], &rank));
       CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
       printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);