Moved tests to separate dir and improved MPI test

test sources moved to test/ directory. MPI test displays PASS/FAIL and returns code accordingly. Change-Id: I058ebd1bd5202d8f38cc9787898b2480100c102b Reviewed-on: http://git-master/r/936086 Reviewed-by: Przemek Tredak <ptredak@nvidia.com> Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-20 14:18:25 -05:00 · 2016-01-20 14:18:25 -05:00 · c05312f151
commit c05312f151
parent 5966316771
8 changed files with 63 additions and 24 deletions
--- a/19
+++ b/19
@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@ -49,6 +49,7 @@ endif

 LDFLAGS    := -L$(CUDA_HOME)/lib64 -lcudart
 MPIFLAGS   := -I$(MPI_HOME)/include -L$(MPI_HOME)/lib -lmpi
+TSTINC     := -Ibuild/include -Itest/include

 .PHONY : lib clean test mpitest install
 .DEFAULT : lib
@ -65,8 +66,8 @@ MPITESTS    := mpi_test
 INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
-TSTDIR := $(BUILDDIR)/test
-MPITSTDIR := $(BUILDDIR)/mpitest
+TSTDIR := $(BUILDDIR)/test/single
+MPITSTDIR := $(BUILDDIR)/test/mpi

 INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
 LIBSONAME  := $(patsubst %,%.$(VER_MAJOR),$(LIBNAME))
@ -108,11 +109,11 @@ clean :

 test : lib $(TESTBINS)

-$(TSTDIR)/% : src/%.cu lib
+$(TSTDIR)/% : test/single/%.cu lib
 	@printf "Building  %-25s > %-24s\n" $< $@
 	@mkdir -p $(TSTDIR)
-	@$(NVCC) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt
-	@$(NVCC) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
+	@$(NVCC) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt
+	@$(NVCC) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@ -120,11 +121,11 @@ $(TSTDIR)/% : src/%.cu lib

 mpitest : lib $(MPITESTBINS)

-$(MPITSTDIR)/% : src/%.cu lib
+$(MPITSTDIR)/% : test/mpi/%.cu lib
 	@printf "Building  %-25s > %-24s\n" $< $@
 	@mkdir -p $(MPITSTDIR)
-	@$(NVCC) $(MPIFLAGS) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS)
-	@$(NVCC) $(MPIFLAGS) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp)
+	@$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS)
+	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
--- a/test/include/test_utilities.h
+++ b/test/include/test_utilities.h
--- a/test/mpi/mpi_test.cu
+++ b/test/mpi/mpi_test.cu
@ -28,19 +28,23 @@

 #include <sys/types.h>
 #include <unistd.h>
+#include <stdio.h>

 #include "nccl.h"
 #include "mpi.h"

 #define CUDACHECK(cmd) do {                              \
-    cudaError_t e = cmd;                                 \
-    if( e != cudaSuccess ) {                             \
-        printf("Cuda failure %s:%d '%s'\n",              \
-               __FILE__,__LINE__,cudaGetErrorString(e)); \
-        exit(EXIT_FAILURE);                              \
-    }                                                    \
+  cudaError_t e = cmd;                                 \
+  if( e != cudaSuccess ) {                             \
+    printf("Cuda failure %s:%d '%s'\n",              \
+        __FILE__,__LINE__,cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                              \
+  }                                                    \
 } while(false)

+#define SIZE 128
+#define NITERS 1
+
 int main(int argc, char *argv[]) {
  ncclUniqueId commId;
  int size, rank;
@ -50,14 +54,18 @@ int main(int argc, char *argv[]) {
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

+  if (argc < size) {
+    printf("Usage : %s <GPU list per rank>\n", argv[0]);
+  }
+
  int gpu = atoi(argv[rank+1]);
-  printf("MPI Rank %d running on GPU %d\n", rank, gpu);
+
  // We have to set our device before NCCL init
  CUDACHECK(cudaSetDevice(gpu));
  MPI_Barrier(MPI_COMM_WORLD);

+  // NCCL Communicator creation
  ncclComm_t comm;
-  // Let's use rank 0 PID as job ID
  ncclGetUniqueId(&commId);
  MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD);
  ret = ncclCommInitRank(&comm, size, commId, rank);
@ -66,18 +74,48 @@ int main(int argc, char *argv[]) {
    exit(1);
  }

+  // CUDA stream creation
+  cudaStream_t stream;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+  // Initialize input values
  int *dptr;
-  CUDACHECK(cudaMalloc(&dptr, 1024*2*sizeof(int)));
-  int val = rank;
-  CUDACHECK(cudaMemcpy(dptr, &val, sizeof(int), cudaMemcpyHostToDevice));
+  CUDACHECK(cudaMalloc(&dptr, SIZE*2*sizeof(int)));
+  int *val = (int*) malloc(SIZE*sizeof(int));
+  for (int v=0; v<SIZE; v++) {
+    val[v] = rank + 1;
+  }
+  CUDACHECK(cudaMemcpy(dptr, val, SIZE*sizeof(int), cudaMemcpyHostToDevice));

-  ncclAllReduce((const void*)dptr, (void*)(dptr+1024), 1024, ncclInt, ncclSum, comm, cudaStreamDefault);
+  // Compute final value
+  int ref = size*(size+1)/2;

-  CUDACHECK(cudaMemcpy(&val, (dptr+1024), sizeof(int), cudaMemcpyDeviceToHost));
-  printf("Sum is %d\n", val);
+  // Run allreduce
+  int errors = 0;
+  for (int i=0; i<NITERS; i++) {
+    ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream);
+  }
+
+  // Check results
+  cudaStreamSynchronize(stream);
+  CUDACHECK(cudaMemcpy(val, (dptr+SIZE), SIZE*sizeof(int), cudaMemcpyDeviceToHost));
+  for (int v=0; v<SIZE; v++) {
+    if (val[v] != ref) {
+      errors++;
+      printf("[%d] Error at %d : got %d instead of %d\n", rank, v, val[v], ref);
+    }
+  }
  CUDACHECK(cudaFree(dptr));

+  MPI_Allreduce(MPI_IN_PLACE, &errors, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD);
+  if (rank == 0) {
+    if (errors)
+      printf("%d errors. Test FAILED.\n", errors);
+    else
+      printf("Test PASSED.\n");
+  }
+
  MPI_Finalize();
  ncclCommDestroy(comm);
-  return 0;
+  return errors ? 1 : 0;
 }
--- a/test/single/all_gather_test.cu
+++ b/test/single/all_gather_test.cu
--- a/test/single/all_reduce_test.cu
+++ b/test/single/all_reduce_test.cu
--- a/test/single/broadcast_test.cu
+++ b/test/single/broadcast_test.cu
--- a/test/single/reduce_scatter_test.cu
+++ b/test/single/reduce_scatter_test.cu
--- a/test/single/reduce_test.cu
+++ b/test/single/reduce_test.cu