Fixed deadlock in back-to-back reduce_scatters.

Change-Id: I92d32b15e516a39710b676aee692ae9b70638937
Reviewed-on: http://git-master/r/935458
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-20 17:58:25 -08:00 · 2016-01-20 17:58:25 -08:00 · 130ee246e2
commit 130ee246e2
parent 90af7c73ef
12 changed files with 70 additions and 44 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
 # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 /build
--- a/README.md
+++ b/README.md
@ -113,3 +113,9 @@ int main(int argc, char* argv[])
 }
 ```
 ## Copyright and License
 NCCL is provided under the [BSD licence](LICENSE.txt). All source code and
 accompanying documentation is copyright (c) 2015-2016, NVIDIA CORPORATION. All
 rights reserved.
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/all_reduce_test.cu
+++ b/src/all_reduce_test.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/core.cu
+++ b/src/core.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/libwrap.cu
+++ b/src/libwrap.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/mpi_test.cu
+++ b/src/mpi_test.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/nccl.h
+++ b/src/nccl.h
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/reduce.cu
+++ b/src/reduce.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
@ -164,6 +164,9 @@ struct ReduceScatterKernelArgs {
  int BufferSliceStride;
  int BufferMisalignedN;
  T ** ThisPtrToNextOutput;
  T ** PrevPtrToThisOutput;
  // local and remote input, output, and buffer
  const T * __restrict__ ThisInput;
  volatile T * __restrict__ ThisOutput;
@ -187,6 +190,20 @@ __global__ void ReduceScatterKernel(const ReduceScatterKernelArgs<T> args) {
  if (args.N == 0) return;
  int tid = threadIdx.x;
  // First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
  // the previous GPU is done with a previous collective operation.
  if (tid == 0) {
    Wait([=] {
      return *((T * volatile *)args.PrevPtrToThisOutput) == nullptr; // Wait for previous processor to be done
    });
    *((T * volatile *)args.PrevPtrToThisOutput) = (T*)args.ThisOutput; // Tell Previous I'm starting
    Wait([=] {
      return *((T * volatile *)args.ThisPtrToNextOutput) != nullptr;  // Wait till I've been told next started
    });
  }
  __syncthreads();
  for (int chunk = 0; chunk < args.NumChunks; ++chunk) {
    // calculate slice size.  for all chunks except (possibly) the last one,
    // this will just be args.SliceSize. For the last one, it may be smaller
@ -311,6 +328,7 @@ __global__ void ReduceScatterKernel(const ReduceScatterKernelArgs<T> args) {
    if (tid == 0) {
      args.ThisNewDataAvailableFlag[tid] = 0;
      args.ThisChunkDoneFlag[tid] = 0;
      *args.ThisPtrToNextOutput = nullptr;
    }
  }
 }
@ -410,7 +428,8 @@ ncclResult_t ncclReduceScatterWithTypeAndFunc(const void* sendbuff,
    args.NumChunks = (args.N + args.ChunkSize - 1) / args.ChunkSize;
  }
-//  printf("sliceSize = %i, chunkSize = %i, numChunks = %i, sliceStride = %i, misalignedN = %i\n", args.SliceSize, args.ChunkSize, args.NumChunks, args.BufferSliceStride, args.BufferMisalignedN);
+  args.ThisPtrToNextOutput = (T**)&(comm->local[nextId]->recvPtrs[0]);
  args.PrevPtrToThisOutput = (T**)&(comm->remote[prevId]->recvPtrs[0]);
  args.ThisInput = (const T*)sendbuff;
  args.ThisOutput = (volatile T*)recvbuff;
@ -426,7 +445,7 @@ ncclResult_t ncclReduceScatterWithTypeAndFunc(const void* sendbuff,
  args.PrevChunkDoneFlag = comm->remote[prevId]->flags + 1;
  ReduceScatterKernel<NUM_THREADS, UNROLL_COUNT, FUNC, T>
-      <<<1, NUM_THREADS + NUM_SUBCHUNKS * WARP_SIZE, 0, stream>>>(args);
+      <<<1, NUM_THREADS + 1, 0, stream>>>(args);
  return ncclSuccess;
 }
--- a/src/reduce_test.cu
+++ b/src/reduce_test.cu
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/src/test_utilities.h
+++ b/src/test_utilities.h
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
`@ -1 +1,2 @@`
		`# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.`
	`/build`	`/build`