Fixed a race condition in reduce and broadcast.

2015-11-19 11:11:52 -08:00 · 2015-11-19 11:11:52 -08:00 · 27d32ac5d9
commit 27d32ac5d9
parent 0673d5f44f
3 changed files with 17 additions and 25 deletions
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@ -59,7 +59,7 @@
 // subchunks, we interleave the independent subchunks so that more data can be
 // transferred while the sync is in progress. This is the number of subchunks
 // that are active at the same time
-#define NUM_SUBCHUNKS   1
+#define NUM_SUBCHUNKS   2

 // If this is called with STEP, it means that we just finished processing the
 // data for step STEP on this GPU, which is the data required on the next GPU
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@ -180,21 +180,17 @@ __global__ void BroadcastKernel(const BroadcastKernelArgs<T> args) {
  // First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
  // the previous GPU is done with a previous collective operation.
  if (tid == 0) {
-    if (ROLE != ROOT) {
-      Wait([=] {
-        return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
-      });
+    Wait([=] {
+      return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
+    });

-      *((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
-    }
-    if (ROLE != END) {
-      Wait([=] {
-        return *((T * volatile *)args.ThisPtrToNextData) != nullptr;  // Wait till I've been told next started
-      });
+    *((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
+    Wait([=] {
+      return *((T * volatile *)args.ThisPtrToNextData) != nullptr;  // Wait till I've been told next started
+    });

-      if (PUSHRECV)
-        nextData = *((volatile void * volatile *)args.ThisPtrToNextData); // Grab next's pointer if needed.
-    }
+    if (PUSHRECV)
+      nextData = *((volatile void * volatile *)args.ThisPtrToNextData); // Grab next's pointer if needed.
  }
  __syncthreads();

--- a/src/reduce.cu
+++ b/src/reduce.cu
@ -182,18 +182,14 @@ __global__ void ReduceKernel(const ReduceKernelArgs<T> args) {
  // First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
  // the previous GPU is done with a previous collective operation.
  if (tid == 0) {
-    if (ROLE != BEGIN) {
-      Wait([=] {
-        return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
-      });
+    Wait([=] {
+      return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
+    });

-      *((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
-    }
-    if (ROLE != END) {
-      Wait([=] {
-        return *((T * volatile *)args.ThisPtrToNextData) != nullptr;  // Wait till I've been told next started
-      });
-    }
+    *((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
+    Wait([=] {
+      return *((T * volatile *)args.ThisPtrToNextData) != nullptr;  // Wait till I've been told next started
+    });
  }
  __syncthreads();