Fixed a race condition in reduce and braodcast.
This commit is contained in:
parent
0673d5f44f
commit
27d32ac5d9
@ -59,7 +59,7 @@
|
||||
// subchunks, we interleave the independent subchunks so that more data can be
|
||||
// transferred while the sync is in progress. This is the number of subchunks
|
||||
// that are active at the same time
|
||||
#define NUM_SUBCHUNKS 1
|
||||
#define NUM_SUBCHUNKS 2
|
||||
|
||||
// If this is called with STEP, it means that we just finished processing the
|
||||
// data for step STEP on this GPU, which is the data required on the next GPU
|
||||
|
@ -180,21 +180,17 @@ __global__ void BroadcastKernel(const BroadcastKernelArgs<T> args) {
|
||||
// First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
|
||||
// the previous GPU is done with a previous collective operation.
|
||||
if (tid == 0) {
|
||||
if (ROLE != ROOT) {
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
|
||||
});
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
|
||||
});
|
||||
|
||||
*((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
|
||||
}
|
||||
if (ROLE != END) {
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.ThisPtrToNextData) != nullptr; // Wait till I've been told next started
|
||||
});
|
||||
*((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.ThisPtrToNextData) != nullptr; // Wait till I've been told next started
|
||||
});
|
||||
|
||||
if (PUSHRECV)
|
||||
nextData = *((volatile void * volatile *)args.ThisPtrToNextData); // Grab next's pointer if needed.
|
||||
}
|
||||
if (PUSHRECV)
|
||||
nextData = *((volatile void * volatile *)args.ThisPtrToNextData); // Grab next's pointer if needed.
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
@ -182,18 +182,14 @@ __global__ void ReduceKernel(const ReduceKernelArgs<T> args) {
|
||||
// First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
|
||||
// the previous GPU is done with a previous collective operation.
|
||||
if (tid == 0) {
|
||||
if (ROLE != BEGIN) {
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
|
||||
});
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.PrevPtrToThisData) == nullptr; // Wait for previous processor to be done
|
||||
});
|
||||
|
||||
*((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
|
||||
}
|
||||
if (ROLE != END) {
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.ThisPtrToNextData) != nullptr; // Wait till I've been told next started
|
||||
});
|
||||
}
|
||||
*((T * volatile *)args.PrevPtrToThisData) = (T*)args.ThisData; // Tell Previous I'm starting
|
||||
Wait([=] {
|
||||
return *((T * volatile *)args.ThisPtrToNextData) != nullptr; // Wait till I've been told next started
|
||||
});
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user