Fixed deadlock in back-to-back reduce_scatters.

Change-Id: I92d32b15e516a39710b676aee692ae9b70638937
Reviewed-on: http://git-master/r/935458
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
This commit is contained in:
Nathan Luehr 2016-01-20 17:58:25 -08:00 committed by Przemek Tredak
parent 90af7c73ef
commit 130ee246e2
12 changed files with 70 additions and 44 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
/build /build

View File

@ -113,3 +113,9 @@ int main(int argc, char* argv[])
} }
``` ```
## Copyright and License
NCCL is provided under the [BSD license](LICENSE.txt). All source code and
accompanying documentation are copyright (c) 2015-2016, NVIDIA CORPORATION. All
rights reserved.

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
@ -164,6 +164,9 @@ struct ReduceScatterKernelArgs {
int BufferSliceStride; int BufferSliceStride;
int BufferMisalignedN; int BufferMisalignedN;
T ** ThisPtrToNextOutput;
T ** PrevPtrToThisOutput;
// local and remote input, output, and buffer // local and remote input, output, and buffer
const T * __restrict__ ThisInput; const T * __restrict__ ThisInput;
volatile T * __restrict__ ThisOutput; volatile T * __restrict__ ThisOutput;
@ -187,6 +190,20 @@ __global__ void ReduceScatterKernel(const ReduceScatterKernelArgs<T> args) {
if (args.N == 0) return; if (args.N == 0) return;
int tid = threadIdx.x; int tid = threadIdx.x;
// First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
// the previous GPU is done with a previous collective operation.
if (tid == 0) {
Wait([=] {
return *((T * volatile *)args.PrevPtrToThisOutput) == nullptr; // Wait for previous processor to be done
});
*((T * volatile *)args.PrevPtrToThisOutput) = (T*)args.ThisOutput; // Tell Previous I'm starting
Wait([=] {
return *((T * volatile *)args.ThisPtrToNextOutput) != nullptr; // Wait till I've been told next started
});
}
__syncthreads();
for (int chunk = 0; chunk < args.NumChunks; ++chunk) { for (int chunk = 0; chunk < args.NumChunks; ++chunk) {
// calculate slice size. for all chunks except (possibly) the last one, // calculate slice size. for all chunks except (possibly) the last one,
// this will just be args.SliceSize. For the last one, it may be smaller // this will just be args.SliceSize. For the last one, it may be smaller
@ -311,6 +328,7 @@ __global__ void ReduceScatterKernel(const ReduceScatterKernelArgs<T> args) {
if (tid == 0) { if (tid == 0) {
args.ThisNewDataAvailableFlag[tid] = 0; args.ThisNewDataAvailableFlag[tid] = 0;
args.ThisChunkDoneFlag[tid] = 0; args.ThisChunkDoneFlag[tid] = 0;
*args.ThisPtrToNextOutput = nullptr;
} }
} }
} }
@ -410,7 +428,8 @@ ncclResult_t ncclReduceScatterWithTypeAndFunc(const void* sendbuff,
args.NumChunks = (args.N + args.ChunkSize - 1) / args.ChunkSize; args.NumChunks = (args.N + args.ChunkSize - 1) / args.ChunkSize;
} }
// printf("sliceSize = %i, chunkSize = %i, numChunks = %i, sliceStride = %i, misalignedN = %i\n", args.SliceSize, args.ChunkSize, args.NumChunks, args.BufferSliceStride, args.BufferMisalignedN); args.ThisPtrToNextOutput = (T**)&(comm->local[nextId]->recvPtrs[0]);
args.PrevPtrToThisOutput = (T**)&(comm->remote[prevId]->recvPtrs[0]);
args.ThisInput = (const T*)sendbuff; args.ThisInput = (const T*)sendbuff;
args.ThisOutput = (volatile T*)recvbuff; args.ThisOutput = (volatile T*)recvbuff;
@ -426,7 +445,7 @@ ncclResult_t ncclReduceScatterWithTypeAndFunc(const void* sendbuff,
args.PrevChunkDoneFlag = comm->remote[prevId]->flags + 1; args.PrevChunkDoneFlag = comm->remote[prevId]->flags + 1;
ReduceScatterKernel<NUM_THREADS, UNROLL_COUNT, FUNC, T> ReduceScatterKernel<NUM_THREADS, UNROLL_COUNT, FUNC, T>
<<<1, NUM_THREADS + NUM_SUBCHUNKS * WARP_SIZE, 0, stream>>>(args); <<<1, NUM_THREADS + 1, 0, stream>>>(args);
return ncclSuccess; return ncclSuccess;
} }

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions