Fix wrong variable name "slice" to "chunk"
https://github.com/NVIDIA/nccl/issues/287
commit a783484ab5
parent b5b6c6acdd
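The rename is purely cosmetic: the index taken from `ring->devUserRanks[...]` selects which chunk of the user buffer a rank works on at each step of the ring all-reduce, so `chunk` describes it accurately (in NCCL's primitives a chunk is itself subdivided into slices, which is presumably why the old name was misleading). Below is a minimal host-side sketch of the schedule this index drives; it is not NCCL code, and the 4-rank `devUserRanks` ordering is illustrative only.

// Hypothetical host-side sketch (not NCCL code) of the ring all-reduce
// chunk schedule that the renamed index drives. Assumes a 4-rank ring;
// devUserRanks[i] is taken to be the rank i hops ahead of this one.
#include <cstdio>

int main() {
  const int nranks = 4;
  const long realChunkSize = 256, chunkOffset = 0, size = nranks * realChunkSize;
  int devUserRanks[4] = {0, 1, 2, 3};  // illustrative ring order for rank 0

  // step 0: push the chunk owned by the rank furthest behind us
  int chunk = devUserRanks[nranks - 1];
  long offset = chunkOffset + chunk * realChunkSize;
  printf("step 0: send chunk %d, offset %ld\n", chunk, offset);

  // k-2 steps: each step reduces into the chunk one position earlier in the ring
  for (int j = 2; j < nranks; ++j) {
    chunk = devUserRanks[nranks - j];
    offset = chunkOffset + chunk * realChunkSize;
    printf("step %d: recvReduceSend chunk %d, offset %ld\n", j - 1, chunk, offset);
  }
  return 0;
}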
@@ -37,19 +37,19 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
       /////////////// begin AllReduce steps ///////////////
       ssize_t offset;
       int nelem;
-      int slice;
+      int chunk;

       // step 0: push data to next GPU
-      slice = ring->devUserRanks[nranks-1];
-      offset = chunkOffset + slice * realChunkSize;
+      chunk = ring->devUserRanks[nranks-1];
+      offset = chunkOffset + chunk * realChunkSize;
       nelem = min(realChunkSize, size-offset);

       prims.send(thisInput+offset, nelem);

       // k-2 steps: reduce and copy to next GPU
       for (int j=2; j<nranks; ++j) {
-        slice = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + slice * realChunkSize;
+        chunk = ring->devUserRanks[nranks-j];
+        offset = chunkOffset + chunk * realChunkSize;
         nelem = min(realChunkSize, size-offset);

         prims.recvReduceSend(thisInput+offset, nelem);
@@ -57,24 +57,24 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {

       // step k-1: reduce this buffer and data, which will produce the final
       // result that we store in this data and push to the next GPU
-      slice = ring->devUserRanks[0];
-      offset = chunkOffset + slice * realChunkSize;
+      chunk = ring->devUserRanks[0];
+      offset = chunkOffset + chunk * realChunkSize;
       nelem = min(realChunkSize, size-offset);

       prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);

       // k-2 steps: copy to next GPU
       for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + slice * realChunkSize;
+        chunk = ring->devUserRanks[nranks-j];
+        offset = chunkOffset + chunk * realChunkSize;
         nelem = min(realChunkSize, size-offset);

         prims.directRecvCopySend(thisOutput+offset, offset, nelem);
       }

       // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = chunkOffset + slice * realChunkSize;
+      chunk = ring->devUserRanks[1];
+      offset = chunkOffset + chunk * realChunkSize;
       nelem = min(realChunkSize, size-offset);

       // Final wait/copy.
@@ -225,19 +225,19 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
       /////////////// begin AllReduce steps ///////////////
       ssize_t offset;
       int nelem;
-      int slice;
+      int chunk;

       // step 0: push data to next GPU
-      slice = ring->devUserRanks[nranks-1];
-      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      chunk = ring->devUserRanks[nranks-1];
+      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.send(thisInput+offset, nelem);

       // k-2 steps: reduce and copy to next GPU
       for (int j=2; j<nranks; ++j) {
-        slice = ring->devUserRanks[nranks-j];
-        offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
         nelem = min(chunkSize, size-offset);

         LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -245,24 +245,24 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {

       // step k-1: reduce this buffer and data, which will produce the final
       // result that we store in this data and push to the next GPU
-      slice = ring->devUserRanks[0];
-      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      chunk = ring->devUserRanks[0];
+      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);

       // k-2 steps: copy to next GPU
       for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks-j];
-        offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
         nelem = min(chunkSize, size-offset);

         LLprims.recvCopySend(thisOutput+offset, nelem);
       }

       // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      chunk = ring->devUserRanks[1];
+      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       // Here we need to copy from buffer to this output.
@@ -413,19 +413,19 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
       /////////////// begin AllReduce steps ///////////////
       ssize_t offset;
       int nelem;
-      int slice;
+      int chunk;

       // step 0: push data to next GPU
-      slice = ring->devUserRanks[nranks-1];
-      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      chunk = ring->devUserRanks[nranks-1];
+      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.send(thisInput+offset, nelem);

       // k-2 steps: reduce and copy to next GPU
       for (int j=2; j<nranks; ++j) {
-        slice = ring->devUserRanks[nranks-j];
-        offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
         nelem = min(chunkSize, size-offset);

         LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -433,24 +433,24 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {

       // step k-1: reduce this buffer and data, which will produce the final
       // result that we store in this data and push to the next GPU
-      slice = ring->devUserRanks[0];
-      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      chunk = ring->devUserRanks[0];
+      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);

       // k-2 steps: copy to next GPU
       for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks-j];
-        offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
         nelem = min(chunkSize, size-offset);

         LLprims.recvCopySend(thisOutput+offset, nelem);
       }

       // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      chunk = ring->devUserRanks[1];
+      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       // Here we need to copy from buffer to this output.
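The LL and LL128 hunks apply the same rename; only the offset arithmetic differs, interleaving each rank's chunk across channels. A hedged sketch of that indexing follows (the `nChannels` and `bid` values are illustrative, not taken from this commit):

// Sketch of the LL/LL128 offset formula from the diff:
//   offset = gridOffset + (chunk*nChannels + bid) * chunkSize
// With this layout, consecutive chunks of one rank land nChannels*chunkSize
// apart, so each channel (block id `bid`) owns an interleaved stripe.
#include <cstdio>

int main() {
  const int nranks = 4, nChannels = 2;   // illustrative values
  const long chunkSize = 128, gridOffset = 0;
  for (int bid = 0; bid < nChannels; ++bid)
    for (int chunk = 0; chunk < nranks; ++chunk)
      printf("channel %d, chunk %d -> offset %ld\n",
             bid, chunk, gridOffset + (long)(chunk * nChannels + bid) * chunkSize);
  return 0;
}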