Check return code for Flush operation
The current NCCL code does not abort when a Flush operation fails in the underlying network layer; the return code of ncclNetFlush is ignored. This may compromise data integrity. Signed-off-by: Rashika Kheria <rashika@amazon.com>
This commit is contained in:
parent
c38f174bd4
commit
6c61492eba
@@ -378,7 +378,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
|||||||
if (done) {
|
if (done) {
|
||||||
args->head += args->sliceSteps;
|
args->head += args->sliceSteps;
|
||||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||||
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
|
if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
|
||||||
resources->hostRecvMem->tail = args->head;
|
resources->hostRecvMem->tail = args->head;
|
||||||
}
|
}
|
||||||
args->idle = 0;
|
args->idle = 0;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user