Check return code for Flush operation

NCCL currently does not abort when a Flush operation fails in the
underlying network. This may compromise data integrity.

Signed-off-by: Rashika Kheria <rashika@amazon.com>
This commit is contained in:
Rashika Kheria 2020-03-16 18:33:48 -07:00 committed by Sylvain Jeaugey
parent c38f174bd4
commit 6c61492eba

View File

@@ -378,7 +378,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (done) {
args->head += args->sliceSteps;
if (args->protocol == NCCL_PROTO_SIMPLE) {
-if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
+if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
resources->hostRecvMem->tail = args->head;
}
args->idle = 0;