From 6c61492eba5c25ac6ed1bf57de23c6a689aa75cc Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Mon, 16 Mar 2020 18:33:48 -0700 Subject: [PATCH] Check return code for Flush operation The current NCCL code does not abort when a Flush operation performed by the underlying network fails. This may compromise data integrity. Signed-off-by: Rashika Kheria --- src/transport/net.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transport/net.cc b/src/transport/net.cc index 87fc9ce..928a6a9 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -378,7 +378,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { if (done) { args->head += args->sliceSteps; if (args->protocol == NCCL_PROTO_SIMPLE) { - if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle); + if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle)); resources->hostRecvMem->tail = args->head; } args->idle = 0;