Check return code for Flush operation
The current NCCL code does not abort when a Flush operation fails in the underlying network layer. This may compromise data integrity. Signed-off-by: Rashika Kheria <rashika@amazon.com>
This commit is contained in:
parent
c38f174bd4
commit
6c61492eba
@@ -378,7 +378,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
if (done) {
|
||||
args->head += args->sliceSteps;
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
|
||||
if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
|
||||
resources->hostRecvMem->tail = args->head;
|
||||
}
|
||||
args->idle = 0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user