From f589932130994928e48eb29ce08d90c6ac11b8f0 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Thu, 2 Dec 2021 12:33:06 -0800 Subject: [PATCH] Improve warning message about truncated messages Display hints about the likely cause so that it is easier for users to debug. Also change the error type from InternalError to InvalidUsage, as most of the time this is caused by a mismatch in collective sizes or environment settings between ranks. --- src/transport/net_socket.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index c045a8f..b9b1848 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -421,8 +421,10 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size); - return ncclInternalError; + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ + there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", + socketToString(r->addr, line), data, r->size); + return ncclInvalidUsage; } r->size = data; r->offset = 0;