Improve warning message about truncated messages

Display hints about the likely cause so that it is easier for the user to debug.
Also change the error type from ncclInternalError to ncclInvalidUsage, as most
of the time this is caused by a mismatch in collective size or environment settings.
This commit is contained in:
Ke Wen 2021-12-02 12:33:06 -08:00
parent 8cf7325d69
commit f589932130

View File

@ -421,8 +421,10 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
// Check size is less or equal to the size provided by the user
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size);
return ncclInternalError;
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
socketToString(r->addr, line), data, r->size);
return ncclInvalidUsage;
}
r->size = data;
r->offset = 0;