Improve warning message about truncated messages
Display hints about the likely cause so that it is easier for users to debug. Also change the error type from InternalError to InvalidUsage, since most of the time this is caused by a mismatch in collective sizes or environment settings between ranks.
This commit is contained in:
parent
8cf7325d69
commit
f589932130
@ -421,8 +421,10 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
||||
// Check size is less or equal to the size provided by the user
|
||||
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size);
|
||||
return ncclInternalError;
|
||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
|
||||
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
|
||||
socketToString(r->addr, line), data, r->size);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
r->size = data;
|
||||
r->offset = 0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user