Improve warning message about truncated messages
Display hints about the likely cause so that it is easier for users to debug. Also change the error type from InternalError to InvalidUsage, as most of the time this is caused by a mismatch in collective sizes or environment settings between ranks.
This commit is contained in:
parent
8cf7325d69
commit
f589932130
@ -421,8 +421,10 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
// Check size is less or equal to the size provided by the user
|
// Check size is less or equal to the size provided by the user
|
||||||
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size);
|
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
|
||||||
return ncclInternalError;
|
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
|
||||||
|
socketToString(r->addr, line), data, r->size);
|
||||||
|
return ncclInvalidUsage;
|
||||||
}
|
}
|
||||||
r->size = data;
|
r->size = data;
|
||||||
r->offset = 0;
|
r->offset = 0;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user