Rework SYSCHECK macros to better handle retries.
SYSCHECKVAL did not retry when a retry was needed. Because not every SYSCHECKVAL call site is wrapped in a loop, a transient EINTR/EAGAIN/EWOULDBLOCK result could be silently missed. This commit also reworks the socket connection code and improves its error reporting.
This commit is contained in:
parent
61b50a63ef
commit
302d538b73
@ -267,46 +267,26 @@ struct ncclComm {
|
||||
#include <errno.h>
|
||||
// Check system calls
|
||||
#define SYSCHECK(call, name) do { \
|
||||
int ret = -1; \
|
||||
while (ret == -1) { \
|
||||
SYSCHECKVAL(call, name, ret); \
|
||||
if (ret == -1) { \
|
||||
INFO(NCCL_ALL,"Got %s, retrying", strerror(errno)); \
|
||||
}\
|
||||
} \
|
||||
} while (0);
|
||||
int retval; \
|
||||
SYSCHECKVAL(call, name, retval); \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKVAL(call, name, retval) do { \
|
||||
retval = call; \
|
||||
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { \
|
||||
SYSCHECKSYNC(call, name, retval); \
|
||||
if (retval == -1) { \
|
||||
WARN("Call to " name " failed : %s", strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKNTIMES(call, name, times, usec, exptype) do { \
|
||||
int ret = -1; \
|
||||
int count = 0; \
|
||||
while (ret == -1 && count < times) { \
|
||||
SYSCHECKVALEXP(call, name, ret, exptype); \
|
||||
count++; \
|
||||
if (ret == -1) { \
|
||||
usleep(usec); \
|
||||
}\
|
||||
} \
|
||||
if (ret == -1) { \
|
||||
WARN("Call to " name " timeout : %s", strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define SYSCHECKVALEXP(call, name, retval, exptype) do { \
|
||||
#define SYSCHECKSYNC(call, name, retval) do { \
|
||||
retval = call; \
|
||||
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && errno != exptype) { \
|
||||
WARN("Call to " name " failed : %s", strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
|
||||
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
|
||||
} else { \
|
||||
break; \
|
||||
} \
|
||||
} while (0);
|
||||
} while(true)
|
||||
|
||||
// Propagate errors up
|
||||
#define NCCLCHECK(call) do { \
|
||||
|
@ -366,8 +366,19 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
||||
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
|
||||
#endif
|
||||
|
||||
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
|
||||
return ncclSuccess;
|
||||
int ret;
|
||||
int retries = 0;
|
||||
retry:
|
||||
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
|
||||
if (ret == 0) return ncclSuccess;
|
||||
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
|
||||
INFO(ALL,"Call to connect returned %s, retrying", strerror(errno)); \
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
}
|
||||
char line[1024];
|
||||
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
#define NCCL_SOCKET_SEND 0
|
||||
|
Loading…
x
Reference in New Issue
Block a user