Rework SYSCHECK macros to better handle retries.

SYSCHECKVAL was not retrying when a retry was needed. Since not all
calls are inside a loop, that means we could silently miss an
EINTR/EAGAIN return code.

Also rework the socket connection code and improve error reporting.
This commit is contained in:
Sylvain Jeaugey 2018-11-05 16:51:52 -08:00
parent 61b50a63ef
commit 302d538b73
2 changed files with 25 additions and 34 deletions

View File

@ -267,46 +267,26 @@ struct ncclComm {
#include <errno.h> #include <errno.h>
// Check system calls // Check system calls
#define SYSCHECK(call, name) do { \ #define SYSCHECK(call, name) do { \
int ret = -1; \ int retval; \
while (ret == -1) { \ SYSCHECKVAL(call, name, retval); \
SYSCHECKVAL(call, name, ret); \ } while (false)
if (ret == -1) { \
INFO(NCCL_ALL,"Got %s, retrying", strerror(errno)); \
}\
} \
} while (0);
#define SYSCHECKVAL(call, name, retval) do { \ #define SYSCHECKVAL(call, name, retval) do { \
retval = call; \ SYSCHECKSYNC(call, name, retval); \
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { \ if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \ WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \ return ncclSystemError; \
} \ } \
} while (0); } while (false)
#define SYSCHECKNTIMES(call, name, times, usec, exptype) do { \ #define SYSCHECKSYNC(call, name, retval) do { \
int ret = -1; \
int count = 0; \
while (ret == -1 && count < times) { \
SYSCHECKVALEXP(call, name, ret, exptype); \
count++; \
if (ret == -1) { \
usleep(usec); \
}\
} \
if (ret == -1) { \
WARN("Call to " name " timeout : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define SYSCHECKVALEXP(call, name, retval, exptype) do { \
retval = call; \ retval = call; \
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && errno != exptype) { \ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
WARN("Call to " name " failed : %s", strerror(errno)); \ INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
return ncclSystemError; \ } else { \
break; \
} \ } \
} while (0); } while(true)
// Propagate errors up // Propagate errors up
#define NCCLCHECK(call) do { \ #define NCCLCHECK(call) do { \

View File

@ -366,8 +366,19 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line)); TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
#endif #endif
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED); int ret;
return ncclSuccess; int retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
INFO(ALL,"Call to connect returned %s, retrying", strerror(errno)); \
usleep(SLEEP_INT);
goto retry;
}
char line[1024];
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;
} }
#define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_SEND 0