Display host name instead of numeric IP when referring to a peer

This makes debug messages such as "connection closed by peer",
"peer message truncated" and "peer collective mismatch" easier to interpret.
This commit is contained in:
Ke Wen 2022-03-08 14:26:34 -08:00 committed by Sylvain Jeaugey
parent b895abcdb8
commit 1382a87306
5 changed files with 11 additions and 10 deletions

View File

@ -44,7 +44,7 @@ struct ncclSocket {
enum ncclSocketState state;
};
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf);
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);

View File

@ -16,12 +16,16 @@
*
* Output: "IPv4/IPv6 address<port>", or "host name<port>" when numericHostForm is 0 and the name can be determined
*/
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) {
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
if (buf == NULL || addr == NULL) return NULL;
struct sockaddr *saddr = &addr->sa;
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
char host[NI_MAXHOST], service[NI_MAXSERV];
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
/* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
* (When not set, this will still happen in case the node's name cannot be determined.)
*/
int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
sprintf(buf, "%s<%s>", host, service);
return buf;
}
@ -516,7 +520,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
if (closed) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line));
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
return ncclSystemError;
}
return ncclSuccess;

View File

@ -870,8 +870,6 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
struct ncclSocket* sock = &peer->sock;
char buf[SOCKET_NAME_MAXLEN+1];
buf[SOCKET_NAME_MAXLEN] = '\0';
int id;
struct ncclProxyConnection* connection;
NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
@ -889,8 +887,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
}
buf[SOCKET_NAME_MAXLEN] = '\0';
INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport);
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
return ncclSuccess;
}

View File

@ -995,7 +995,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x",
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey);
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 0), size, slots[r].size, slots[r].addr, slots[r].rkey);
return ncclInternalError;
}
struct ncclIbRequest* req;

View File

@ -500,7 +500,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
// Check size is less or equal to the size provided by the user
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line, 0), data, r->size);
return ncclInternalError;
}
r->size = data;