Display host name instead of numeric IP when referring to a peer
This makes debug messages such as "connection closed by peer", "peer message truncated" and "peer collective mismatch" easier to interpret.
This commit is contained in:
parent
b895abcdb8
commit
1382a87306
@ -44,7 +44,7 @@ struct ncclSocket {
|
||||
enum ncclSocketState state;
|
||||
};
|
||||
|
||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf);
|
||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
||||
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
||||
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||
|
@ -16,12 +16,16 @@
|
||||
*
|
||||
* Output: "IPv4/IPv6 address<port>", or "host name<port>" when numericHostForm is 0 and the name can be determined
|
||||
*/
|
||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) {
|
||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
|
||||
if (buf == NULL || addr == NULL) return NULL;
|
||||
struct sockaddr *saddr = &addr->sa;
|
||||
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
|
||||
char host[NI_MAXHOST], service[NI_MAXSERV];
|
||||
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
|
||||
/* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
|
||||
* (When not set, this will still happen in case the node's name cannot be determined.)
|
||||
*/
|
||||
int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
|
||||
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
|
||||
sprintf(buf, "%s<%s>", host, service);
|
||||
return buf;
|
||||
}
|
||||
@ -516,7 +520,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
|
||||
NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
|
||||
if (closed) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line));
|
||||
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
@ -870,8 +870,6 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
|
||||
|
||||
static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
|
||||
struct ncclSocket* sock = &peer->sock;
|
||||
char buf[SOCKET_NAME_MAXLEN+1];
|
||||
buf[SOCKET_NAME_MAXLEN] = '\0';
|
||||
int id;
|
||||
struct ncclProxyConnection* connection;
|
||||
NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
|
||||
@ -889,8 +887,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
|
||||
}
|
||||
buf[SOCKET_NAME_MAXLEN] = '\0';
|
||||
INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport);
|
||||
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
@ -995,7 +995,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
|
||||
if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x",
|
||||
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey);
|
||||
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 0), size, slots[r].size, slots[r].addr, slots[r].rkey);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclIbRequest* req;
|
||||
|
@ -500,7 +500,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
||||
// Check size is less or equal to the size provided by the user
|
||||
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
|
||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line, 0), data, r->size);
|
||||
return ncclInternalError;
|
||||
}
|
||||
r->size = data;
|
||||
|
Loading…
x
Reference in New Issue
Block a user