Display host name instead of numeric IP when referring to a peer
This makes debug messages such as "connection closed by peer", "peer message truncated", and "peer collective mismatch" easier to interpret.
This commit is contained in:
parent
b895abcdb8
commit
1382a87306
@ -44,7 +44,7 @@ struct ncclSocket {
|
|||||||
enum ncclSocketState state;
|
enum ncclSocketState state;
|
||||||
};
|
};
|
||||||
|
|
||||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf);
|
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
||||||
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
||||||
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||||
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||||
|
@ -16,12 +16,16 @@
|
|||||||
*
|
*
|
||||||
* Output: "IPv4/IPv6 address<port>"
|
* Output: "IPv4/IPv6 address<port>"
|
||||||
*/
|
*/
|
||||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) {
|
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
|
||||||
if (buf == NULL || addr == NULL) return NULL;
|
if (buf == NULL || addr == NULL) return NULL;
|
||||||
struct sockaddr *saddr = &addr->sa;
|
struct sockaddr *saddr = &addr->sa;
|
||||||
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
|
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
|
||||||
char host[NI_MAXHOST], service[NI_MAXSERV];
|
char host[NI_MAXHOST], service[NI_MAXSERV];
|
||||||
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
|
/* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
|
||||||
|
* (When not set, this will still happen in case the node's name cannot be determined.)
|
||||||
|
*/
|
||||||
|
int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
|
||||||
|
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
|
||||||
sprintf(buf, "%s<%s>", host, service);
|
sprintf(buf, "%s<%s>", host, service);
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
@ -516,7 +520,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
|
|||||||
NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
|
NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
|
||||||
if (closed) {
|
if (closed) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line));
|
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
@ -870,8 +870,6 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
|
|||||||
|
|
||||||
static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
|
static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
|
||||||
struct ncclSocket* sock = &peer->sock;
|
struct ncclSocket* sock = &peer->sock;
|
||||||
char buf[SOCKET_NAME_MAXLEN+1];
|
|
||||||
buf[SOCKET_NAME_MAXLEN] = '\0';
|
|
||||||
int id;
|
int id;
|
||||||
struct ncclProxyConnection* connection;
|
struct ncclProxyConnection* connection;
|
||||||
NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
|
NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
|
||||||
@ -889,8 +887,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
|||||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||||
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
|
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
|
||||||
}
|
}
|
||||||
buf[SOCKET_NAME_MAXLEN] = '\0';
|
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
|
||||||
INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport);
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -995,7 +995,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
|
|||||||
if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
|
if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x",
|
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x",
|
||||||
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey);
|
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 0), size, slots[r].size, slots[r].addr, slots[r].rkey);
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
struct ncclIbRequest* req;
|
struct ncclIbRequest* req;
|
||||||
|
@ -500,7 +500,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
// Check size is less or equal to the size provided by the user
|
// Check size is less or equal to the size provided by the user
|
||||||
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
|
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line, 0), data, r->size);
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
r->size = data;
|
r->size = data;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user