/*
 * Patch summary: optional host-name resolution in ncclSocketToString().
 *
 * - src/include/socket.h, src/misc/socket.cc:
 *   ncclSocketToString() gains a third parameter, `const int numericHostForm`,
 *   which the header declaration defaults to 1. NI_NUMERICSERV is always passed
 *   to getnameinfo(); NI_NUMERICHOST is added only when numericHostForm is
 *   non-zero. Existing two-argument callers therefore keep the previous flags
 *   (NI_NUMERICHOST|NI_NUMERICSERV). The output format "%s<%s>" is unchanged.
 * - ncclSocketProgress() in src/misc/socket.cc, ncclIbIsend() in
 *   src/transport/net_ib.cc and ncclSocketTest() in src/transport/net_socket.cc:
 *   the WARN call sites now pass 0 for numericHostForm.
 * - proxyConnInit() in src/proxy.cc: the local `buf` is removed and the INFO
 *   message prints connection->localRank instead of the socket address string.
 *
 * NOTE(review): the patched sources are C++ (default argument in the
 *   declaration, .cc files), not C.
 * NOTE(review): the getnameinfo() result is discarded via (void) and
 *   host/service are declared without initializers, so on failure the following
 *   sprintf would read indeterminate buffers. This predates the patch and is
 *   not changed by it; consider checking the return value.
 * NOTE(review): with numericHostForm == 0, getnameinfo() presumably may perform
 *   a reverse name lookup on these warning paths; confirm the added latency is
 *   acceptable there.
 * NOTE(review): sprintf into `buf` is unbounded. The call sites visible here
 *   pass `char line[SOCKET_NAME_MAXLEN+1]`; the value of SOCKET_NAME_MAXLEN is
 *   not visible in this patch, so confirm it is at least
 *   NI_MAXHOST + NI_MAXSERV now that `host` may hold a resolved name.
 * NOTE(review): this patch text appears to have lost its original line breaks
 *   (hunk headers such as "-44,7 +44,7" describe multi-line hunks); it likely
 *   needs to be re-exported before it can be applied. TODO confirm.
 */
diff --git a/src/include/socket.h b/src/include/socket.h index 53fda4d..d72480b 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -44,7 +44,7 @@ struct ncclSocket { enum ncclSocketState state; }; -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf); +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 4e3295f..ef2bea6 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -16,12 +16,16 @@ * * Output: "IPv4/IPv6 address" */ -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) { +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { if (buf == NULL || addr == NULL) return NULL; struct sockaddr *saddr = &addr->sa; if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } char host[NI_MAXHOST], service[NI_MAXSERV]; - (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV); + /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. + * (When not set, this will still happen in case the node's name cannot be determined.) + */ + int flag = NI_NUMERICSERV | (numericHostForm ? 
NI_NUMERICHOST : 0); + (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); sprintf(buf, "%s<%s>", host, service); return buf; } @@ -516,7 +520,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); if (closed) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line)); + WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); return ncclSystemError; } return ncclSuccess; diff --git a/src/proxy.cc b/src/proxy.cc index d6fe309..db36b9c 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -870,8 +870,6 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { struct ncclSocket* sock = &peer->sock; - char buf[SOCKET_NAME_MAXLEN+1]; - buf[SOCKET_NAME_MAXLEN] = '\0'; int id; struct ncclProxyConnection* connection; NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); @@ -889,8 +887,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr struct ncclProxyProgressState* state = &comm->proxyState.progressState; NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); } - buf[SOCKET_NAME_MAXLEN] = '\0'; - INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport); + INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? 
"send":"recv", id, connection->localRank, connection->transport); return ncclSuccess; } diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 0a006de..25c589e 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -995,7 +995,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x", - r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey); + r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 0), size, slots[r].size, slots[r].addr, slots[r].rkey); return ncclInternalError; } struct ncclIbRequest* req; diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index d92c46f..8396179 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -500,7 +500,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size); + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line, 0), data, r->size); return ncclInternalError; } r->size = data;