From f589932130994928e48eb29ce08d90c6ac11b8f0 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Thu, 2 Dec 2021 12:33:06 -0800 Subject: [PATCH 1/6] Improve warning message about truncated messages Display hints of cause so that it would be easier for user to debug. Also change the error type from InternalError to InvalidUsage as most of time this is caused by a mismatch in collective size or env settings. --- src/transport/net_socket.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index c045a8f..b9b1848 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -421,8 +421,10 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size); - return ncclInternalError; + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ + there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", + socketToString(r->addr, line), data, r->size); + return ncclInvalidUsage; } r->size = data; r->offset = 0; From fbfb6ac5d7f0bc5aa7ee9d85451f670b4eda6dbc Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Tue, 8 Feb 2022 15:21:22 -0800 Subject: [PATCH 2/6] Split IB parameter sanity check into two parts First part on collective mismatch, second part on internal errors --- src/transport/net_ib.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index db27eae..44ef7d0 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -680,11 +680,16 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo #else __sync_synchronize(); // order the readyPtr load against rkey load below // Sanity checks to catch user collective call count/size mismatches - // plus any potential programming errors - if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) { + if (size > slot->size || slot->seq != comm->fifoHead) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/IB : peer %s collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x", - socketToString(req->addr, line), size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead); + WARN("NET/IB : peer %s collective mismatch error, local size %d remote size %d seq %x/%x", + socketToString(req->addr, line), size, slot->size, slot->seq, comm->fifoHead); + return ncclInvalidUsage; + } // plus any potential programming errors + else if (slot->size < 0 || slot->addr == 0 || slot->rkey == 0) { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("NET/IB : peer %s posted incorrect receive info: size %d addr %lx rkey %x", + socketToString(req->addr, line), slot->size, slot->addr, slot->rkey); return ncclInternalError; } wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; From 1c7c014ceb36d06f01175d5655dd01fbf32f63c0 Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Fri, 25 Mar 2022 14:03:04 -0700 Subject: [PATCH 3/6] Remove unnecessary newline in plugin logging Signed-off-by: Felix Abecassis --- src/net.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/net.cc b/src/net.cc index 5f68021..cb65218 100644 --- a/src/net.cc +++ b/src/net.cc @@ -112,7 +112,7 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { const char* envPluginName = getenv("NCCL_NET_PLUGIN"); if (envPluginName && strlen(envPluginName)) { snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName); - INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName); + INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName); } else { sprintf(ncclNetPluginName, "libnccl-net.so"); } From b895abcdb8c335dd8e7cbe6cf4c55e0adbc59904 Mon Sep 17 00:00:00 2001 From: Christopher Hesse Date: Wed, 2 Mar 2022 22:48:35 -0800 Subject: [PATCH 4/6] Fix typo in net_ib.cc --- src/transport/net_ib.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 6e8f215..0a006de 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -841,7 +841,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan else { NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); } - TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*PageSize, mr->rkey); + TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey); cache->population += 1; cache->slots[slot].addr = addr; cache->slots[slot].pages = pages; From 1382a87306ae80cea6e8e8061bbd59ff91f3c69f Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Tue, 8 Mar 2022 14:26:34 -0800 Subject: [PATCH 5/6] Display host name instead of numeric IP when referring to a peer For easier interpretation of debug messages like "connection closed by peer", "peer message truncated" and "peer collective mismatch" --- src/include/socket.h | 2 +- src/misc/socket.cc | 10 +++++++--- src/proxy.cc | 5 +---- src/transport/net_ib.cc | 2 +- src/transport/net_socket.cc | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/include/socket.h b/src/include/socket.h index 53fda4d..d72480b 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -44,7 +44,7 @@ struct ncclSocket { enum ncclSocketState state; }; -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf); +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 4e3295f..ef2bea6 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -16,12 +16,16 @@ * * Output: "IPv4/IPv6 address" */ -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) { +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { if (buf == NULL || addr == NULL) return NULL; struct sockaddr *saddr = &addr->sa; if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } char host[NI_MAXHOST], service[NI_MAXSERV]; - (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV); + /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. + * (When not set, this will still happen in case the node's name cannot be determined.) + */ + int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); + (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); sprintf(buf, "%s<%s>", host, service); return buf; } @@ -516,7 +520,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); if (closed) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line)); + WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); return ncclSystemError; } return ncclSuccess; diff --git a/src/proxy.cc b/src/proxy.cc index d6fe309..db36b9c 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -870,8 +870,6 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { struct ncclSocket* sock = &peer->sock; - char buf[SOCKET_NAME_MAXLEN+1]; - buf[SOCKET_NAME_MAXLEN] = '\0'; int id; struct ncclProxyConnection* connection; NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); @@ -889,8 +887,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr struct ncclProxyProgressState* state = &comm->proxyState.progressState; NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); } - buf[SOCKET_NAME_MAXLEN] = '\0'; - INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport); + INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport); return ncclSuccess; } diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 0a006de..25c589e 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -995,7 +995,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x", - r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey); + r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 0), size, slots[r].size, slots[r].addr, slots[r].rkey); return ncclInternalError; } struct ncclIbRequest* req; diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index d92c46f..8396179 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -500,7 +500,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size); + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line, 0), data, r->size); return ncclInternalError; } r->size = data; From 2247152a8ea68170adad865acf3c8783367c9e39 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 30 Mar 2022 02:14:32 -0700 Subject: [PATCH 6/6] Fix merging error --- src/transport/net_socket.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index ccdca04..9e14aa2 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -502,7 +502,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", - socketToString(&r->ctrlSock->addr, line), data, r->size); + ncclSocketToString(&r->ctrlSock->addr, line), data, r->size); return ncclInvalidUsage; } r->size = data;