Merge pull request #822 from KaimingOuyang/github/pytorch-hang-fix

Shutdown socket before close in ncclSocketClose()
This commit is contained in:
David Addison 2023-04-14 19:52:45 -07:00 committed by GitHub
commit 9b7d5edbfc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -818,7 +818,14 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int
ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
if (sock != NULL) {
if (sock->fd >= 0) close(sock->fd);
if (sock->fd >= 0) {
/* shutdown() is needed to send a FIN packet to the proxy thread. Unlike close(), shutdown()
* is not affected by the refcount of the fd: close() will not actually close the fd and send
* a FIN packet if the fd has been duplicated (e.g. by fork()). Calling shutdown() before
* close() therefore guarantees a correct and graceful connection close here. */
shutdown(sock->fd, SHUT_RDWR);
close(sock->fd);
}
sock->state = ncclSocketStateClosed;
sock->fd = -1;
}