From ecab28a7c959af73cd2030b8c2ed281213dc906e Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 22 Sep 2022 01:04:50 -0700
Subject: [PATCH] Fix potential deadlock during init in multi-thread mode.

Make sure all functions calling cudaMalloc (including devCommSetup) are
called before the last bootstrapBarrier. That way, we avoid calls to
cudaMalloc being blocked by a NCCL kernel launched on another GPU by
another thread which completed init faster.

Resolve #623.
---
 src/init.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/init.cc b/src/init.cc
index 25c8d5d..42c1090 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1037,6 +1037,8 @@ collnet_cleanup:
     }
   }
 
+  NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
+
   /* Local intra-node barrier */
   NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
 
@@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   }
   NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
-  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
 
   // update communicator state
   comm->initState = ncclSuccess;