Fix potential deadlock during init in multi-thread mode.

Make sure all calls that invoke cudaMalloc (including devCommSetup) are made before the last bootstrapBarrier. That way, we avoid a call to cudaMalloc being blocked by an NCCL kernel launched on another GPU by another thread that completed init faster. Resolves #623.
2022-09-22 01:04:50 -07:00 · 2022-09-22 01:04:50 -07:00 · ecab28a7c9
commit ecab28a7c9
parent f89fd4777d
1 changed files with 2 additions and 1 deletions
--- a/src/init.cc
+++ b/src/init.cc
@@ -1037,6 +1037,8 @@ collnet_cleanup:
    }
  }

+  NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
+
  /* Local intra-node barrier */
  NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));

@@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  }
  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
-  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);

  // update communicator state
  comm->initState = ncclSuccess;