Fix potential deadlock during init in multi-thread mode.

Make sure all calls calling cudaMalloc (including devCommSetup) are
called before the last bootstrapBarrier. That way, we avoid calls to
cudaMalloc be blocked by a NCCL kernel launched on another GPU by
another thread which completed init faster.

Resolve #623.
This commit is contained in:
Sylvain Jeaugey 2022-09-22 01:04:50 -07:00
parent f89fd4777d
commit ecab28a7c9

View File

@ -1037,6 +1037,8 @@ collnet_cleanup:
} }
} }
NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
/* Local intra-node barrier */ /* Local intra-node barrier */
NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
} }
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
// update communicator state // update communicator state
comm->initState = ncclSuccess; comm->initState = ncclSuccess;