Fix potential deadlock during init in multi-thread mode.
Make sure all calls calling cudaMalloc (including devCommSetup) are called before the last bootstrapBarrier. That way, we avoid calls to cudaMalloc be blocked by a NCCL kernel launched on another GPU by another thread which completed init faster. Resolve #623.
This commit is contained in:
parent
f89fd4777d
commit
ecab28a7c9
@ -1037,6 +1037,8 @@ collnet_cleanup:
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
|
||||
|
||||
/* Local intra-node barrier */
|
||||
NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
||||
|
||||
@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
}
|
||||
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
|
||||
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
|
||||
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
|
||||
|
||||
// update communicator state
|
||||
comm->initState = ncclSuccess;
|
||||
|
Loading…
x
Reference in New Issue
Block a user