Fix potential deadlock during init in multi-thread mode.
Make sure every call that invokes cudaMalloc (including devCommSetup) happens before the last bootstrapBarrier. That way, we avoid a call to cudaMalloc being blocked by an NCCL kernel launched on another GPU by another thread that completed init faster. Resolves #623.
This commit is contained in:
parent
f89fd4777d
commit
ecab28a7c9
@ -1037,6 +1037,8 @@ collnet_cleanup:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
|
||||||
|
|
||||||
/* Local intra-node barrier */
|
/* Local intra-node barrier */
|
||||||
NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
||||||
|
|
||||||
@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
|||||||
}
|
}
|
||||||
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
|
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
|
||||||
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
|
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
|
||||||
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
|
|
||||||
|
|
||||||
// update communicator state
|
// update communicator state
|
||||||
comm->initState = ncclSuccess;
|
comm->initState = ncclSuccess;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user