From ecab28a7c959af73cd2030b8c2ed281213dc906e Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 22 Sep 2022 01:04:50 -0700 Subject: [PATCH] Fix potential deadlock during init in multi-thread mode. Make sure all functions calling cudaMalloc (including devCommSetup) are called before the last bootstrapBarrier. That way, we avoid calls to cudaMalloc being blocked by a NCCL kernel launched on another GPU by another thread which completed init faster. Resolve #623. --- src/init.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/init.cc b/src/init.cc index 25c8d5d..42c1090 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1037,6 +1037,8 @@ collnet_cleanup: } } + NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore); + /* Local intra-node barrier */ NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); @@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { } NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); - NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); // update communicator state comm->initState = ncclSuccess;