Fix random deadlock during ncclCommInitRank.
This commit is contained in:
parent
9de361a1b9
commit
dba3ec9428
@@ -188,7 +188,7 @@ static void syncRingDirect(RankGather* gather, int* ringDirectOk) {
|
||||
swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
|
||||
} while(!swapped);
|
||||
|
||||
- while (gather->bar != 2*ndev) // Wait for all ranks to arrive at this second barrier
|
||||
+ while (gather->bar < 2*ndev) // Wait for all ranks to arrive at this second barrier
|
||||
sched_yield();
|
||||
__sync_synchronize();
|
||||
|
||||
@@ -203,7 +203,7 @@ static ncclResult_t closeGather(RankGather* gather, int ndev) {
|
||||
swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
|
||||
} while(!swapped);
|
||||
|
||||
- while (gather->bar != 3*ndev) // Wait for all ranks to arrive at this third barrier
|
||||
+ while (gather->bar < 3*ndev) // Wait for all ranks to arrive at this third barrier
|
||||
sched_yield();
|
||||
__sync_synchronize();
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user