Merge pull request #1217 from crazy-JiangDongHua/bugfix_undo_plan
Bug in plan enqueue logic where plans could silently fail to launch for some communicators. Triggered when all of the following are true: 1. Multiple communicators per ncclGroup. 2. Communicators within a group have different plan counts. 3. Intra-process launch barrier disabled.
This commit is contained in:
commit
6dd51f15bf
@@ -142,7 +142,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
}
|
||||
|
||||
while (true) { // Iterate rounds of launches for clique.
|
||||
bool moreRounds;
|
||||
bool moreRounds = false;
|
||||
comm = cliqueHead;
|
||||
do { // Iterate clique members.
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
@@ -150,7 +150,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
// Barrier reduction result tells us if this was the final round.
|
||||
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
|
||||
} else {
|
||||
moreRounds = comm->unlaunchedPlansHead != nullptr;
|
||||
moreRounds |= comm->unlaunchedPlansHead != nullptr;
|
||||
}
|
||||
if (moreRounds) {
|
||||
// Pop next unlaunched kernel
|
||||
|
Loading…
x
Reference in New Issue
Block a user