Minor fixes for A100 platforms.
Add a WARN for invalid GroupEnd call.
This commit is contained in:
Sylvain Jeaugey 2020-06-22 09:36:20 -07:00
parent 5949d96f36
commit 01afd20a77
4 changed files with 8 additions and 5 deletions

View File

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 7
NCCL_PATCH := 3
NCCL_PATCH := 5
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@ -774,8 +774,8 @@ done:
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->speedIntra /= 2;
graph->speedInter /= 2;
graph->speedIntra /= DIVUP(dupChannels, graph->nChannels);
graph->speedInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
}
return ncclSuccess;

View File

@ -107,9 +107,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float busBw = graphs[a]->nChannels * speed;
if (compCap80) busBw *= 0.92;
// Various model refinements
if (compCap80) busBw = std::min(busBw, 235.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= (comm->nNodes > 1 || coll == ncclCollAllReduce || coll == ncclCollReduce) ? 1.0/4.0 : 1.0/3.0;
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
double maxTreeBw = comm->nNodes > 2 ?

View File

@ -145,7 +145,10 @@ void* ncclAsyncThreadPreconnect(void* args_) {
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
if (ncclGroupMode == 0) return ncclInvalidUsage;
if (ncclGroupMode == 0) {
WARN("ncclGroupEnd: not in a group call.");
return ncclInvalidUsage;
}
ncclGroupMode--;
if (ncclGroupMode > 0) return ncclSuccess;
int savedDev;