Tune chunkSteps for networks with large latency

This commit is contained in:
Sylvain Jeaugey 2022-11-30 01:20:43 -08:00
parent c324f771db
commit a6c8f5e0c2
3 changed files with 11 additions and 0 deletions

View File

@ -1281,6 +1281,12 @@ static ncclResult_t getStepInfo(struct ncclInfo* info) {
info->sliceSteps = ncclParamRingSliceSteps();
}
}
// Make the buffer deeper for network segments with longer latency
if (info->comm->nNodes > 1 && info->comm->netLatency > 100 &&
(info->coll == ncclFuncReduceScatter || info->coll == ncclFuncAllGather || info->coll == ncclFuncAllReduce)) {
info->sliceSteps = 1;
info->chunkSteps = 2;
}
if (info->chunkSteps > NCCL_STEPS/2 || info->sliceSteps > NCCL_STEPS/2) {
WARN("Invalid chunkSteps=%d/sliceSteps=%d, must be at most NCCL_STEPS/2=%d\n", info->chunkSteps, info->sliceSteps, NCCL_STEPS/2);
return ncclInvalidUsage;

View File

@ -218,6 +218,7 @@ struct ncclComm {
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float netLatency;
/* This attribute can indicate the states of communicators and return code of
* asynchronous NCCL operations. */

View File

@ -290,6 +290,10 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
WARN("Error: network %s not found.", netName ? netName : "");
return ncclInvalidUsage;
}
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, 0, &props));
comm->netLatency = props.latency;
return ncclSuccess;
}