H800/H100 fixes and tuning.
Re-enable intra-process direct pointer buffer access when CUMEM is
enabled.
Sylvain Jeaugey 2023-09-26 05:56:27 -07:00
parent 3435178b6c
commit 8c6c595185
10 changed files with 96 additions and 74 deletions

View File

@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 19
NCCL_PATCH := 1
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@@ -401,9 +401,9 @@ static ncclResult_t registerIntraNodeBuffers(
/* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
* saturate bandwidth. */
if (info->coll == ncclFuncReduceScatter)
info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
info->nChannels = std::min(5, comm->nvlsChannels);
else
info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
info->nChannels = std::min(4, comm->nvlsChannels);
*outRegBufType = NCCL_NVLS_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
@@ -1358,6 +1358,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
int maxChunkSize = 131072;
if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@@ -1368,6 +1369,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if (info->comm->nNodes >= 4) chunkSize = 65536;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
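// A minimal standalone sketch of the NVLS_TREE chunk-size clamping above. All
// inputs (channel count, NVLS head count, node count, message size and the
// starting chunk size) are hypothetical values, not numbers taken from a real
// communicator.
#include <cstdint>
#include <cstdio>

static uint64_t nvlsTreeChunkSize(uint64_t chunkSize, int nChannels, int nHeads,
                                  int nNodes, uint64_t nBytes) {
  uint64_t concurrentOps = (uint64_t)nChannels * nHeads;
  if (nNodes >= 4) chunkSize = 65536;
  if (nBytes < 32 * concurrentOps * chunkSize && chunkSize > 262144) chunkSize = 262144;
  if (nBytes < 16 * concurrentOps * chunkSize && chunkSize > 131072) chunkSize = 131072;
  if (nBytes <  4 * concurrentOps * chunkSize && chunkSize >  65536) chunkSize =  65536;
  return chunkSize;
}

int main() {
  // 2 nodes, 16 channels x 4 NVLS heads, a 64 MiB buffer, 512 KiB starting chunk:
  // 64 MiB < 32*64*512 KiB -> 262144; 64 MiB < 16*64*256 KiB -> 131072;
  // 64 MiB >= 4*64*128 KiB (32 MiB), so the chunk size settles at 131072.
  printf("%llu\n", (unsigned long long)nvlsTreeChunkSize(524288, 16, 4, 2, 64ull << 20));
  return 0;
}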

View File

@@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
} else {
for (int i=0; i<count; i++) next[i] = scores[i].g;
}
if (system->nodes[NVS].count) {
// NVSwitches prefer that we talk to a limited set of peers. Try to use neighbors first.
int index = gpu-system->nodes[GPU].nodes;
int i;
int prevGpu = (index-1+ngpus)%ngpus;
int nextGpu = (index+1)%ngpus;
int firstGpus[2];
int firstGpuCount = 0;
if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
} else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
} else {
firstGpus[0] = nextGpu; firstGpuCount = 1;
}
for (int g=0; g<firstGpuCount; g++) {
for (i=0; i<count && next[i] != firstGpus[g]; i++);
if (i<count) {
for (; i>0; i--) next[i] = next[i-1];
next[0] = firstGpus[g];
}
}
}
*countPtr = count;
return ncclSuccess;
}
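// A small self-contained illustration of the reordering above, using made-up
// candidate indices: a preferred neighbor already present in next[] is rotated
// to the front while the relative order of the other candidates is preserved.
#include <cstdio>

int main() {
  int next[3] = { 3, 0, 2 };   // hypothetical candidate GPUs, best score first
  int count = 3;
  int firstGpus[1] = { 2 };    // e.g. the ring successor of this GPU
  int firstGpuCount = 1;
  for (int g = 0; g < firstGpuCount; g++) {
    int i;
    for (i = 0; i < count && next[i] != firstGpus[g]; i++);
    if (i < count) {
      for (; i > 0; i--) next[i] = next[i-1];
      next[0] = firstGpus[g];
    }
  }
  for (int i = 0; i < count; i++) printf("%d ", next[i]);  // prints "2 3 0"
  printf("\n");
  return 0;
}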
@@ -555,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0) {
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
// Always try the PCI order first to set a reference, but don't count it against the timeout nor let it run for long
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@@ -794,13 +820,28 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
return ncclSuccess;
}
ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}
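// Worked example for ncclTopoDupChannels() above, with hypothetical numbers: a
// graph found with 8 channels, maxChannels 16 and bwIntra = bwInter = 40 is
// duplicated to 16 channels, each carrying half of the bandwidth.
#include <algorithm>
#include <cstdio>
#define DIVUP(x, y) (((x)+(y)-1)/(y))   // same rounding as NCCL's DIVUP macro

int main() {
  int nChannels = 8, maxChannels = 16;
  float bwIntra = 40.0f, bwInter = 40.0f;
  int dupChannels = std::min(nChannels*2, maxChannels);   // 16
  bwIntra /= DIVUP(dupChannels, nChannels);               // 40 / 2 = 20
  bwInter /= DIVUP(dupChannels, nChannels);               // 20
  printf("%d channels, bwIntra=%g bwInter=%g\n", dupChannels, bwIntra, bwInter);
  return 0;
}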
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
@@ -808,8 +849,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int ngpus = system->nodes[GPU].count;
int crossNic = (system->nodes[NET].count > 1) &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
graph->crossNic = crossNic == 1 ? 1 : 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
@@ -949,6 +990,7 @@ done:
// We have a solution. Start from that solution and move to pass 2.
if (pass == 1) {
time = -1;
NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
speedIndex = 0;
while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@@ -957,27 +999,22 @@ done:
pass = 2;
}
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
if (pass == 2) {
if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && graph->pattern != NCCL_TOPO_PATTERN_NVLS &&
tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
speedIndex > 0) {
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
pass = 3;
}
// 4. See if we can increase bwInter for nvls+tree
if (pass == 3) {
if (time != 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS &&
tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2 &&
speedIndex > 0) {
tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
// See if we can increase bw
if (time != 0 && speedIndex > 0) {
if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
// increase bw for Ring
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
} else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
// increase bwIntra for trees (2 nodes or collnet)
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
@@ -991,18 +1028,6 @@ done:
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}

View File

@@ -878,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
if (ccMax) *ccMax = max;
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
*localRank = g;
return ncclSuccess;
}
}
WARN("Could not find local GPU with rank %d", rank);
return ncclInternalError;
}
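// With ncclTopoGetLocalRank() removed, the transport setup code in the hunks
// below reads the local rank straight off the communicator instead of scanning
// the topology. A minimal sketch of the replacement lookup, using the ncclComm
// fields exactly as they appear in this diff:
//
//   int tpLocalRank = comm->topParentLocalRanks[comm->localRank];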

View File

@@ -127,7 +127,7 @@ struct ncclChannel {
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
/* devPeer pointer array used for host side access */
struct ncclDevChannelPeer** devPeersHostPtr;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;

View File

@@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);

View File

@@ -1174,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
int sendNode = (node+delta)%nNodes;
for (int step=0; step < steps; step++) {
int recvIndex = (localRank-step+steps)%steps;
int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
tasks->p2pRecvOrder[i] = recvRank;
int sendIndex = (localRank+step)%steps;
int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;

View File

@@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
@@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;

View File

@@ -172,7 +172,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
* information for this peer */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req = { 0 };
int localRank, tpProxyRank;
int tpProxyRank;
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@@ -185,8 +185,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
tpProxyRank = comm->topParentRanks[proxyRank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -210,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
/* Setup recv connector */
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req = { 0 };
int localRank;
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@@ -228,8 +226,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
@@ -312,15 +309,13 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
if (map->sameProcess && !ncclCuMemEnable()) {
if (map->cudaDev != comm->cudaDev) {
if (!ncclCuMemEnable()) {
// Enable P2P access for Legacy IPC
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
// Enable P2P access for Legacy IPC
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
}
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {

View File

@@ -256,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
accessDesc.location.id = comm->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id);
*devMemPtr = (void *)dptr;
#else
@@ -288,7 +288,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
}
static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
if (myInfo->pidHash == peerInfo->pidHash) {
if (peerInfo->cudaDev != myInfo->cudaDev) {
// Same PID different GPUs, enable P2P access
// Legacy CUDA IPC
@@ -300,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
#if CUDART_VERSION >= 11030
// cuMem API support
if (ncclCuMemEnable()) {
// Allow direct access to the remote buffer from the local GPU
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = myInfo->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
}
#endif
}
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
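// Background for the cuMemSetAccess() call added in p2pMap() above: memory
// created through the CUDA VMM (cuMem*) driver API is not accessible from any
// device until access is granted explicitly, so even an intra-process peer
// buffer needs a CUmemAccessDesc for the local GPU before the direct pointer
// can be dereferenced. A minimal allocation sketch with error handling elided;
// 'dev' and 'size' are hypothetical inputs, not values from the NCCL code.
#include <cuda.h>

CUdeviceptr allocVmmBuffer(int dev, size_t size) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;

  size_t gran = 0;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size = ((size + gran - 1) / gran) * gran;

  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, size, &prop, 0);

  CUdeviceptr ptr;
  cuMemAddressReserve(&ptr, size, gran, 0, 0);
  cuMemMap(ptr, size, 0, handle, 0);

  // Without this step the mapping exists but any access from 'dev' faults;
  // this mirrors what p2pMap() now does for the importing GPU.
  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = dev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(ptr, size, &accessDesc, 1);
  return ptr;
}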
@@ -342,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
resources->type = P2P_DIRECT;
send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
@@ -406,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
resources->type = P2P_DIRECT;
recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {