2.19.3-1
H800/H100 fixes and tuning. Re-enable intra-process direct pointer buffer access when CUMEM is enabled.
parent 3435178b6c
commit 8c6c595185
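Annotation (not part of the commit): the "direct pointer buffer access when CUMEM is enabled" in the message refers to buffers allocated through the CUDA cuMem driver API. A minimal standalone sketch of that allocation path is below; it is illustrative only (the CHECK macro, sizes, and device ordinals are assumptions, not NCCL code), but the cuMemSetAccess call at the end is the same pattern this commit adds to p2pMap further down.

  // Sketch: allocate GPU memory with the cuMem API and grant device access.
  #include <cuda.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define CHECK(cmd) do { CUresult r = (cmd); if (r != CUDA_SUCCESS) { \
    fprintf(stderr, "%s failed: %d\n", #cmd, (int)r); exit(1); } } while (0)

  int main() {
    CHECK(cuInit(0));
    CUdevice dev; CUcontext ctx;
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));

    // Describe a pinned device allocation on GPU 0.
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;

    size_t gran, size = 1 << 21;
    CHECK(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    size = (size + gran - 1) / gran * gran;  // round size up to the granularity

    // Create physical memory, reserve a VA range, and map the two together.
    CUmemGenericAllocationHandle handle;
    CHECK(cuMemCreate(&handle, size, &prop, 0));
    CUdeviceptr dptr;
    CHECK(cuMemAddressReserve(&dptr, size, gran, 0, 0));
    CHECK(cuMemMap(dptr, size, 0, handle, 0));

    // Mapped cuMem memory has no access rights until cuMemSetAccess is called.
    // Setting location.id to a peer GPU's ordinal is how a buffer is made
    // directly addressable from another device in the same process.
    CUmemAccessDesc desc = {};
    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc.location.id = 0;  // use the peer device ordinal to share across GPUs
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CHECK(cuMemSetAccess(dptr, size, &desc, 1));

    CHECK(cuMemUnmap(dptr, size));
    CHECK(cuMemAddressFree(dptr, size));
    CHECK(cuMemRelease(handle));
    return 0;
  }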
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
 NCCL_MINOR := 19
-NCCL_PATCH := 1
+NCCL_PATCH := 3
 NCCL_SUFFIX :=
 PKG_REVISION := 1
@@ -401,9 +401,9 @@ static ncclResult_t registerIntraNodeBuffers(
       /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
        * saturate bandwidth. */
       if (info->coll == ncclFuncReduceScatter)
-        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
+        info->nChannels = std::min(5, comm->nvlsChannels);
       else
-        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
+        info->nChannels = std::min(4, comm->nvlsChannels);
       *outRegBufType = NCCL_NVLS_REG_BUFFER;
     }
   } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
@@ -1358,6 +1358,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_NVLS) {
     int maxChunkSize = 131072;
+    if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
     if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@@ -1368,6 +1369,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
   } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
+    if (info->comm->nNodes >= 4) chunkSize = 65536;
    if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
    if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
    if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
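Annotation: the two hunks above tune NVLS chunk sizes; the cap drops to 32K on multi-node systems with weak NVLS allreduce bandwidth, then the chunk shrinks stepwise whenever the message is too small to keep all concurrent operations busy. A standalone restatement of the NVLS_TREE clamping, for illustration only (the function name and the starting chunk size are assumptions, not NCCL's):

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t clampNvlsTreeChunk(uint64_t nBytes, uint64_t concurrentOps,
                                     uint64_t chunkSize, int nNodes) {
    if (nNodes >= 4) chunkSize = 65536;  // new in 2.19.3: cap at 64K for 4+ nodes
    // Shrink the chunk until the message can feed every concurrent operation.
    if (nBytes < 32 * concurrentOps * chunkSize && chunkSize > 262144) chunkSize = 262144;
    if (nBytes < 16 * concurrentOps * chunkSize && chunkSize > 131072) chunkSize = 131072;
    if (nBytes <  4 * concurrentOps * chunkSize && chunkSize >  65536) chunkSize =  65536;
    return chunkSize;
  }

  int main() {
    // E.g. 8 MiB over 64 concurrent ops on 2 nodes: 512K chunks shrink to 64K.
    printf("%lu\n", (unsigned long)clampNvlsTreeChunk(8 << 20, 64, 524288, 2));
    return 0;
  }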
@@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
   } else {
     for (int i=0; i<count; i++) next[i] = scores[i].g;
   }
+
+  if (system->nodes[NVS].count) {
+    // NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first.
+    int index = gpu-system->nodes[GPU].nodes;
+    int i;
+    int prevGpu = (index-1+ngpus)%ngpus;
+    int nextGpu = (index+1)%ngpus;
+    int firstGpus[2];
+    int firstGpuCount = 0;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+      firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
+    } else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
+        graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+      firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
+    } else {
+      firstGpus[0] = nextGpu; firstGpuCount = 1;
+    }
+    for (int g=0; g<firstGpuCount; g++) {
+      for (i=0; i<count && next[i] != firstGpus[g]; i++);
+      if (i<count) {
+        for (; i>0; i--) next[i] = next[i-1];
+        next[0] = firstGpus[g];
+      }
+    }
+  }
+
   *countPtr = count;
   return ncclSuccess;
 }
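Annotation: the block added above biases the candidate order on NVSwitch systems so that ring/tree neighbors are tried first; it locates each preferred GPU in the scored list and rotates it to the front while keeping the relative order of the rest. The idiom in isolation (sketch only, with sample data):

  #include <stdio.h>

  static void moveToFront(int* next, int count, int want) {
    int i;
    for (i = 0; i < count && next[i] != want; i++);  // locate `want`
    if (i < count) {
      for (; i > 0; i--) next[i] = next[i-1];        // shift the prefix right
      next[0] = want;                                // place `want` first
    }
  }

  int main() {
    int next[] = { 3, 5, 1, 7 };
    moveToFront(next, 4, 1);
    for (int i = 0; i < 4; i++) printf("%d ", next[i]);  // prints: 1 3 5 7
    printf("\n");
    return 0;
  }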
@@ -555,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
         NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
       }
       if (graph->nChannels == 0 || graph->sameChannels == 0) {
-        if (graph->nChannels == 0) {
+        if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
           // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
           int t = 1 << 10;
           NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@@ -794,13 +820,28 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
+  if (graph->nChannels == 0) return ncclSuccess;
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
+  if (graph->bwIntra < 25.0) return ncclSuccess;
+  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
+
+  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
+  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
+  memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
+  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
+  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
+  graph->nChannels = dupChannels;
+  return ncclSuccess;
+}
+
 float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
 float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 
 float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
-float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
 
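Annotation: ncclTopoDupChannels, factored out above, doubles the channel count (capped at maxChannels) by duplicating the intra/inter rank arrays, and divides the per-channel bandwidth so the aggregate stays constant. The bandwidth bookkeeping in isolation (illustrative values; DIVUP is NCCL's round-up-divide macro, reproduced here as an assumption):

  #include <stdio.h>
  #include <algorithm>

  #define DIVUP(x, y) (((x) + (y) - 1) / (y))

  int main() {
    int nChannels = 3, maxChannels = 4;
    float bwIntra = 40.0f;

    int dupChannels = std::min(nChannels * 2, maxChannels);  // 4, capped by max
    // 4 channels now carry what 3 did: per-channel bw drops by ceil(4/3) = 2.
    bwIntra /= DIVUP(dupChannels, nChannels);
    printf("channels %d -> %d, bwIntra -> %.1f\n", nChannels, dupChannels, bwIntra);
    return 0;
  }

Note the round-up divide makes the estimate conservative: even a partial duplication (3 to 4 channels) halves the assumed per-channel bandwidth.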
@@ -808,8 +849,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   int ngpus = system->nodes[GPU].count;
   int crossNic = (system->nodes[NET].count > 1) &&
     (graph->pattern == NCCL_TOPO_PATTERN_RING ||
-     graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
-     graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
+    graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
+    graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
   graph->crossNic = crossNic == 1 ? 1 : 0;
   graph->bwIntra = graph->bwInter = 0;
   graph->latencyInter = 0;
@@ -949,6 +990,7 @@ done:
   // We have a solution. Start from that solution and move to pass 2.
   if (pass == 1) {
     time = -1;
+    NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
     speedIndex = 0;
     while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@@ -957,27 +999,22 @@ done:
     pass = 2;
   }
 
-  // 3. See if we can increase bwIntra for trees (2 nodes or collnet)
   if (pass == 2) {
-    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && graph->pattern != NCCL_TOPO_PATTERN_NVLS &&
-        tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
-        speedIndex > 0) {
-      tmpGraph.bwIntra = speedArray[--speedIndex];
-      goto search;
-    }
-    time = -1;
-    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
-    pass = 3;
-  }
-
-  // 4. See if we can increase bwInter for nvls+tree
-  if (pass == 3) {
-    if (time != 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS &&
-        tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2 &&
-        speedIndex > 0) {
-      tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
-      tmpGraph.bwInter = speedArray[--speedIndex];
-      goto search;
+    // See if we can increase bw
+    if (time != 0 && speedIndex > 0) {
+      if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+        // increase bw for Ring
+        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
+        tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
+        tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
+        // increase bwIntra for trees (2 nodes or collnet)
+        tmpGraph.bwIntra = speedArray[--speedIndex];
+        goto search;
+      }
     }
     time = -1;
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
@@ -991,18 +1028,6 @@ done:
     graph->typeIntra = graph->typeInter = PATH_SYS;
     graph->nChannels = 1;
   }
 
-  if (graph->nChannels == 0) return ncclSuccess;
-  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
-  if (graph->bwIntra < 25.0) return ncclSuccess;
-  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
-
-  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
-  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
-  memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
-  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
-  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
-  graph->nChannels = dupChannels;
-
   return ncclSuccess;
 }
@@ -878,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
   if (ccMax) *ccMax = max;
   return ncclSuccess;
 }
-
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
-      *localRank = g;
-      return ncclSuccess;
-    }
-  }
-  WARN("Could not find local GPU with rank %d", rank);
-  return ncclInternalError;
-}
@@ -127,7 +127,7 @@ struct ncclChannel {
   struct ncclChannelPeer** peers;
   struct ncclDevChannelPeer** devPeers;
   /* devPeer pointer array used for host side access */
-  struct ncclDevChannelPeer** devPeersHostPtr;
+  struct ncclDevChannelPeer** devPeersHostPtr;
   struct ncclRing ring;
   int* devRingUserRanks;
   struct ncclTree tree;
@@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
 
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -1174,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     int sendNode = (node+delta)%nNodes;
     for (int step=0; step < steps; step++) {
       int recvIndex = (localRank-step+steps)%steps;
-      int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
+      int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
       tasks->p2pRecvOrder[i] = recvRank;
       int sendIndex = (localRank+step)%steps;
       int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
@@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
+  send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
   ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
@@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   // Determine whether we need to flush the GDR buffer on recv or not
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
+  recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
@@ -172,7 +172,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
  * information for this peer */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank, tpProxyRank;
+  int tpProxyRank;
 
   send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -185,8 +185,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 
   tpProxyRank = comm->topParentRanks[proxyRank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -210,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 /* Setup recv connector */
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank;
 
   recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -228,8 +226,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
@@ -312,15 +309,13 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   if (map->sameProcess && !ncclCuMemEnable()) {
     if (map->cudaDev != comm->cudaDev) {
-      if (!ncclCuMemEnable()) {
-        // Enable P2P access for Legacy IPC
-        cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
-        if (err == cudaErrorPeerAccessAlreadyEnabled) {
-          cudaGetLastError();
-        } else if (err != cudaSuccess) {
-          WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
-          return ncclInternalError;
-        }
-      }
+      // Enable P2P access for Legacy IPC
+      cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError();
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
+        return ncclInternalError;
+      }
     }
   } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
@@ -256,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
   accessDesc.location.id = comm->cudaDev;
   accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
-  TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
+  TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id);
 
   *devMemPtr = (void *)dptr;
 #else
@@ -288,7 +288,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
 }
 
 static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
-  if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
+  if (myInfo->pidHash == peerInfo->pidHash) {
     if (peerInfo->cudaDev != myInfo->cudaDev) {
       // Same PID different GPUs, enable P2P access
       // Legacy CUDA IPC
@@ -300,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
           peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
+#if CUDART_VERSION >= 11030
+      // cuMem API support
+      if (ncclCuMemEnable()) {
+        // Allow direct access to the remote buffer from the local GPU
+        CUmemAccessDesc accessDesc = {};
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = myInfo->cudaDev;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
+        CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
+      }
+#endif
     }
     *devMem = p2pBuff->directPtr;
     *ipcPtr = NULL;
@@ -342,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
@@ -406,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
      resources->type = P2P_DIRECT;
      recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
    } else {
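Annotation: the two p2p hunks above drop the `!ncclCuMemEnable()` term from the P2P_DIRECT condition, which is the "re-enable intra-process direct pointer buffer access" named in the commit message: same-process peers use direct pointers again even when cuMem allocations are active. A condensed restatement of the decision before and after (a paraphrased helper for illustration, not NCCL's API):

  #include <stdbool.h>
  #include <stdio.h>

  static bool useDirect(bool samePid, bool directDisabled, bool useMemcpy,
                        bool cuMemEnabled, bool fixed) {
    bool ok = samePid && !directDisabled && !useMemcpy;
    if (!fixed) ok = ok && !cuMemEnabled;  // 2.19.1: cuMem forced the IPC path
    return ok;                             // 2.19.3: direct pointers again
  }

  int main() {
    // Same process, cuMem enabled: 2.19.1 said no, 2.19.3 says yes.
    printf("before: %d, after: %d\n",
           useDirect(true, false, false, true, false),
           useDirect(true, false, false, true, true));
    return 0;
  }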