2.19.3-1

H800/H100 fixes and tuning. Re-enable intra-process direct pointer buffer access when CUMEM is enabled.

parent 3435178b6c
commit 8c6c595185
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
 NCCL_MINOR := 19
-NCCL_PATCH := 1
+NCCL_PATCH := 3
 NCCL_SUFFIX :=
 PKG_REVISION := 1
@@ -401,9 +401,9 @@ static ncclResult_t registerIntraNodeBuffers(
       /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
        * saturate bandwidth. */
       if (info->coll == ncclFuncReduceScatter)
-        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
+        info->nChannels = std::min(5, comm->nvlsChannels);
       else
-        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
+        info->nChannels = std::min(4, comm->nvlsChannels);
       *outRegBufType = NCCL_NVLS_REG_BUFFER;
     }
   } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
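The new clamp bounds the registered-NVLS channel count by the number of NVLS channels the communicator actually created, instead of by the user's CTA limits. A minimal sketch comparing the two expressions, using made-up values rather than real NCCL state:

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumed example: user config allows up to 32 CTAs, but the
      // communicator only set up 2 NVLS channels.
      int minCTAs = 1, maxCTAs = 32, nvlsChannels = 2;
      int oldVal = std::max(minCTAs, std::min(maxCTAs, 5)); // old clamp -> 5
      int newVal = std::min(5, nvlsChannels);               // new clamp -> 2
      printf("old=%d new=%d\n", oldVal, newVal);
      return 0;
    }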
@@ -1358,6 +1358,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_NVLS) {
     int maxChunkSize = 131072;
+    if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
     if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@@ -1368,6 +1369,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
   } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
+    if (info->comm->nNodes >= 4) chunkSize = 65536;
     if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
     if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
     if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
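To see how the new nNodes cap interacts with the existing step-down checks, here is a standalone replay of the NVLS_TREE branch above. All inputs are made-up example values, not taken from a real communicator:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t chunkSize = 524288;        // assumed starting chunk size
      int nNodes = 4, nChannels = 4, nHeads = 4;
      uint64_t concurrentOps = (uint64_t)nChannels * nHeads;  // 16
      uint64_t nBytes = 8 << 20;          // assumed 8 MiB collective

      if (nNodes >= 4) chunkSize = 65536; // new cap added by this commit
      // The step-down checks below no longer fire: chunkSize is already
      // at or below each threshold.
      if (nBytes < 32 * concurrentOps * chunkSize && chunkSize > 262144) chunkSize = 262144;
      if (nBytes < 16 * concurrentOps * chunkSize && chunkSize > 131072) chunkSize = 131072;
      if (nBytes < 4 * concurrentOps * chunkSize && chunkSize > 65536) chunkSize = 65536;
      printf("chunkSize = %llu\n", (unsigned long long)chunkSize); // 65536 here
      return 0;
    }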
@@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
   } else {
     for (int i=0; i<count; i++) next[i] = scores[i].g;
   }
+
+  if (system->nodes[NVS].count) {
+    // NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first.
+    int index = gpu-system->nodes[GPU].nodes;
+    int i;
+    int prevGpu = (index-1+ngpus)%ngpus;
+    int nextGpu = (index+1)%ngpus;
+    int firstGpus[2];
+    int firstGpuCount = 0;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+      firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
+    } else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
+        graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+      firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
+    } else {
+      firstGpus[0] = nextGpu; firstGpuCount = 1;
+    }
+    for (int g=0; g<firstGpuCount; g++) {
+      for (i=0; i<count && next[i] != firstGpus[g]; i++);
+      if (i<count) {
+        for (; i>0; i--) next[i] = next[i-1];
+        next[0] = firstGpus[g];
+      }
+    }
+  }
+
   *countPtr = count;
   return ncclSuccess;
 }
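The inserted block reorders the candidate list next[] so that ring and tree searches try a GPU's immediate neighbors first, keeping NVSwitch traffic on a small set of peers. A self-contained sketch of the same move-to-front rotation on a plain array, with illustrative values only:

    #include <cstdio>

    // Move `preferred` to the front of next[0..count) if present, shifting
    // the intervening entries right — the same loop shape as in the diff.
    static void preferFirst(int* next, int count, int preferred) {
      int i;
      for (i = 0; i < count && next[i] != preferred; i++);
      if (i < count) {
        for (; i > 0; i--) next[i] = next[i-1];
        next[0] = preferred;
      }
    }

    int main() {
      int next[] = { 3, 5, 2, 7 };  // candidate GPUs, best score first
      preferFirst(next, 4, 2);      // 2 is, say, the ring neighbor
      for (int v : next) printf("%d ", v);  // prints: 2 3 5 7
      printf("\n");
      return 0;
    }

Note that the rotation preserves the relative order of the other candidates, so the score-based ranking still decides everything after the preferred neighbors.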
@@ -555,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
       NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
     }
     if (graph->nChannels == 0 || graph->sameChannels == 0) {
-      if (graph->nChannels == 0) {
+      if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
         // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
         int t = 1 << 10;
         NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@@ -794,13 +820,28 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
+  if (graph->nChannels == 0) return ncclSuccess;
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
+  if (graph->bwIntra < 25.0) return ncclSuccess;
+  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
+
+  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
+  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
+  memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
+  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
+  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
+  graph->nChannels = dupChannels;
+  return ncclSuccess;
+}
+
 float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
 float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 
 float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
-float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
 
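ncclTopoDupChannels doubles the channel count (up to maxChannels) and divides the per-channel bandwidth by the same factor, so the graph's aggregate bandwidth is preserved. A small worked example with assumed values:

    #include <algorithm>
    #include <cstdio>

    #define DIVUP(x, y) (((x)+(y)-1)/(y))

    int main() {
      // Assumed: a 4-channel graph at 60/48 GB/s intra/inter bandwidth,
      // with room to grow to 8 channels.
      int nChannels = 4, maxChannels = 8;
      float bwIntra = 60.0f, bwInter = 48.0f;

      int dupChannels = std::min(nChannels*2, maxChannels); // 8
      bwIntra /= DIVUP(dupChannels, nChannels);             // 60 / 2 = 30
      bwInter /= DIVUP(dupChannels, nChannels);             // 48 / 2 = 24
      printf("channels=%d bwIntra=%.1f bwInter=%.1f\n", dupChannels, bwIntra, bwInter);
      return 0;
    }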
@@ -949,6 +990,7 @@ done:
   // We have a solution. Start from that solution and move to pass 2.
   if (pass == 1) {
     time = -1;
+    NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
     speedIndex = 0;
     while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@@ -957,27 +999,22 @@ done:
     pass = 2;
   }
 
-  // 3. See if we can increase bwIntra for trees (2 nodes or collnet)
   if (pass == 2) {
-    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && graph->pattern != NCCL_TOPO_PATTERN_NVLS &&
-        tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
-        speedIndex > 0) {
-      tmpGraph.bwIntra = speedArray[--speedIndex];
-      goto search;
-    }
-    time = -1;
-    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
-    pass = 3;
-  }
-
-  // 4. See if we can increase bwInter for nvls+tree
-  if (pass == 3) {
-    if (time != 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS &&
-        tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2 &&
-        speedIndex > 0) {
-      tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
-      tmpGraph.bwInter = speedArray[--speedIndex];
-      goto search;
+    // See if we can increase bw
+    if (time != 0 && speedIndex > 0) {
+      if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+        // increase bw for Ring
+        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
+        tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
+        tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
+        // increase bwIntra for trees (2 nodes or collnet)
+        tmpGraph.bwIntra = speedArray[--speedIndex];
+        goto search;
+      }
     }
   }
   time = -1;
   memcpy(&tmpGraph, graph, sizeof(tmpGraph));
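In the restructured pass 2, speedArray is sorted in descending order, so decrementing speedIndex retries the search with the next higher bandwidth target. A toy replay of the index arithmetic, reusing the sm90 inter-node array from this diff (the starting bandwidth is an assumed example value):

    #include <cstdio>

    int main() {
      float speedArray[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0,
                             17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
      int nspeeds = sizeof(speedArray)/sizeof(float);
      float bwInter = 20.0f;  // assumed bandwidth of the first solution
      int speedIndex = 0;
      // Same scan as in pass 1: find the first entry not above bwInter.
      while (speedArray[speedIndex] > bwInter && speedIndex < nspeeds-1) speedIndex++;
      printf("start at index %d (%.1f GB/s); the next attempt would try %.1f GB/s\n",
             speedIndex, speedArray[speedIndex], speedArray[speedIndex-1]);
      return 0;
    }

Here the scan lands on index 7 (20.0 GB/s), and `speedArray[--speedIndex]` would retry at 22.0 GB/s — one of the entries this commit adds to sm90SpeedArrayInter.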
@@ -991,18 +1028,6 @@ done:
     graph->typeIntra = graph->typeInter = PATH_SYS;
     graph->nChannels = 1;
   }
 
-  if (graph->nChannels == 0) return ncclSuccess;
-  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
-  if (graph->bwIntra < 25.0) return ncclSuccess;
-  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
-
-  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
-  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
-  memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
-  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
-  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
-  graph->nChannels = dupChannels;
   return ncclSuccess;
 }
 
@@ -878,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
   if (ccMax) *ccMax = max;
   return ncclSuccess;
 }
-
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
-      *localRank = g;
-      return ncclSuccess;
-    }
-  }
-  WARN("Could not find local GPU with rank %d", rank);
-  return ncclInternalError;
-}
@@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
 
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
+  send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
   ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
@@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   // Determine whether we need to flush the GDR buffer on recv or not
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
+  recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
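The hunks above and below replace the ncclTopoGetLocalRank() topology scan with the topParentLocalRanks table the communicator already carries, which avoids an O(nGpus) search and yields the local rank in the top parent communicator's numbering — the numbering the shared proxy keys on. A toy model of the two lookups (names and values are illustrative, not NCCL's real structures):

    #include <cstdio>

    int main() {
      // Hypothetical 4-GPU node; identity mapping unless the comm was split.
      int topParentLocalRanks[4] = { 0, 1, 2, 3 };
      int localRank = 1;  // my index among local GPUs

      // Old path: scan topology nodes for the GPU matching my rank (O(nGpus)).
      // New path: a single array lookup on cached communicator state.
      int tpLocalRank = topParentLocalRanks[localRank];
      printf("tpLocalRank=%d\n", tpLocalRank);
      return 0;
    }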
@@ -172,7 +172,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
  * information for this peer */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank, tpProxyRank;
+  int tpProxyRank;
 
   send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -185,8 +185,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 
   tpProxyRank = comm->topParentRanks[proxyRank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -210,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 /* Setup recv connector */
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank;
 
   recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -228,8 +226,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
@@ -312,7 +309,6 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   if (map->sameProcess && !ncclCuMemEnable()) {
     if (map->cudaDev != comm->cudaDev) {
-      if (!ncclCuMemEnable()) {
       // Enable P2P access for Legacy IPC
       cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
@@ -322,7 +318,6 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
         return ncclInternalError;
       }
     }
-    }
   } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
     if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
     if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
@@ -256,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
   accessDesc.location.id = comm->cudaDev;
   accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
-  TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
+  TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id);
 
   *devMemPtr = (void *)dptr;
 #else
@@ -288,7 +288,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
 }
 
 static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
-  if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
+  if (myInfo->pidHash == peerInfo->pidHash) {
     if (peerInfo->cudaDev != myInfo->cudaDev) {
       // Same PID different GPUs, enable P2P access
       // Legacy CUDA IPC
@@ -300,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
           peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
+#if CUDART_VERSION >= 11030
+      // cuMem API support
+      if (ncclCuMemEnable()) {
+        // Allow direct access to the remote buffer from the local GPU
+        CUmemAccessDesc accessDesc = {};
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = myInfo->cudaDev;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
+        CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
+      }
+#endif
     }
     *devMem = p2pBuff->directPtr;
     *ipcPtr = NULL;
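This is the change the commit message highlights: with cuMem allocations, p2pMap now grants the local GPU read/write access to the peer's buffer so the same-process direct-pointer path works again. For context, a minimal standalone sketch of the underlying CUDA VMM pattern (error handling elided; this is illustrative, not NCCL code):

    #include <cuda.h>

    // Allocate a VMM buffer on dev0 and grant dev1 direct access to it,
    // mirroring the cuMemSetAccess call NCCL issues above.
    CUdeviceptr allocWithPeerAccess(int dev0, int dev1, size_t size) {
      CUmemAllocationProp prop = {};
      prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
      prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      prop.location.id = dev0;

      size_t gran;
      cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
      size = ((size + gran - 1) / gran) * gran;  // round up to granularity

      CUmemGenericAllocationHandle handle;
      cuMemCreate(&handle, size, &prop, 0);

      CUdeviceptr ptr;
      cuMemAddressReserve(&ptr, size, gran, 0, 0);
      cuMemMap(ptr, size, 0, handle, 0);

      // Grant both GPUs read/write on the same mapping.
      CUmemAccessDesc access[2] = {};
      for (int i = 0; i < 2; i++) {
        access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access[i].location.id = (i == 0) ? dev0 : dev1;
        access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
      }
      cuMemSetAccess(ptr, size, access, 2);
      return ptr;  // dereferenceable from kernels on dev0 and dev1
    }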
@@ -342,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
@@ -406,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
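Net effect of these last two hunks: dropping !ncclCuMemEnable() from the gate re-enables the P2P_DIRECT path for same-process peers when the cuMem allocator is active, which the access grant added to p2pMap above makes safe. A toy predicate showing the before/after (flag values are illustrative):

    #include <cstdio>

    int main() {
      bool samePid = true;          // myInfo->pidHash == peerInfo->pidHash
      bool directDisabled = false;  // NCCL_P2P_DIRECT_DISABLE unset
      bool useMemcpy = false;
      bool cuMemEnabled = true;     // cuMem allocator active

      bool oldGate = samePid && !directDisabled && !useMemcpy && !cuMemEnabled;
      bool newGate = samePid && !directDisabled && !useMemcpy;
      printf("P2P_DIRECT old=%d new=%d\n", oldGate, newGate);  // old=0 new=1
      return 0;
    }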