From 8c6c5951854a57ba90c4424fa040497f6defac46 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 26 Sep 2023 05:56:27 -0700
Subject: [PATCH] 2.19.3-1

H800/H100 fixes and tuning.
Re-enable intra-process direct pointer buffer access when CUMEM is enabled.
---
 makefiles/version.mk      |  2 +-
 src/enqueue.cc            |  6 ++-
 src/graph/search.cc       | 97 ++++++++++++++++++++++++---------------
 src/graph/topo.cc         | 11 -----
 src/include/comm.h        |  2 +-
 src/include/graph.h       |  1 -
 src/init.cc               |  2 +-
 src/transport/coll_net.cc |  4 +-
 src/transport/net.cc      | 25 ++++------
 src/transport/p2p.cc      | 20 ++++++--
 10 files changed, 96 insertions(+), 74 deletions(-)

diff --git a/makefiles/version.mk b/makefiles/version.mk
index 218f432..5e32150 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
 NCCL_MINOR := 19
-NCCL_PATCH := 1
+NCCL_PATCH := 3
 NCCL_SUFFIX :=
 PKG_REVISION := 1
diff --git a/src/enqueue.cc b/src/enqueue.cc
index e375a7e..dbb9865 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -401,9 +401,9 @@ static ncclResult_t registerIntraNodeBuffers(
       /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
        * saturate bandwidth. */
       if (info->coll == ncclFuncReduceScatter)
-        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
+        info->nChannels = std::min(5, comm->nvlsChannels);
       else
-        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
+        info->nChannels = std::min(4, comm->nvlsChannels);
       *outRegBufType = NCCL_NVLS_REG_BUFFER;
     }
   } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
@@ -1358,6 +1358,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_NVLS) {
     int maxChunkSize = 131072;
+    if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
     if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@@ -1368,6 +1369,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
   } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
+    if (info->comm->nNodes >= 4) chunkSize = 65536;
     if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
     if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
     if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 46d88c4..3ebb0d4 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
   } else {
     for (int i=0; i<count; i++) next[i] = scores[i].g;
   }
+
+  if (system->nodes[NVS].count) {
+    // NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first.
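+    // The reordering below implements that preference: find this GPU's ring/tree
+    // neighbors (prevGpu/nextGpu) in the sorted candidate list next[], then move
+    // them to the front so the search tries them before any other peer.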
+    int index = gpu-system->nodes[GPU].nodes;
+    int i;
+    int prevGpu = (index-1+ngpus)%ngpus;
+    int nextGpu = (index+1)%ngpus;
+    int firstGpus[2];
+    int firstGpuCount = 0;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+      firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
+    } else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
+        graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+      firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
+    } else {
+      firstGpus[0] = nextGpu; firstGpuCount = 1;
+    }
+    for (int g=0; g<firstGpuCount; g++) {
+      for (i=0; i<count && next[i] != firstGpus[g]; i++);
+      if (i<count) {
+        for (; i>0; i--) next[i] = next[i-1];
+        next[0] = firstGpus[g];
+      }
+    }
+  }
+
   *countPtr = count;
   return ncclSuccess;
 }
@@ -555,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
       NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
     }
     if (graph->nChannels == 0 || graph->sameChannels == 0) {
-      if (graph->nChannels == 0) {
+      if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
         // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
         int t = 1 << 10;
         NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@@ -794,13 +820,28 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
+  if (graph->nChannels == 0) return ncclSuccess;
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
+  if (graph->bwIntra < 25.0) return ncclSuccess;
+  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
+
+  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
+  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
+  memcpy(graph->inter+graph->nChannels*2, graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
+  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
+  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
+  graph->nChannels = dupChannels;
+  return ncclSuccess;
+}
+
 float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
 float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 
 float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
-float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
 
@@ -808,8 +849,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   int ngpus = system->nodes[GPU].count;
   int crossNic = (system->nodes[NET].count > 1) &&
     (graph->pattern == NCCL_TOPO_PATTERN_RING ||
-    graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
-    graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
+     graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
+     graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
   graph->crossNic = crossNic == 1 ? 1 : 0;
   graph->bwIntra = graph->bwInter = 0;
   graph->latencyInter = 0;
@@ -949,6 +990,7 @@ done:
   // We have a solution. Start from that solution and move to pass 2.
   if (pass == 1) {
     time = -1;
+    NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
     speedIndex = 0;
     while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@@ -957,27 +999,22 @@ done:
     pass = 2;
   }
 
-  // 3. See if we can increase bwIntra for trees (2 nodes or collnet)
   if (pass == 2) {
-    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && graph->pattern != NCCL_TOPO_PATTERN_NVLS &&
-        tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
-        speedIndex > 0) {
-      tmpGraph.bwIntra = speedArray[--speedIndex];
-      goto search;
-    }
-    time = -1;
-    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
-    pass = 3;
-  }
-
-  // 4. See if we can increase bwInter for nvls+tree
-  if (pass == 3) {
-    if (time != 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS &&
-        tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2 &&
-        speedIndex > 0) {
-      tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
-      tmpGraph.bwInter = speedArray[--speedIndex];
-      goto search;
+    // See if we can increase bw
+    if (time != 0 && speedIndex > 0) {
+      if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+        // increase bw for Ring
+        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
+        tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
+        tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
+        // increase bwIntra for trees (2 nodes or collnet)
+        tmpGraph.bwIntra = speedArray[--speedIndex];
+        goto search;
+      }
     }
     time = -1;
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
@@ -991,18 +1028,6 @@ done:
     graph->typeIntra = graph->typeInter = PATH_SYS;
     graph->nChannels = 1;
   }
-
-  if (graph->nChannels == 0) return ncclSuccess;
-  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
-  if (graph->bwIntra < 25.0) return ncclSuccess;
-  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
-
-  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
-  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
-  memcpy(graph->inter+graph->nChannels*2, graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
-  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
-  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
-  graph->nChannels = dupChannels;
 
   return ncclSuccess;
 }
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index fc0228a..481def4 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -878,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
   if (ccMax) *ccMax = max;
   return ncclSuccess;
 }
-
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
-      *localRank = g;
-      return ncclSuccess;
-    }
-  }
-  WARN("Could not find local GPU with rank %d", rank);
-  return ncclInternalError;
-}
diff --git a/src/include/comm.h b/src/include/comm.h
index 869ecd3..bc5a9c5 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -127,7 +127,7 @@ struct ncclChannel {
   struct ncclChannelPeer** peers;
   struct ncclDevChannelPeer** devPeers;
   /* devPeer pointer array used for host side access */
-  struct ncclDevChannelPeer** devPeersHostPtr;
+  struct ncclDevChannelPeer** devPeersHostPtr;
   struct ncclRing ring;
   int* devRingUserRanks;
   struct ncclTree tree;
diff --git a/src/include/graph.h b/src/include/graph.h
index 56eee60..fdd6348 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
 
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
diff --git a/src/init.cc b/src/init.cc
index f6dde15..c681f2a 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1174,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       int sendNode = (node+delta)%nNodes;
       for (int step=0; step < steps; step++) {
         int recvIndex = (localRank-step+steps)%steps;
-        int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
+        int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
         tasks->p2pRecvOrder[i] = recvRank;
         int sendIndex = (localRank+step)%steps;
         int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 80aa77c..04bab8b 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
+  send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
   ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
@@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   // Determine whether we need to flush the GDR buffer on recv or not
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
+  recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
diff --git a/src/transport/net.cc b/src/transport/net.cc
index b12a754..0998172 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -172,7 +172,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
  * information for this peer */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank, tpProxyRank;
+  int tpProxyRank;
 
   send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -185,8 +185,7 @@
   tpProxyRank = comm->topParentRanks[proxyRank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -210,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 /* Setup recv connector */
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank;
 
   recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -228,8 +226,7 @@
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
@@ -312,15 +309,13 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   if (map->sameProcess && !ncclCuMemEnable()) {
     if (map->cudaDev != comm->cudaDev) {
-      if (!ncclCuMemEnable()) {
-        // Enable P2P access for Legacy IPC
-        cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
-        if (err == cudaErrorPeerAccessAlreadyEnabled) {
-          cudaGetLastError();
-        } else if (err != cudaSuccess) {
-          WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
-          return ncclInternalError;
-        }
+      // Enable P2P access for Legacy IPC
+      cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError();
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
+        return ncclInternalError;
       }
     }
   } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index fe1d054..3e4dab7 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -256,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
   accessDesc.location.id = comm->cudaDev;
   accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
-  TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
+  TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id);
 
   *devMemPtr = (void *)dptr;
 #else
@@ -288,7 +288,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
 }
 
 static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
-  if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
+  if (myInfo->pidHash == peerInfo->pidHash) {
     if (peerInfo->cudaDev != myInfo->cudaDev) {
       // Same PID different GPUs, enable P2P access
       // Legacy CUDA IPC
@@ -300,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
+#if CUDART_VERSION >= 11030
+      // cuMem API support
+      if (ncclCuMemEnable()) {
+        // Allow direct access to the remote buffer from the local GPU
+        CUmemAccessDesc accessDesc = {};
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = myInfo->cudaDev;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
+        CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
+      }
+#endif
     }
     *devMem = p2pBuff->directPtr;
     *ipcPtr = NULL;
@@ -342,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
@@ -406,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
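
Note on the p2p.cc change above: on the cuMem (NCCL_CUMEM_ENABLE) path, buffers are created through the CUDA VMM driver API, so an intra-process peer GPU is granted direct access to a specific buffer via cuMemSetAccess, rather than device-wide via the legacy cudaDeviceEnablePeerAccess. The standalone sketch below is not part of the patch; it only illustrates that allocate/map/grant flow under the assumption of CUDA 11.3+ and two P2P-capable GPUs visible as devices 0 and 1 (the file name cumem_p2p_sketch.cu is made up).

// cumem_p2p_sketch.cu: allocate with the cuMem (VMM) driver API on GPU 0, then
// grant GPU 1 direct read/write access with cuMemSetAccess, the same call that
// p2pMap() issues above. Build (assumption): nvcc cumem_p2p_sketch.cu -lcuda
#include <cuda.h>
#include <stdio.h>

#define CHECK_CU(cmd) do { \
  CUresult res = (cmd); \
  if (res != CUDA_SUCCESS) { \
    const char* str = NULL; \
    cuGetErrorString(res, &str); \
    fprintf(stderr, "%s failed: %s\n", #cmd, str ? str : "unknown"); \
    return 1; \
  } \
} while (0)

int main() {
  CHECK_CU(cuInit(0));
  CUdevice dev;
  CUcontext ctx;
  CHECK_CU(cuDeviceGet(&dev, 0));
  CHECK_CU(cuCtxCreate(&ctx, 0, dev));

  // Create a physical allocation on device 0. Sizes must be a multiple of the
  // allocation granularity, so use the minimum granularity as the buffer size.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = 0;
  size_t size = 0;
  CHECK_CU(cuMemGetAllocationGranularity(&size, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  CUmemGenericAllocationHandle handle;
  CHECK_CU(cuMemCreate(&handle, size, &prop, 0));

  // Reserve a virtual address range and map the allocation into it.
  CUdeviceptr ptr = 0;
  CHECK_CU(cuMemAddressReserve(&ptr, size, 0, 0, 0));
  CHECK_CU(cuMemMap(ptr, size, 0, handle, 0));

  // Grant read/write access to both GPUs. cuMem mappings carry no implicit
  // access rights: even the owning device needs a descriptor, and without the
  // second one any direct load/store from GPU 1 to this buffer would fault.
  CUmemAccessDesc access[2] = {};
  for (int i = 0; i < 2; i++) {
    access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access[i].location.id = i;  // device 0 (owner) and device 1 (peer)
    access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }
  CHECK_CU(cuMemSetAccess(ptr, size, access, 2));
  printf("buffer %p (%zu bytes) directly accessible from GPU 0 and GPU 1\n", (void*)ptr, size);

  // Teardown in reverse order of setup.
  CHECK_CU(cuMemUnmap(ptr, size));
  CHECK_CU(cuMemAddressFree(ptr, size));
  CHECK_CU(cuMemRelease(handle));
  CHECK_CU(cuCtxDestroy(ctx));
  return 0;
}

In p2pMap() above, the equivalent single-descriptor grant gives the local GPU (myInfo->cudaDev) access to the buffer the peer rank allocated, which is what re-enables P2P_DIRECT inside a process when CUMEM is on.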