H800/H100 fixes and tuning.
Re-enable intra-process direct pointer buffer access when CUMEM is
enabled.
Sylvain Jeaugey 2023-09-26 05:56:27 -07:00
parent 3435178b6c
commit 8c6c595185
10 changed files with 96 additions and 74 deletions

View File

@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 19
NCCL_PATCH := 1
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@@ -401,9 +401,9 @@ static ncclResult_t registerIntraNodeBuffers(
/* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
* saturate bandwidth. */
if (info->coll == ncclFuncReduceScatter)
info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
info->nChannels = std::min(5, comm->nvlsChannels);
else
info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
info->nChannels = std::min(4, comm->nvlsChannels);
*outRegBufType = NCCL_NVLS_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
@@ -1358,6 +1358,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
int maxChunkSize = 131072;
if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@@ -1368,6 +1369,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFunc
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if (info->comm->nNodes >= 4) chunkSize = 65536;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
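// A minimal standalone sketch of the NVLS_TREE chunk-size clamping above. All
// inputs (channel count, NVLS head count, node count, message size and the
// starting chunk size) are hypothetical values, not numbers taken from a real
// communicator.
#include <cstdint>
#include <cstdio>

static uint64_t nvlsTreeChunkSize(uint64_t chunkSize, int nChannels, int nHeads,
                                  int nNodes, uint64_t nBytes) {
  uint64_t concurrentOps = (uint64_t)nChannels * nHeads;
  if (nNodes >= 4) chunkSize = 65536;
  if (nBytes < 32 * concurrentOps * chunkSize && chunkSize > 262144) chunkSize = 262144;
  if (nBytes < 16 * concurrentOps * chunkSize && chunkSize > 131072) chunkSize = 131072;
  if (nBytes <  4 * concurrentOps * chunkSize && chunkSize >  65536) chunkSize =  65536;
  return chunkSize;
}

int main() {
  // 2 nodes, 16 channels x 4 NVLS heads, a 64 MiB buffer, 512 KiB starting chunk:
  // 64 MiB < 32*64*512 KiB -> 262144; 64 MiB < 16*64*256 KiB -> 131072;
  // 64 MiB >= 4*64*128 KiB (32 MiB), so the chunk size settles at 131072.
  printf("%llu\n", (unsigned long long)nvlsTreeChunkSize(524288, 16, 4, 2, 64ull << 20));
  return 0;
}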

View File

@@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
} else {
for (int i=0; i<count; i++) next[i] = scores[i].g;
}
if (system->nodes[NVS].count) {
// NVSwitches prefer that we talk to a limited set of peers. Try to use neighbors first.
int index = gpu-system->nodes[GPU].nodes;
int i;
int prevGpu = (index-1+ngpus)%ngpus;
int nextGpu = (index+1)%ngpus;
int firstGpus[2];
int firstGpuCount = 0;
if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
} else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
} else {
firstGpus[0] = nextGpu; firstGpuCount = 1;
}
for (int g=0; g<firstGpuCount; g++) {
for (i=0; i<count && next[i] != firstGpus[g]; i++);
if (i<count) {
for (; i>0; i--) next[i] = next[i-1];
next[0] = firstGpus[g];
}
}
}
*countPtr = count;
return ncclSuccess;
}
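// A small self-contained illustration of the reordering above, using made-up
// candidate indices: a preferred neighbor already present in next[] is rotated
// to the front while the relative order of the other candidates is preserved.
#include <cstdio>

int main() {
  int next[3] = { 3, 0, 2 };   // hypothetical candidate GPUs, best score first
  int count = 3;
  int firstGpus[1] = { 2 };    // e.g. the ring successor of this GPU
  int firstGpuCount = 1;
  for (int g = 0; g < firstGpuCount; g++) {
    int i;
    for (i = 0; i < count && next[i] != firstGpus[g]; i++);
    if (i < count) {
      for (; i > 0; i--) next[i] = next[i-1];
      next[0] = firstGpus[g];
    }
  }
  for (int i = 0; i < count; i++) printf("%d ", next[i]);  // prints "2 3 0"
  printf("\n");
  return 0;
}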
@@ -555,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0) {
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
// Always try the PCI order first to set a reference, but don't count it against the timeout nor let it run for long
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@@ -794,13 +820,28 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
return ncclSuccess;
}
ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}
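// Worked example for ncclTopoDupChannels() above, with hypothetical numbers: a
// graph found with 8 channels, maxChannels 16 and bwIntra = bwInter = 40 is
// duplicated to 16 channels, each carrying half of the bandwidth.
#include <algorithm>
#include <cstdio>
#define DIVUP(x, y) (((x)+(y)-1)/(y))   // same rounding as NCCL's DIVUP macro

int main() {
  int nChannels = 8, maxChannels = 16;
  float bwIntra = 40.0f, bwInter = 40.0f;
  int dupChannels = std::min(nChannels*2, maxChannels);   // 16
  bwIntra /= DIVUP(dupChannels, nChannels);               // 40 / 2 = 20
  bwInter /= DIVUP(dupChannels, nChannels);               // 20
  printf("%d channels, bwIntra=%g bwInter=%g\n", dupChannels, bwIntra, bwInter);
  return 0;
}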
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
@@ -808,8 +849,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int ngpus = system->nodes[GPU].count;
int crossNic = (system->nodes[NET].count > 1) &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
graph->crossNic = crossNic == 1 ? 1 : 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
@@ -949,6 +990,7 @@ done:
// We have a solution. Start from that solution and move to pass 2.
if (pass == 1) {
time = -1;
NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
speedIndex = 0;
while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@@ -957,27 +999,22 @@ done:
pass = 2;
}
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
if (pass == 2) {
if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && graph->pattern != NCCL_TOPO_PATTERN_NVLS &&
tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
speedIndex > 0) {
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
pass = 3;
}
// 4. See if we can increase bwInter for nvls+tree
if (pass == 3) {
if (time != 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS &&
tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2 &&
speedIndex > 0) {
tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
// See if we can increase bw
if (time != 0 && speedIndex > 0) {
if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
// increase bw for Ring
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
tmpGraph.bwInter = speedArray[--speedIndex];
goto search;
} else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
// increase bwIntra for trees (2 nodes or collnet)
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
@@ -991,18 +1028,6 @@ done:
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}

View File

@@ -878,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
if (ccMax) *ccMax = max;
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
*localRank = g;
return ncclSuccess;
}
}
WARN("Could not find local GPU with rank %d", rank);
return ncclInternalError;
}
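// With ncclTopoGetLocalRank() removed, the transport setup code in the hunks
// below reads the local rank straight off the communicator instead of scanning
// the topology. A minimal sketch of the replacement lookup, using the ncclComm
// fields exactly as they appear in this diff:
//
//   int tpLocalRank = comm->topParentLocalRanks[comm->localRank];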

View File

@@ -127,7 +127,7 @@ struct ncclChannel {
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
/* devPeer pointer array used for host side access */
struct ncclDevChannelPeer** devPeersHostPtr;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;

View File

@@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);

View File

@@ -1174,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
int sendNode = (node+delta)%nNodes;
for (int step=0; step < steps; step++) {
int recvIndex = (localRank-step+steps)%steps;
int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
tasks->p2pRecvOrder[i] = recvRank;
int sendIndex = (localRank+step)%steps;
int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;

View File

@@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
@@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;

View File

@@ -172,7 +172,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
* information for this peer */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req = { 0 };
int localRank, tpProxyRank;
int tpProxyRank;
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@@ -185,8 +185,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
tpProxyRank = comm->topParentRanks[proxyRank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -210,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
/* Setup recv connector */
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req = { 0 };
int localRank;
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@@ -228,8 +226,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
@@ -312,15 +309,13 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
if (map->sameProcess && !ncclCuMemEnable()) {
if (map->cudaDev != comm->cudaDev) {
if (!ncclCuMemEnable()) {
// Enable P2P access for Legacy IPC
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
// Enable P2P access for Legacy IPC
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
}
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {

View File

@@ -256,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
accessDesc.location.id = comm->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id);
*devMemPtr = (void *)dptr;
#else
@@ -288,7 +288,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
}
static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
if (myInfo->pidHash == peerInfo->pidHash) {
if (peerInfo->cudaDev != myInfo->cudaDev) {
// Same PID different GPUs, enable P2P access
// Legacy CUDA IPC
@@ -300,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
#if CUDART_VERSION >= 11030
// cuMem API support
if (ncclCuMemEnable()) {
// Allow direct access to the remote buffer from the local GPU
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = myInfo->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
}
#endif
}
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
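// Background for the cuMemSetAccess() call added in p2pMap() above: memory
// created through the CUDA VMM (cuMem*) driver API is not accessible from any
// device until access is granted explicitly, so even an intra-process peer
// buffer needs a CUmemAccessDesc for the local GPU before the direct pointer
// can be dereferenced. A minimal allocation sketch with error handling elided;
// 'dev' and 'size' are hypothetical inputs, not values from the NCCL code.
#include <cuda.h>

CUdeviceptr allocVmmBuffer(int dev, size_t size) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;

  size_t gran = 0;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size = ((size + gran - 1) / gran) * gran;

  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, size, &prop, 0);

  CUdeviceptr ptr;
  cuMemAddressReserve(&ptr, size, gran, 0, 0);
  cuMemMap(ptr, size, 0, handle, 0);

  // Without this step the mapping exists but any access from 'dev' faults;
  // this mirrors what p2pMap() now does for the importing GPU.
  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = dev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(ptr, size, &accessDesc, 1);
  return ptr;
}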
@@ -342,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
resources->type = P2P_DIRECT;
send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
@@ -406,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
resources->type = P2P_DIRECT;
recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {