Only enable peer access for ring neighbors.

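This enables support for systems with more than 9 GPUs attached to a single PCIe root complex: CUDA limits each device to 8 peer connections, so enabling peer access to every other device fails once more than 9 GPUs are present, whereas a ring needs at most 2 peers per device.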
2017-03-01 15:59:48 -08:00 · 2017-03-01 15:59:48 -08:00 · 8996811936
commit 8996811936
parent c219a183d0
1 changed file with 4 additions and 2 deletions
--- a/src/core.cu
+++ b/src/core.cu
@@ -388,7 +388,9 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
    pid_t iPid = ranks[i].pid;
    int canpeer = 0;

-    if (cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
+    int iIsNeighbor = (i == (myNcclId+1)%ndev) || (i == (myNcclId+ndev-1)%ndev);
+
+    if (iIsNeighbor && cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
      INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)",
        rank, myDev, iRank, iDev);
      canpeer = 0;
@@ -777,12 +779,12 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId
  }

  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->globalMemSpaceBroke);
+  syncRingDirect(gath, &((*newcomm)->globalMemSpace));
  if (res != ncclSuccess) {
    WARN("rank %d failed to build comm maps", myrank);
    goto cleanup;
  }

-  syncRingDirect(gath, &((*newcomm)->globalMemSpace));
  INFO("Global device memory space is %s", (*newcomm)->globalMemSpace ? "enabled" : "disabled");

  res = closeGather(gath, ndev); // includes a barrier