diff --git a/src/core.cu b/src/core.cu index c7bf89e..1420d21 100644 --- a/src/core.cu +++ b/src/core.cu @@ -388,7 +388,9 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran pid_t iPid = ranks[i].pid; int canpeer = 0; - if (cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) { + int iIsNeighbor = (i == (myNcclId+1)%ndev) || (i == (myNcclId+ndev-1)%ndev); + + if (iIsNeighbor && cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) { INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)", rank, myDev, iRank, iDev); canpeer = 0; @@ -777,12 +779,12 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId } res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->globalMemSpaceBroke); + syncRingDirect(gath, &((*newcomm)->globalMemSpace)); if (res != ncclSuccess) { WARN("rank %d failed to build comm maps", myrank); goto cleanup; } - syncRingDirect(gath, &((*newcomm)->globalMemSpace)); INFO("Global device memory space is %s", (*newcomm)->globalMemSpace ? "enabled" : "disabled"); res = closeGather(gath, ndev); // includes a barrier