Only enable peer access for ring neighbors.
This enables support for systems with more than 9 GPUs attached to a single PCIe root complex.
This commit is contained in:
parent
c219a183d0
commit
8996811936
@ -388,7 +388,9 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
|
||||
pid_t iPid = ranks[i].pid;
|
||||
int canpeer = 0;
|
||||
|
||||
if (cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
|
||||
int iIsNeighbor = (i == (myNcclId+1)%ndev) || (i == (myNcclId+ndev-1)%ndev);
|
||||
|
||||
if (iIsNeighbor && cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
|
||||
INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)",
|
||||
rank, myDev, iRank, iDev);
|
||||
canpeer = 0;
|
||||
@ -777,12 +779,12 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId
|
||||
}
|
||||
|
||||
res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->globalMemSpaceBroke);
|
||||
syncRingDirect(gath, &((*newcomm)->globalMemSpace));
|
||||
if (res != ncclSuccess) {
|
||||
WARN("rank %d failed to build comm maps", myrank);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
syncRingDirect(gath, &((*newcomm)->globalMemSpace));
|
||||
INFO("Global device memory space is %s", (*newcomm)->globalMemSpace ? "enabled" : "disabled");
|
||||
|
||||
res = closeGather(gath, ndev); // includes a barrier
|
||||
|
Loading…
x
Reference in New Issue
Block a user