Fixed useRemoteRecv consistency issue.

Change-Id: Ib093a8dc3bb093eddc89dad81d3fffa53c03a6a2
Reviewed-on: http://git-master/r/1013543
Reviewed-by: Cliff Woolley <jwoolley@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
Nathan Luehr 2016-02-18 11:59:54 -08:00 committed by Przemek Tredak
parent 9442285526
commit 5554a4c9f0


@@ -133,6 +133,7 @@ typedef struct {
   union {
     struct {
       volatile int bar;
+      int ringDirectFail;
     };
     char pad[16];
   };
@@ -178,6 +179,22 @@ static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
   return ncclSuccess;
 }
 
+static void syncRingDirect(RankGather* gather, int* ringDirectOk) {
+  int bar_tmp = gather->bar - 1;
+  int ndev = gather->ranks[0].ndev;
+  bool swapped;
+  do {
+    bar_tmp += 1;
+    swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
+  } while(!swapped);
+
+  while (gather->bar != 2*ndev) // Wait for all ranks to arrive at this second barrier
+    sched_yield();
+  __sync_synchronize();
+
+  *ringDirectOk = gather->ringDirectFail ? 0 : 1;
+}
+
 static ncclResult_t closeGather(RankGather* gather, int ndev) {
   int bar_tmp = gather->bar - 1;
   bool swapped;
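
Note on the hunk above: syncRingDirect is the second of the CAS-advanced barrier phases on gather->bar (initGather provides the first, and closeGather now waits for 3*ndev as the third), and only after every rank has arrived does each rank read the shared ringDirectFail flag, so all ranks derive the same useRemoteRecv value. Below is a standalone sketch of that barrier-then-read idea; the struct, thread setup, and names are illustrative, not NCCL code, and it assumes GCC's __sync builtins plus pthreads.

/* Illustrative sketch of the barrier-then-read pattern used by syncRingDirect.
 * Not NCCL code: the shared struct and thread setup are invented for the demo. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NDEV 4

static struct {
  volatile int bar;   /* arrival counter, advanced by CAS */
  int fail;           /* set to 1 by any "rank" that had to fall back */
} shared = {0, 0};

static void* rank_fn(void* arg) {
  int rank = (int)(long)arg;

  if (rank == 2) shared.fail = 1;   /* pretend this rank could not map peer memory */

  /* CAS loop: atomically bump the arrival counter by exactly one. */
  int seen = shared.bar;
  while (!__sync_bool_compare_and_swap(&shared.bar, seen, seen + 1))
    seen = shared.bar;

  /* Wait until every rank has arrived before reading the flag. */
  while (shared.bar != NDEV)
    sched_yield();
  __sync_synchronize();   /* full fence: observe all writes made before the barrier */

  int useRemoteRecv = shared.fail ? 0 : 1;
  printf("rank %d: useRemoteRecv=%d\n", rank, useRemoteRecv);
  return NULL;
}

int main(void) {
  pthread_t t[NDEV];
  for (long r = 0; r < NDEV; ++r) pthread_create(&t[r], NULL, rank_fn, (void*)r);
  for (int r = 0; r < NDEV; ++r) pthread_join(t[r], NULL);
  return 0;   /* every rank prints the same useRemoteRecv value */
}
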
@@ -186,7 +203,7 @@ static ncclResult_t closeGather(RankGather* gather, int ndev) {
     swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
   } while(!swapped);
 
-  while (gather->bar != 2*ndev)
+  while (gather->bar != 3*ndev) // Wait for all ranks to arrive at this third barrier
     sched_yield();
   __sync_synchronize();
 
@@ -347,7 +364,7 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
   return retval;
 }
 
-static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks) {
+static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* ringDirectFailed) {
   int ndev = comm->nDev;
   for(int i=0; i<ndev; ++i) {
     // Check for inconsistencies between ranks
@@ -418,16 +435,16 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
       }
     }
 
-    if (iPid == myPid && (canpeer || myDev == iDev)) {
+    if (iPid == myPid) {
+      if (canpeer || myDev == iDev) {
       INFO("rank access %d -> %d via P2P device mem", rank, iRank);
       comm->ptrs[i].local = ranks[myId].devptr;
       comm->ptrs[i].remote = ranks[i].devptr;
       comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
-    } else if (iPid == myPid) {
+      } else { // go through hostmem
       INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
-      if (j <= 2) {
-        comm->useRemoteRecv = 0;
-      }
+      if (j <= 2)
+        *ringDirectFailed = 1;
       if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
         WARN("rank %d failed to map zero copy buffer to device", rank);
         commClearMaps(comm);
@@ -439,9 +456,11 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
         return ncclUnhandledCudaError;
       }
       comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
-    } else if (canpeer || myDev == iDev) {
+      }
+    } else { // multi-process!
+      *ringDirectFailed = 1;
+      if (canpeer || myDev == iDev) {
       INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
-      comm->useRemoteRecv = 0;
       comm->ptrs[i].local = ranks[myId].devptr;
       if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
             ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
@@ -451,9 +470,8 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
       }
       comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
       comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
-    } else {
+      } else { // go through hostmem
       INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
-      comm->useRemoteRecv = 0;
       if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
         WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
         commClearMaps(comm);
@@ -475,7 +493,7 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
         comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
       }
     }
-    INFO("PushToRecv algos are %s\n", (comm->useRemoteRecv) ? "enabled" : "disabled");
+
   }
   return ncclSuccess;
 }
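
Taken together, the commBuildMaps hunks above replace the scattered comm->useRemoteRecv = 0 writes with a single out-parameter that any rank can raise. A condensed, hedged sketch of the resulting branch ladder follows; the function and parameter names are invented for the summary, not NCCL's, and the real code operates on comm->ptrs[i] and returns ncclResult_t.

/* Condensed, illustrative view of the transport choice in commBuildMaps after this change. */
void chooseTransport(int samePid, int canPeer, int sameDev, int j, int* ringDirectFailed) {
  if (samePid) {
    if (canPeer || sameDev) {
      /* P2P device memory: direct push-to-recv remains possible. */
    } else {
      /* Zero-copy host memory. The j <= 2 test is carried over from the old code;
       * instead of clearing comm->useRemoteRecv locally, the failure is now
       * reported through the shared out-parameter. */
      if (j <= 2)
        *ringDirectFailed = 1;
    }
  } else {
    /* Any cross-process transport rules out direct push-to-recv. */
    *ringDirectFailed = 1;
    if (canPeer || sameDev) {
      /* CUDA IPC device memory. */
    } else {
      /* Zero-copy host shared memory. */
    }
  }
}
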
@@ -679,12 +697,15 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId
     goto cleanup;
   }
 
-  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks);
+  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->ringDirectFail);
   if (res != ncclSuccess) {
     WARN("rank %d failed to build comm maps", myrank);
     goto cleanup;
   }
 
+  syncRingDirect(gath, &((*newcomm)->useRemoteRecv));
+  INFO("PushToRecv algos are %s\n", (*newcomm)->useRemoteRecv ? "enabled" : "disabled");
+
   res = closeGather(gath, ndev); // includes a barrier
   gath = NULL;
   if (res != ncclSuccess) {
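
In the multi-process path above, each rank reports into gath->ringDirectFail during commBuildMaps, and syncRingDirect then hands every rank the same aggregated answer before closeGather runs its final barrier. Below is a hedged sketch of a caller that exercises this path; MPI is only one way to distribute the unique id, the one-GPU-per-process device choice is simplified, and error checking is trimmed.

/* Illustrative multi-process initialization of NCCL 1.x; not taken from this repo. */
#include <mpi.h>
#include <cuda_runtime.h>
#include "nccl.h"

int main(int argc, char* argv[]) {
  int rank, nranks;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  /* Rank 0 creates the id; every process must pass the same bytes to ncclCommInitRank. */
  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  cudaSetDevice(rank);   /* naive one-GPU-per-process mapping, for the sketch only */

  ncclComm_t comm;
  /* Internally: commBuildMaps fills gath->ringDirectFail, syncRingDirect gives all
   * ranks the same useRemoteRecv, and closeGather acts as the closing barrier. */
  ncclCommInitRank(&comm, nranks, id, rank);

  /* ... run collectives ... */

  ncclCommDestroy(comm);
  MPI_Finalize();
  return 0;
}
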
@@ -722,6 +743,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
   char busId[13];
   nvmlDevice_t nvmlHandle;
   int affinity_set = 0;
+  int ringDirectFail = 0; // Assume direct access to recv ptr OK
 
   res = wrapSymbols();
   if (res != ncclSuccess) {
@@ -792,13 +814,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
   for(rank=0; rank<ndev; ++rank) {
     comm = comms[rank];
     cudaSetDevice(comm->cudaDev);
-    res = commBuildMaps(comm, NULL, rank, ranks);
+    res = commBuildMaps(comm, NULL, rank, ranks, &ringDirectFail);
     if (res != ncclSuccess) {
       WARN("rank %d failed to build comm maps", rank);
       goto cleanup;
     }
   }
 
+  INFO("PushToRecv algos are %s\n", (ringDirectFail) ? "disabled" : "enabled");
+  for(rank=0; rank<ndev; ++rank) {
+    comms[rank]->useRemoteRecv = ringDirectFail ? 0 : 1;
+  }
+
   free(ranks);
   ranks = NULL;
   res = ncclSuccess;
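
For the single-process path, a hedged usage sketch of ncclCommInitAll follows: after this change every communicator in comms[] ends up with the same useRemoteRecv, because the one ringDirectFail flag is applied to all of them in the loop above. Device count and cleanup are simplified for illustration.

/* Minimal single-process init sketch for the NCCL 1.x API shown above. */
#include <stdio.h>
#include <stdlib.h>
#include "nccl.h"

int main(void) {
  int ndev = 2;
  int devlist[2] = {0, 1};
  ncclComm_t comms[2];

  /* Builds the maps for every rank, then applies the gathered ringDirectFail
   * result uniformly, so comms[0..ndev-1] agree on useRemoteRecv. */
  ncclResult_t res = ncclCommInitAll(comms, ndev, devlist);
  if (res != ncclSuccess) {
    fprintf(stderr, "ncclCommInitAll failed: %d\n", (int)res);
    return EXIT_FAILURE;
  }

  /* ... launch collectives on per-device streams ... */

  for (int i = 0; i < ndev; ++i)
    ncclCommDestroy(comms[i]);
  return 0;
}
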