Fixed redundant contexts in multi-process apps

Change-Id: If787014450fd281304f0c7baf01d25963e40905d
Author: Nathan Luehr, 2016-07-22 17:29:13 -07:00
parent 7a1aa6b563
commit 55c42ad681
2 changed files with 36 additions and 30 deletions

@@ -422,40 +422,46 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank,
       canpeer = 0;
     }
-    if (canpeer) {
-      cudaError_t err;
-      err = cudaDeviceEnablePeerAccess(iDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
-        INFO("peer access failed between rank %d (dev %d) and rank %d (dev %d)\n",
-            rank, myDev, iRank, iDev);
-        canpeer = 0;
-      }
-    }
     if (iPid == myPid) {
-      if (canpeer || myDev == iDev) {
-        INFO("rank access %d -> %d via P2P device mem", rank, iRank);
+      if (myDev == iDev) {
+        INFO("rank access %d -> %d via common device", rank, iRank);
         comm->ptrs[i].local = ranks[myId].devptr;
         comm->ptrs[i].remote = ranks[i].devptr;
         comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
-      } else { // go through hostmem
-        INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
-        if (j <= 2)
-          *ringDirectFailed = 1;
-        if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
-          WARN("rank %d failed to map zero copy buffer to device", rank);
-          commClearMaps(comm);
-          return ncclUnhandledCudaError;
-        }
-        if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
-          WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
-          commClearMaps(comm);
-          return ncclUnhandledCudaError;
-        }
-        comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
+      } else {
+        int peer_enabled = canpeer;
+        if (canpeer) {
+          cudaError_t p2pErr = cudaDeviceEnablePeerAccess(iDev, 0);
+          if (p2pErr == cudaErrorPeerAccessAlreadyEnabled) {
+            cudaGetLastError();
+          } else if (p2pErr != cudaSuccess) {
+            INFO("peer access failed between rank %d (dev %d) and rank %d (dev %d)\n",
+                rank, myDev, iRank, iDev);
+            peer_enabled = 0;
+          }
+        }
+        if (peer_enabled) {
+          INFO("rank access %d -> %d via P2P device mem", rank, iRank);
+          comm->ptrs[i].local = ranks[myId].devptr;
+          comm->ptrs[i].remote = ranks[i].devptr;
+          comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
+        } else { // go through hostmem
+          INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
+          if (j <= 2)
+            *ringDirectFailed = 1;
+          if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
+            WARN("rank %d failed to map zero copy buffer to device", rank);
+            commClearMaps(comm);
+            return ncclUnhandledCudaError;
+          }
+          if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
+            WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
+            commClearMaps(comm);
+            return ncclUnhandledCudaError;
+          }
+          comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
+        }
       }
     } else { // multi-process!
       *ringDirectFailed = 1;
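
Note on the hunk above: cudaDeviceEnablePeerAccess() is a per-process operation, and enabling access also brings up a CUDA context on the peer GPU in the calling process. The old code attempted it for every peer-capable device, including devices whose ranks belong to other processes, which is where the redundant contexts came from; the new code only tries P2P once the peer rank is known to live in the same process (iPid == myPid). The sketch below shows the same enable-and-fall-back pattern as a standalone program; the helper name enablePeerIfPossible() and the variable names are illustrative, not NCCL's.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <cuda_runtime.h>

static int enablePeerIfPossible(int myDev, int peerDev) {
  int can = 0;
  cudaSetDevice(myDev);                            /* operate on our own device */
  cudaDeviceCanAccessPeer(&can, myDev, peerDev);   /* hardware capability check */
  if (!can) return 0;                              /* caller falls back, e.g. to host memory */
  cudaError_t err = cudaDeviceEnablePeerAccess(peerDev, 0);
  if (err == cudaErrorPeerAccessAlreadyEnabled) {
    cudaGetLastError();                            /* clear the sticky error; access is already on */
    return 1;
  }
  return err == cudaSuccess;
}

int main(void) {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  for (int d = 1; d < ndev; ++d)
    printf("0 -> %d : %s\n", d, enablePeerIfPossible(0, d) ? "P2P enabled" : "no P2P");
  return 0;
}

As in the patch, cudaErrorPeerAccessAlreadyEnabled is harmless but must be cleared with cudaGetLastError() so it does not poison later error checks.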

@@ -70,7 +70,7 @@ int main(int argc, char *argv[]) {
 
   // CUDA stream creation
   cudaStream_t stream;
-  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
   // Initialize input values
   int *dptr;
@@ -91,7 +91,7 @@ int main(int argc, char *argv[]) {
   }
 
   // Check results
-  cudaStreamSynchronize(stream);
+  CUDACHECK(cudaStreamSynchronize(stream));
   CUDACHECK(cudaMemcpy(val, (dptr+SIZE), SIZE*sizeof(int), cudaMemcpyDeviceToHost));
   for (int v=0; v<SIZE; v++) {
     if (val[v] != ref) {
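
The second file simply wraps the two remaining unchecked CUDA calls in CUDACHECK, so a failed stream creation or synchronization aborts the test immediately instead of surfacing later as a bogus data mismatch. For reference, such a macro is conventionally defined along the lines of the sketch below; this is an assumed typical shape, not necessarily NCCL's exact CUDACHECK definition.

/* Conventional CUDA error-checking macro (assumed shape): evaluate the call once
 * and abort with file, line and the CUDA error string on failure. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CUDACHECK(cmd) do {                                   \
  cudaError_t e = (cmd);                                      \
  if (e != cudaSuccess) {                                     \
    fprintf(stderr, "CUDA failure %s:%d '%s'\n",              \
            __FILE__, __LINE__, cudaGetErrorString(e));       \
    exit(EXIT_FAILURE);                                       \
  }                                                           \
} while (0)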