Only call the CUDA runtime. That may fix #27.

This commit is contained in:
Sylvain Jeaugey 2016-06-07 16:27:51 -07:00
parent 620491a649
commit d5e507fc7f
3 changed files with 9 additions and 86 deletions

View File

@@ -36,7 +36,6 @@
#include <sched.h> #include <sched.h>
#include <fcntl.h> #include <fcntl.h>
#include <unistd.h> #include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <string.h> #include <string.h>
#include <errno.h> #include <errno.h>
@@ -110,7 +109,7 @@ typedef struct {
pid_t pid; pid_t pid;
ncclMem* hostptr; ncclMem* hostptr;
ncclMem* devptr; ncclMem* devptr;
CUipcMemHandle devipc; cudaIpcMemHandle_t devipc;
size_t buffSize; size_t buffSize;
} RankEntry; } RankEntry;
@@ -299,7 +298,7 @@ static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm)
info->buffSize = comm->buffSize; info->buffSize = comm->buffSize;
info->hostptr = comm->hostMem; info->hostptr = comm->hostMem;
info->devptr = comm->devMem; info->devptr = comm->devMem;
if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) { if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
WARN("rank %d failed to open CUDA IPC handle", rank); WARN("rank %d failed to open CUDA IPC handle", rank);
return ncclUnhandledCudaError; return ncclUnhandledCudaError;
} }
@@ -321,11 +320,11 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
case CLEANUP_NONE: case CLEANUP_NONE:
break; break;
case CLEANUP_CUIPC: case CLEANUP_CUIPC:
res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle); cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].cleanupHandle);
if (res != ncclSuccess) { if (cures != cudaSuccess) {
WARN("rank %d failed to close IPC handle to rank %d", WARN("rank %d failed to close IPC handle to rank %d",
comm->userFromRing[comm->ncclId], comm->userFromRing[d]); comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
retval = (retval == ncclSuccess) ? res : retval; retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
} }
break; break;
case CLEANUP_UNMAP: case CLEANUP_UNMAP:
@@ -462,8 +461,8 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
if (canpeer || myDev == iDev) { if (canpeer || myDev == iDev) {
INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank); INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
comm->ptrs[i].local = ranks[myId].devptr; comm->ptrs[i].local = ranks[myId].devptr;
if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote), if (cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) { ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank); WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
commClearMaps(comm); commClearMaps(comm);
return ncclUnhandledCudaError; return ncclUnhandledCudaError;

View File

@@ -41,12 +41,6 @@ static RetCode (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device); static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
static const char* (*nvmlInternalErrorString)(RetCode r); static const char* (*nvmlInternalErrorString)(RetCode r);
static CUresult (*cuInternalGetErrorString)(CUresult error, const char** pStr);
static CUresult (*cuInternalIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
static CUresult (*cuInternalIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
static CUresult (*cuInternalIpcCloseMemHandle)(CUdeviceptr dptr);
ncclResult_t wrapSymbols(void) { ncclResult_t wrapSymbols(void) {
if (symbolsLoaded) if (symbolsLoaded)
@@ -93,11 +87,6 @@ ncclResult_t wrapSymbols(void) {
LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity); LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
LOAD_SYM(cuhandle, "cuGetErrorString", cuInternalGetErrorString);
LOAD_SYM(cuhandle, "cuIpcGetMemHandle", cuInternalIpcGetMemHandle);
LOAD_SYM(cuhandle, "cuIpcOpenMemHandle", cuInternalIpcOpenMemHandle);
LOAD_SYM(cuhandle, "cuIpcCloseMemHandle", cuInternalIpcCloseMemHandle);
symbolsLoaded = 1; symbolsLoaded = 1;
return ncclSuccess; return ncclSuccess;
@@ -109,11 +98,6 @@ ncclResult_t wrapSymbols(void) {
nvmlInternalDeviceSetCpuAffinity = NULL; nvmlInternalDeviceSetCpuAffinity = NULL;
nvmlInternalDeviceClearCpuAffinity = NULL; nvmlInternalDeviceClearCpuAffinity = NULL;
cuInternalGetErrorString = NULL;
cuInternalIpcGetMemHandle = NULL;
cuInternalIpcOpenMemHandle = NULL;
cuInternalIpcCloseMemHandle = NULL;
if (cuhandle != NULL) dlclose(cuhandle); if (cuhandle != NULL) dlclose(cuhandle);
if (nvmlhandle != NULL) dlclose(nvmlhandle); if (nvmlhandle != NULL) dlclose(nvmlhandle);
return ncclSystemError; return ncclSystemError;
@@ -203,58 +187,3 @@ ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
} }
return ncclSuccess; return ncclSuccess;
} }
// Thin wrapper over the dlsym-loaded CUDA driver entry point cuIpcGetMemHandle.
// Exports an IPC handle for the device allocation at dptr into *pHandle.
// Returns ncclLibWrapperNotSet if wrapSymbols() never loaded the symbol,
// ncclSystemError on a driver failure, ncclSuccess otherwise.
ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) {
  if (cuInternalIpcGetMemHandle == NULL) {
    WARN("lib wrapper not initialized.");
    return ncclLibWrapperNotSet;
  }
  CUresult ret = cuInternalIpcGetMemHandle(pHandle, dptr);
  if (ret != CUDA_SUCCESS) {
    const char* reason = NULL;
    // The error-string symbol is also dlsym-loaded; guard against it being unset.
    if (cuInternalGetErrorString != NULL)
      cuInternalGetErrorString(ret, &reason);
    if (reason != NULL)
      WARN("cuIpcGetMemHandle() failed: %s", reason);
    else
      WARN("cuIpcGetMemHandle() failed: %d", ret);
    return ncclSystemError;
  }
  return ncclSuccess;
}
// Thin wrapper over the dlsym-loaded CUDA driver entry point cuIpcOpenMemHandle.
// Maps a remote process's IPC handle into this process; on success *pdptr holds
// the local device pointer. Flags is forwarded verbatim (e.g.
// CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS). Returns ncclLibWrapperNotSet if
// wrapSymbols() never loaded the symbol, ncclSystemError on a driver failure.
ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) {
  if (cuInternalIpcOpenMemHandle == NULL) {
    WARN("lib wrapper not initialized.");
    return ncclLibWrapperNotSet;
  }
  CUresult ret = cuInternalIpcOpenMemHandle(pdptr, handle, Flags);
  if (ret != CUDA_SUCCESS) {
    const char* reason = NULL;
    // The error-string symbol is also dlsym-loaded; guard against it being unset.
    if (cuInternalGetErrorString != NULL)
      cuInternalGetErrorString(ret, &reason);
    if (reason != NULL)
      WARN("cuIpcOpenMemHandle() failed: %s", reason);
    else
      WARN("cuIpcOpenMemHandle() failed: %d", ret);
    return ncclSystemError;
  }
  return ncclSuccess;
}
// Thin wrapper over the dlsym-loaded CUDA driver entry point cuIpcCloseMemHandle.
// Unmaps a device pointer previously obtained via wrapCuIpcOpenMemHandle().
// Returns ncclLibWrapperNotSet if wrapSymbols() never loaded the symbol,
// ncclSystemError on a driver failure, ncclSuccess otherwise.
ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr) {
  if (cuInternalIpcCloseMemHandle == NULL) {
    WARN("lib wrapper not initialized.");
    return ncclLibWrapperNotSet;
  }
  CUresult ret = cuInternalIpcCloseMemHandle(dptr);
  if (ret != CUDA_SUCCESS) {
    const char* reason = NULL;
    // The error-string symbol is also dlsym-loaded; guard against it being unset.
    if (cuInternalGetErrorString != NULL)
      cuInternalGetErrorString(ret, &reason);
    if (reason != NULL)
      WARN("cuIpcCloseMemHandle() failed: %s", reason);
    else
      WARN("cuIpcCloseMemHandle() failed: %d", ret);
    return ncclSystemError;
  }
  return ncclSuccess;
}

View File

@@ -33,7 +33,6 @@
#define SRC_LIBWRAP_H_ #define SRC_LIBWRAP_H_
#include "core.h" #include "core.h"
#include "cuda.h"
typedef struct nvmlDevice_st* nvmlDevice_t; typedef struct nvmlDevice_st* nvmlDevice_t;
@@ -46,9 +45,5 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device); ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device); ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr);
ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr);
#endif // End include guard #endif // End include guard