Only call the CUDA runtime. That may fix #27.
This commit is contained in:
parent
620491a649
commit
d5e507fc7f
19
src/core.cu
19
src/core.cu
@@ -36,7 +36,6 @@
|
|||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <cuda.h>
|
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
@@ -110,7 +109,7 @@ typedef struct {
|
|||||||
pid_t pid;
|
pid_t pid;
|
||||||
ncclMem* hostptr;
|
ncclMem* hostptr;
|
||||||
ncclMem* devptr;
|
ncclMem* devptr;
|
||||||
CUipcMemHandle devipc;
|
cudaIpcMemHandle_t devipc;
|
||||||
size_t buffSize;
|
size_t buffSize;
|
||||||
} RankEntry;
|
} RankEntry;
|
||||||
|
|
||||||
@@ -299,7 +298,7 @@ static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm)
|
|||||||
info->buffSize = comm->buffSize;
|
info->buffSize = comm->buffSize;
|
||||||
info->hostptr = comm->hostMem;
|
info->hostptr = comm->hostMem;
|
||||||
info->devptr = comm->devMem;
|
info->devptr = comm->devMem;
|
||||||
if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) {
|
if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
|
||||||
WARN("rank %d failed to open CUDA IPC handle", rank);
|
WARN("rank %d failed to open CUDA IPC handle", rank);
|
||||||
return ncclUnhandledCudaError;
|
return ncclUnhandledCudaError;
|
||||||
}
|
}
|
||||||
@@ -321,11 +320,11 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
|
|||||||
case CLEANUP_NONE:
|
case CLEANUP_NONE:
|
||||||
break;
|
break;
|
||||||
case CLEANUP_CUIPC:
|
case CLEANUP_CUIPC:
|
||||||
res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle);
|
cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].cleanupHandle);
|
||||||
if (res != ncclSuccess) {
|
if (cures != cudaSuccess) {
|
||||||
WARN("rank %d failed to close IPC handle to rank %d",
|
WARN("rank %d failed to close IPC handle to rank %d",
|
||||||
comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
|
comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
|
||||||
retval = (retval == ncclSuccess) ? res : retval;
|
retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case CLEANUP_UNMAP:
|
case CLEANUP_UNMAP:
|
||||||
@@ -333,13 +332,13 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
|
|||||||
if (cures != cudaSuccess) {
|
if (cures != cudaSuccess) {
|
||||||
WARN("rank %d failed to unregister handle to rank %d",
|
WARN("rank %d failed to unregister handle to rank %d",
|
||||||
comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
|
comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
|
||||||
retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
|
retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
|
||||||
}
|
}
|
||||||
res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
|
res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
|
||||||
if (res != ncclSuccess) {
|
if (res != ncclSuccess) {
|
||||||
WARN("rank %d failed to unmap handle to rank %d",
|
WARN("rank %d failed to unmap handle to rank %d",
|
||||||
comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
|
comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
|
||||||
retval = (retval == ncclSuccess) ? res : retval;
|
retval = (retval == ncclSuccess) ? res : retval;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@@ -462,8 +461,8 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
|
|||||||
if (canpeer || myDev == iDev) {
|
if (canpeer || myDev == iDev) {
|
||||||
INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
|
INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
|
||||||
comm->ptrs[i].local = ranks[myId].devptr;
|
comm->ptrs[i].local = ranks[myId].devptr;
|
||||||
if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
|
if (cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
|
||||||
ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
|
ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
|
||||||
WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
|
WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
|
||||||
commClearMaps(comm);
|
commClearMaps(comm);
|
||||||
return ncclUnhandledCudaError;
|
return ncclUnhandledCudaError;
|
||||||
|
@@ -41,12 +41,6 @@ static RetCode (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
|
|||||||
static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
|
static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
|
||||||
static const char* (*nvmlInternalErrorString)(RetCode r);
|
static const char* (*nvmlInternalErrorString)(RetCode r);
|
||||||
|
|
||||||
static CUresult (*cuInternalGetErrorString)(CUresult error, const char** pStr);
|
|
||||||
static CUresult (*cuInternalIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
|
|
||||||
static CUresult (*cuInternalIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
|
|
||||||
static CUresult (*cuInternalIpcCloseMemHandle)(CUdeviceptr dptr);
|
|
||||||
|
|
||||||
|
|
||||||
ncclResult_t wrapSymbols(void) {
|
ncclResult_t wrapSymbols(void) {
|
||||||
|
|
||||||
if (symbolsLoaded)
|
if (symbolsLoaded)
|
||||||
@@ -93,11 +87,6 @@ ncclResult_t wrapSymbols(void) {
|
|||||||
LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
|
LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
|
||||||
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
|
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
|
||||||
|
|
||||||
LOAD_SYM(cuhandle, "cuGetErrorString", cuInternalGetErrorString);
|
|
||||||
LOAD_SYM(cuhandle, "cuIpcGetMemHandle", cuInternalIpcGetMemHandle);
|
|
||||||
LOAD_SYM(cuhandle, "cuIpcOpenMemHandle", cuInternalIpcOpenMemHandle);
|
|
||||||
LOAD_SYM(cuhandle, "cuIpcCloseMemHandle", cuInternalIpcCloseMemHandle);
|
|
||||||
|
|
||||||
symbolsLoaded = 1;
|
symbolsLoaded = 1;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
|
||||||
@@ -109,11 +98,6 @@ ncclResult_t wrapSymbols(void) {
|
|||||||
nvmlInternalDeviceSetCpuAffinity = NULL;
|
nvmlInternalDeviceSetCpuAffinity = NULL;
|
||||||
nvmlInternalDeviceClearCpuAffinity = NULL;
|
nvmlInternalDeviceClearCpuAffinity = NULL;
|
||||||
|
|
||||||
cuInternalGetErrorString = NULL;
|
|
||||||
cuInternalIpcGetMemHandle = NULL;
|
|
||||||
cuInternalIpcOpenMemHandle = NULL;
|
|
||||||
cuInternalIpcCloseMemHandle = NULL;
|
|
||||||
|
|
||||||
if (cuhandle != NULL) dlclose(cuhandle);
|
if (cuhandle != NULL) dlclose(cuhandle);
|
||||||
if (nvmlhandle != NULL) dlclose(nvmlhandle);
|
if (nvmlhandle != NULL) dlclose(nvmlhandle);
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
@@ -203,58 +187,3 @@ ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
|
|||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) {
|
|
||||||
if (cuInternalIpcGetMemHandle == NULL) {
|
|
||||||
WARN("lib wrapper not initilaized.");
|
|
||||||
return ncclLibWrapperNotSet;
|
|
||||||
}
|
|
||||||
CUresult ret = cuInternalIpcGetMemHandle(pHandle, dptr);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
const char* reason = NULL;
|
|
||||||
cuInternalGetErrorString(ret, &reason);
|
|
||||||
if (reason != NULL)
|
|
||||||
WARN("cuInternalIpcGetMemHandle() failed: %s ", reason);
|
|
||||||
else
|
|
||||||
WARN("cuInternalIpcGetMemHandle() failed: %d ", ret);
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) {
|
|
||||||
if (cuInternalIpcOpenMemHandle == NULL) {
|
|
||||||
WARN("lib wrapper not initilaized.");
|
|
||||||
return ncclLibWrapperNotSet;
|
|
||||||
}
|
|
||||||
CUresult ret = cuInternalIpcOpenMemHandle(pdptr, handle, Flags);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
const char* reason = NULL;
|
|
||||||
cuInternalGetErrorString(ret, &reason);
|
|
||||||
if (reason != NULL)
|
|
||||||
WARN("cuInternalIpcOpenMemHandle() failed: %s ", reason);
|
|
||||||
else
|
|
||||||
WARN("cuInternalIpcOpenMemHandle() failed: %d ", ret);
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr) {
|
|
||||||
if (cuInternalIpcCloseMemHandle == NULL) {
|
|
||||||
WARN("lib wrapper not initilaized.");
|
|
||||||
return ncclLibWrapperNotSet;
|
|
||||||
}
|
|
||||||
CUresult ret = cuInternalIpcCloseMemHandle(dptr);
|
|
||||||
if (ret != CUDA_SUCCESS) {
|
|
||||||
const char* reason = NULL;
|
|
||||||
cuInternalGetErrorString(ret, &reason);
|
|
||||||
if (reason != NULL)
|
|
||||||
WARN("cuInternalIpcCloseMemHandle() failed: %s ", reason);
|
|
||||||
else
|
|
||||||
WARN("cuInternalIpcCloseMemHandle() failed: %d ", ret);
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
@@ -33,7 +33,6 @@
|
|||||||
#define SRC_LIBWRAP_H_
|
#define SRC_LIBWRAP_H_
|
||||||
|
|
||||||
#include "core.h"
|
#include "core.h"
|
||||||
#include "cuda.h"
|
|
||||||
|
|
||||||
typedef struct nvmlDevice_st* nvmlDevice_t;
|
typedef struct nvmlDevice_st* nvmlDevice_t;
|
||||||
|
|
||||||
@@ -46,9 +45,5 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
|
|||||||
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
|
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
|
||||||
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
|
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
|
||||||
|
|
||||||
ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr);
|
|
||||||
ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
|
|
||||||
ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr);
|
|
||||||
|
|
||||||
#endif // End include guard
|
#endif // End include guard
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user