From d5e507fc7f9579ae442db80a3f1b96b2b79c9465 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Tue, 7 Jun 2016 16:27:51 -0700
Subject: [PATCH 1/3] Only call the CUDA runtime. That may fix #27.

---
 src/core.cu    | 19 +++++++-------
 src/libwrap.cu | 71 --------------------------------------------------
 src/libwrap.h  |  5 ----
 3 files changed, 9 insertions(+), 86 deletions(-)

diff --git a/src/core.cu b/src/core.cu
index cec2794..6d26f87 100644
--- a/src/core.cu
+++ b/src/core.cu
@@ -36,7 +36,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -110,7 +109,7 @@ typedef struct {
   pid_t pid;
   ncclMem* hostptr;
   ncclMem* devptr;
-  CUipcMemHandle devipc;
+  cudaIpcMemHandle_t devipc;
   size_t buffSize;
 } RankEntry;
@@ -299,7 +298,7 @@ static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm)
   info->buffSize = comm->buffSize;
   info->hostptr = comm->hostMem;
   info->devptr = comm->devMem;
-  if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) {
+  if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
     WARN("rank %d failed to open CUDA IPC handle", rank);
     return ncclUnhandledCudaError;
   }
@@ -321,11 +320,11 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
       case CLEANUP_NONE:
         break;
       case CLEANUP_CUIPC:
-        res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle);
-        if (res != ncclSuccess) {
+        cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].cleanupHandle);
+        if (cures != cudaSuccess) {
          WARN("rank %d failed to close IPC handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-          retval = (retval == ncclSuccess) ? res : retval;
+          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
        }
        break;
      case CLEANUP_UNMAP:
@@ -333,13 +332,13 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
        if (cures != cudaSuccess) {
          WARN("rank %d failed to unregister handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
+          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
        }
        res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
        if (res != ncclSuccess) {
          WARN("rank %d failed to unmap handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-          retval = (retval == ncclSuccess) ? res : retval;
+          retval = (retval == ncclSuccess) ? res : retval;
        }
        break;
      default:
@@ -462,8 +461,8 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
     if (canpeer || myDev == iDev) {
       INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
       comm->ptrs[i].local = ranks[myId].devptr;
-      if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
-          ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
+      if (cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
+          ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
         WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
         commClearMaps(comm);
         return ncclUnhandledCudaError;
diff --git a/src/libwrap.cu b/src/libwrap.cu
index c4ae737..93cb818 100644
--- a/src/libwrap.cu
+++ b/src/libwrap.cu
@@ -41,12 +41,6 @@ static RetCode (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
 static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
 static const char* (*nvmlInternalErrorString)(RetCode r);

-static CUresult (*cuInternalGetErrorString)(CUresult error, const char** pStr);
-static CUresult (*cuInternalIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-static CUresult (*cuInternalIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
-static CUresult (*cuInternalIpcCloseMemHandle)(CUdeviceptr dptr);
-
-
 ncclResult_t wrapSymbols(void) {
   if (symbolsLoaded)
@@ -93,11 +87,6 @@ ncclResult_t wrapSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);

-  LOAD_SYM(cuhandle, "cuGetErrorString", cuInternalGetErrorString);
-  LOAD_SYM(cuhandle, "cuIpcGetMemHandle", cuInternalIpcGetMemHandle);
-  LOAD_SYM(cuhandle, "cuIpcOpenMemHandle", cuInternalIpcOpenMemHandle);
-  LOAD_SYM(cuhandle, "cuIpcCloseMemHandle", cuInternalIpcCloseMemHandle);
-
   symbolsLoaded = 1;
   return ncclSuccess;
@@ -109,11 +98,6 @@ ncclResult_t wrapSymbols(void) {
   nvmlInternalDeviceSetCpuAffinity = NULL;
   nvmlInternalDeviceClearCpuAffinity = NULL;

-  cuInternalGetErrorString = NULL;
-  cuInternalIpcGetMemHandle = NULL;
-  cuInternalIpcOpenMemHandle = NULL;
-  cuInternalIpcCloseMemHandle = NULL;
-
   if (cuhandle != NULL) dlclose(cuhandle);
   if (nvmlhandle != NULL) dlclose(nvmlhandle);
   return ncclSystemError;
@@ -203,58 +187,3 @@ ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
   }
   return ncclSuccess;
 }
-
-ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) {
-  if (cuInternalIpcGetMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcGetMemHandle(pHandle, dptr);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcGetMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcGetMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) {
-  if (cuInternalIpcOpenMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcOpenMemHandle(pdptr, handle, Flags);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcOpenMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcOpenMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr) {
-  if (cuInternalIpcCloseMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcCloseMemHandle(dptr);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcCloseMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcCloseMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
diff --git a/src/libwrap.h b/src/libwrap.h
index ad63f1e..b89f54d 100644
--- a/src/libwrap.h
+++ b/src/libwrap.h
@@ -33,7 +33,6 @@
 #define SRC_LIBWRAP_H_

 #include "core.h"
-#include "cuda.h"

 typedef struct nvmlDevice_st* nvmlDevice_t;
@@ -46,9 +45,5 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
 ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
 ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);

-ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
-ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr);
-
 #endif // End include guard
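
Note on PATCH 1/3 (illustrative sketch, not part of the patch): the change drops the dlopen-ed CUDA driver-API IPC wrappers and calls the equivalent CUDA runtime IPC functions directly in core.cu. The sketch below shows that runtime pattern in isolation. The exportDeviceBuffer/importDeviceBuffer names, the CHECK macro, and the out-of-band handle transport are assumptions made for the example; the cudaIpc* calls and the cudaIpcMemLazyEnablePeerAccess flag are the standard runtime API the patch switches to.

// Illustrative only. Compile as a .cu / C++ translation unit.
#include <cuda_runtime.h>
#include <stdio.h>

#define CHECK(cmd) do {                                           \
  cudaError_t e = (cmd);                                          \
  if (e != cudaSuccess) {                                         \
    fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
            cudaGetErrorString(e), __FILE__, __LINE__);           \
    return;                                                       \
  }                                                               \
} while (0)

// Process A: allocate device memory and export an IPC handle for it.
void exportDeviceBuffer(size_t bytes, cudaIpcMemHandle_t* handle) {
  void* devPtr = NULL;
  CHECK(cudaMalloc(&devPtr, bytes));
  CHECK(cudaIpcGetMemHandle(handle, devPtr));  // runtime analog of cuIpcGetMemHandle
  // ... ship *handle to the peer process out of band (pipe, socket, shm) ...
}

// Process B: map the exporter's buffer into this address space, use it, unmap.
void importDeviceBuffer(cudaIpcMemHandle_t handle) {
  void* remotePtr = NULL;
  CHECK(cudaIpcOpenMemHandle(&remotePtr, handle,
                             cudaIpcMemLazyEnablePeerAccess));
  // remotePtr is now a valid device pointer in this process.
  CHECK(cudaIpcCloseMemHandle(remotePtr));     // mirrors the CLEANUP_CUIPC path above
}
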
From aa8f669a3da902c2feb9eb3ca5e0af9ab8e5b713 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev
Date: Mon, 13 Jun 2016 01:48:59 -0700
Subject: [PATCH 2/3] Updating for .deb rebuild

---
 Makefile                | 2 +-
 debian/changelog        | 8 ++++++++
 debian/libnccl1.install | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 90bb40f..6cd8bfb 100644
--- a/Makefile
+++ b/Makefile
@@ -70,7 +70,7 @@ LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduc
 LIBNAME   := libnccl.so
 VER_MAJOR := 1
 VER_MINOR := 2
-VER_PATCH := 2
+VER_PATCH := 3

 INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
diff --git a/debian/changelog b/debian/changelog
index 0d193ea..84060d7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,11 @@
+nccl (1.2.3-1+cuda7.5) trusty; urgency=medium
+
+  * Make NCCL collectives work on communicators with only one rank
+  * Changed CURAND generator to work on a wider set of platforms.
+  * Only call the CUDA runtime.
+
+ -- Sylvain Jeaugey  Mon, 13 Jun 2016 1:45:01 -0800
+
 nccl (1.2.2-1+cuda7.5) trusty; urgency=medium

   * Gencodes changed to NV recommended
diff --git a/debian/libnccl1.install b/debian/libnccl1.install
index 0e2d651..363708d 100644
--- a/debian/libnccl1.install
+++ b/debian/libnccl1.install
@@ -1,2 +1,2 @@
 lib/libnccl.so.1 /usr/lib/x86_64-linux-gnu
-lib/libnccl.so.1.2.2 /usr/lib/x86_64-linux-gnu
+lib/libnccl.so.1.2.3 /usr/lib/x86_64-linux-gnu
From 5d4716a8a38da8563911d5a2a8f0bbbad19b7ca5 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Wed, 15 Jun 2016 10:53:43 -0700
Subject: [PATCH 3/3] Include link to blog post in README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index e8c1e95..05ca7ae 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ Optimized primitives for collective multi-GPU communication.
 ## Introduction

 NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
+[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.

 ## What's inside
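
Appendix (illustrative sketch, not part of the patches): the README paragraph quoted above describes NCCL's collectives for single- and multi-process use. A minimal single-process all-reduce might look like the code below. It assumes two GPUs and the NCCL 1.x API as declared in nccl.h of this era (notably an int element count for ncclAllReduce); the NCCLCHECK macro and buffer sizes are invented for the example, so check the header of the release you build against.

// Illustrative only. Single process driving two GPUs through NCCL 1.x.
#include <nccl.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define NCCLCHECK(cmd) do {                                       \
  ncclResult_t r = (cmd);                                         \
  if (r != ncclSuccess) {                                         \
    fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                                           \
  }                                                               \
} while (0)

int main() {
  const int nDev = 2;           // assumption: two visible GPUs
  const int count = 1 << 20;    // elements reduced per rank
  int devs[nDev] = {0, 1};

  // One communicator per device, all created by a single thread.
  ncclComm_t comms[nDev];
  NCCLCHECK(ncclCommInitAll(comms, nDev, devs));

  float* sendbuff[nDev];
  float* recvbuff[nDev];
  cudaStream_t streams[nDev];
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc((void**)&sendbuff[i], count * sizeof(float));
    cudaMalloc((void**)&recvbuff[i], count * sizeof(float));
    cudaMemset(sendbuff[i], 1, count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Enqueue the collective on each device's stream; calls return asynchronously.
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    NCCLCHECK(ncclAllReduce(sendbuff[i], recvbuff[i], count,
                            ncclFloat, ncclSum, comms[i], streams[i]));
  }

  // Wait for completion, then clean up.
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}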