Merge remote-tracking branch 'github/master' into public
commit 428ec5b2a3
README.md
@@ -5,6 +5,7 @@ Optimized primitives for collective multi-GPU communication.
 ## Introduction
 
 NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
 
+[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.
 
 ## What's inside
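For readers new to the library, the introduction above can be made concrete with a small, hypothetical single-process sketch: one communicator per visible GPU and a single in-place all-reduce. This example is not part of the commit; exact prototypes (the `count`/`devlist` types, the data-type enum names) vary between NCCL releases, newer versions additionally require `ncclGroupStart`/`ncclGroupEnd` around the per-device loop, and error checking is omitted for brevity.

```c
#include <stdlib.h>
#include <cuda_runtime.h>
#include "nccl.h"

int main(void) {
  int nDev = 0;
  cudaGetDeviceCount(&nDev);

  /* One communicator per device, created collectively from a single thread. */
  int* devs = (int*)malloc(nDev * sizeof(int));
  for (int d = 0; d < nDev; ++d) devs[d] = d;
  ncclComm_t* comms = (ncclComm_t*)malloc(nDev * sizeof(ncclComm_t));
  ncclCommInitAll(comms, nDev, devs);

  /* One buffer and one stream per device; reduce the buffers across all GPUs. */
  const int count = 1 << 20;
  float** buff = (float**)malloc(nDev * sizeof(float*));
  cudaStream_t* streams = (cudaStream_t*)malloc(nDev * sizeof(cudaStream_t));
  for (int d = 0; d < nDev; ++d) {
    cudaSetDevice(d);
    cudaMalloc((void**)&buff[d], count * sizeof(float));
    cudaStreamCreate(&streams[d]);
  }

  /* In-place all-reduce: every device ends up with the element-wise sum. */
  for (int d = 0; d < nDev; ++d) {
    cudaSetDevice(d);
    ncclAllReduce(buff[d], buff[d], count, ncclFloat, ncclSum, comms[d], streams[d]);
  }
  for (int d = 0; d < nDev; ++d) {
    cudaSetDevice(d);
    cudaStreamSynchronize(streams[d]);
  }

  for (int d = 0; d < nDev; ++d) ncclCommDestroy(comms[d]);
  return 0;
}
```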
src/core.cu
@@ -36,7 +36,6 @@
 #include <sched.h>
 #include <fcntl.h>
 #include <unistd.h>
-#include <cuda.h>
 #include <cuda_runtime.h>
 #include <string.h>
 #include <errno.h>
@@ -110,7 +109,7 @@ typedef struct {
   pid_t pid;
   ncclMem* hostptr;
   ncclMem* devptr;
-  CUipcMemHandle devipc;
+  cudaIpcMemHandle_t devipc;
   size_t buffSize;
 } RankEntry;
@@ -299,7 +298,7 @@ static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm)
   info->buffSize = comm->buffSize;
   info->hostptr = comm->hostMem;
   info->devptr = comm->devMem;
-  if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) {
+  if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
     WARN("rank %d failed to open CUDA IPC handle", rank);
     return ncclUnhandledCudaError;
   }
@@ -321,11 +320,11 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
     case CLEANUP_NONE:
       break;
     case CLEANUP_CUIPC:
-      res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle);
-      if (res != ncclSuccess) {
+      cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].cleanupHandle);
+      if (cures != cudaSuccess) {
         WARN("rank %d failed to close IPC handle to rank %d",
             comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-        retval = (retval == ncclSuccess) ? res : retval;
+        retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
       }
       break;
     case CLEANUP_UNMAP:
@@ -333,13 +332,13 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
       if (cures != cudaSuccess) {
         WARN("rank %d failed to unregister handle to rank %d",
             comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-        retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
+        retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
       }
       res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
       if (res != ncclSuccess) {
         WARN("rank %d failed to unmap handle to rank %d",
             comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-        retval = (retval == ncclSuccess) ? res : retval;
+        retval = (retval == ncclSuccess) ? res : retval;
       }
       break;
     default:
@@ -468,8 +467,8 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
     if (canpeer || myDev == iDev) {
       INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
       comm->ptrs[i].local = ranks[myId].devptr;
-      if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
-          ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
+      if (cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
+          ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
         WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
         commClearMaps(comm);
         return ncclUnhandledCudaError;
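The src/core.cu hunks above replace the dlopen-based driver-API IPC wrappers (`wrapCuIpc*`) with direct CUDA runtime calls. For reference, here is a hypothetical minimal sketch of the runtime IPC flow the new code relies on: the owning process exports a device allocation with `cudaIpcGetMemHandle`, a peer maps it with `cudaIpcOpenMemHandle`, and the peer unmaps it with `cudaIpcCloseMemHandle` (the `CLEANUP_CUIPC` path above). The helper names and the handle transport are illustrative; NCCL itself carries the handle in the per-rank info (`RankEntry::devipc`) exchanged during communicator setup. Error checking is omitted.

```c
#include <cuda_runtime.h>

/* Exporting side: the process that owns the device allocation. */
void export_buffer(void* devPtr, cudaIpcMemHandle_t* handle) {
  cudaIpcGetMemHandle(handle, devPtr);   /* fill an opaque handle describing devPtr */
  /* ... ship *handle to the peer process (shared memory, socket, pipe, ...) ... */
}

/* Importing side: a peer process that received the handle. */
void* import_buffer(cudaIpcMemHandle_t handle) {
  void* remote = NULL;
  cudaIpcOpenMemHandle(&remote, handle, cudaIpcMemLazyEnablePeerAccess);
  return remote;                         /* usable in kernels and cudaMemcpy on this device */
}

/* Importing-side cleanup, mirroring the CLEANUP_CUIPC case above. */
void release_buffer(void* remote) {
  cudaIpcCloseMemHandle(remote);
}
```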
src/libwrap.cu
@@ -41,12 +41,6 @@ static RetCode (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
 static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
 static const char* (*nvmlInternalErrorString)(RetCode r);
 
-static CUresult (*cuInternalGetErrorString)(CUresult error, const char** pStr);
-static CUresult (*cuInternalIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-static CUresult (*cuInternalIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
-static CUresult (*cuInternalIpcCloseMemHandle)(CUdeviceptr dptr);
-
 
 ncclResult_t wrapSymbols(void) {
   if (symbolsLoaded)
@@ -93,11 +87,6 @@ ncclResult_t wrapSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
 
-  LOAD_SYM(cuhandle, "cuGetErrorString", cuInternalGetErrorString);
-  LOAD_SYM(cuhandle, "cuIpcGetMemHandle", cuInternalIpcGetMemHandle);
-  LOAD_SYM(cuhandle, "cuIpcOpenMemHandle", cuInternalIpcOpenMemHandle);
-  LOAD_SYM(cuhandle, "cuIpcCloseMemHandle", cuInternalIpcCloseMemHandle);
-
   symbolsLoaded = 1;
   return ncclSuccess;
@@ -109,11 +98,6 @@ ncclResult_t wrapSymbols(void) {
   nvmlInternalDeviceSetCpuAffinity = NULL;
   nvmlInternalDeviceClearCpuAffinity = NULL;
 
-  cuInternalGetErrorString = NULL;
-  cuInternalIpcGetMemHandle = NULL;
-  cuInternalIpcOpenMemHandle = NULL;
-  cuInternalIpcCloseMemHandle = NULL;
-
   if (cuhandle != NULL) dlclose(cuhandle);
   if (nvmlhandle != NULL) dlclose(nvmlhandle);
   return ncclSystemError;
@@ -203,58 +187,3 @@ ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
   }
   return ncclSuccess;
 }
-
-ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) {
-  if (cuInternalIpcGetMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcGetMemHandle(pHandle, dptr);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcGetMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcGetMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) {
-  if (cuInternalIpcOpenMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcOpenMemHandle(pdptr, handle, Flags);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcOpenMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcOpenMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr) {
-  if (cuInternalIpcCloseMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcCloseMemHandle(dptr);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcCloseMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcCloseMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
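The wrappers deleted above were resolved at run time through `dlopen`/`dlsym`, the same pattern `wrapSymbols()` keeps using for the NVML entry points that remain. As background only, here is a generic, hypothetical sketch of that pattern; the library name, symbol name, and macro body are illustrative and not NCCL's own code.

```c
#include <stdio.h>
#include <dlfcn.h>

static int (*internalFoo)(int) = NULL;   /* function pointer filled in at load time */

/* Resolve one symbol into a function pointer, or fail the whole load. */
#define LOAD_SYM(handle, symbol, funcptr)                                   \
  do {                                                                      \
    *(void**)(&funcptr) = dlsym(handle, symbol);                            \
    if (funcptr == NULL) {                                                  \
      fprintf(stderr, "dlsym failed for %s: %s\n", symbol, dlerror());      \
      return -1;                                                            \
    }                                                                       \
  } while (0)

int loadSymbols(void) {
  void* handle = dlopen("libfoo.so", RTLD_NOW);  /* illustrative library name */
  if (handle == NULL) {
    fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return -1;
  }
  LOAD_SYM(handle, "foo", internalFoo);          /* internalFoo is now callable */
  return 0;
}
```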
src/libwrap.h
@@ -33,7 +33,6 @@
 #define SRC_LIBWRAP_H_
 
 #include "core.h"
-#include "cuda.h"
 
 typedef struct nvmlDevice_st* nvmlDevice_t;
 
@@ -46,9 +45,5 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
 ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
 ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
 
-ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
-ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr);
-
 #endif // End include guard
 