From d5e507fc7f9579ae442db80a3f1b96b2b79c9465 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Tue, 7 Jun 2016 16:27:51 -0700
Subject: [PATCH 1/3] Only call the CUDA runtime. That may fix #27.

---
 src/core.cu    | 19 +++++++-------
 src/libwrap.cu | 71 --------------------------------------------------
 src/libwrap.h  |  5 ----
 3 files changed, 9 insertions(+), 86 deletions(-)

diff --git a/src/core.cu b/src/core.cu
index cec2794..6d26f87 100644
--- a/src/core.cu
+++ b/src/core.cu
@@ -36,7 +36,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -110,7 +109,7 @@ typedef struct {
   pid_t pid;
   ncclMem* hostptr;
   ncclMem* devptr;
-  CUipcMemHandle devipc;
+  cudaIpcMemHandle_t devipc;
   size_t buffSize;
 } RankEntry;
@@ -299,7 +298,7 @@ static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm)
   info->buffSize = comm->buffSize;
   info->hostptr = comm->hostMem;
   info->devptr = comm->devMem;
-  if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) {
+  if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
     WARN("rank %d failed to open CUDA IPC handle", rank);
     return ncclUnhandledCudaError;
   }
@@ -321,11 +320,11 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
       case CLEANUP_NONE:
         break;
       case CLEANUP_CUIPC:
-        res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle);
-        if (res != ncclSuccess) {
+        cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].cleanupHandle);
+        if (cures != cudaSuccess) {
          WARN("rank %d failed to close IPC handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-          retval = (retval == ncclSuccess) ? res : retval;
+          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
        }
        break;
      case CLEANUP_UNMAP:
@@ -333,13 +332,13 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
        if (cures != cudaSuccess) {
          WARN("rank %d failed to unregister handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
+          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
        }
        res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
        if (res != ncclSuccess) {
          WARN("rank %d failed to unmap handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
-          retval = (retval == ncclSuccess) ? res : retval;
+          retval = (retval == ncclSuccess) ? res : retval;
        }
        break;
      default:
@@ -462,8 +461,8 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
     if (canpeer || myDev == iDev) {
       INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
       comm->ptrs[i].local = ranks[myId].devptr;
-      if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
-          ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
+      if (cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
+          ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
         WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
         commClearMaps(comm);
         return ncclUnhandledCudaError;
diff --git a/src/libwrap.cu b/src/libwrap.cu
index c4ae737..93cb818 100644
--- a/src/libwrap.cu
+++ b/src/libwrap.cu
@@ -41,12 +41,6 @@ static RetCode (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
 static RetCode (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
 static const char* (*nvmlInternalErrorString)(RetCode r);

-static CUresult (*cuInternalGetErrorString)(CUresult error, const char** pStr);
-static CUresult (*cuInternalIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-static CUresult (*cuInternalIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
-static CUresult (*cuInternalIpcCloseMemHandle)(CUdeviceptr dptr);
-
-
 ncclResult_t wrapSymbols(void) {
   if (symbolsLoaded)
@@ -93,11 +87,6 @@ ncclResult_t wrapSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);

-  LOAD_SYM(cuhandle, "cuGetErrorString", cuInternalGetErrorString);
-  LOAD_SYM(cuhandle, "cuIpcGetMemHandle", cuInternalIpcGetMemHandle);
-  LOAD_SYM(cuhandle, "cuIpcOpenMemHandle", cuInternalIpcOpenMemHandle);
-  LOAD_SYM(cuhandle, "cuIpcCloseMemHandle", cuInternalIpcCloseMemHandle);
-
   symbolsLoaded = 1;
   return ncclSuccess;
@@ -109,11 +98,6 @@ ncclResult_t wrapSymbols(void) {
   nvmlInternalDeviceSetCpuAffinity = NULL;
   nvmlInternalDeviceClearCpuAffinity = NULL;

-  cuInternalGetErrorString = NULL;
-  cuInternalIpcGetMemHandle = NULL;
-  cuInternalIpcOpenMemHandle = NULL;
-  cuInternalIpcCloseMemHandle = NULL;
-
   if (cuhandle != NULL) dlclose(cuhandle);
   if (nvmlhandle != NULL) dlclose(nvmlhandle);
   return ncclSystemError;
@@ -203,58 +187,3 @@ ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
   }
   return ncclSuccess;
 }
-
-ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) {
-  if (cuInternalIpcGetMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcGetMemHandle(pHandle, dptr);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcGetMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcGetMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) {
-  if (cuInternalIpcOpenMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcOpenMemHandle(pdptr, handle, Flags);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcOpenMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcOpenMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr) {
-  if (cuInternalIpcCloseMemHandle == NULL) {
-    WARN("lib wrapper not initilaized.");
-    return ncclLibWrapperNotSet;
-  }
-  CUresult ret = cuInternalIpcCloseMemHandle(dptr);
-  if (ret != CUDA_SUCCESS) {
-    const char* reason = NULL;
-    cuInternalGetErrorString(ret, &reason);
-    if (reason != NULL)
-      WARN("cuInternalIpcCloseMemHandle() failed: %s ", reason);
-    else
-      WARN("cuInternalIpcCloseMemHandle() failed: %d ", ret);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
diff --git a/src/libwrap.h b/src/libwrap.h
index ad63f1e..b89f54d 100644
--- a/src/libwrap.h
+++ b/src/libwrap.h
@@ -33,7 +33,6 @@
 #define SRC_LIBWRAP_H_

 #include "core.h"
-#include "cuda.h"

 typedef struct nvmlDevice_st* nvmlDevice_t;
@@ -46,9 +45,5 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
 ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
 ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);

-ncclResult_t wrapCuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-ncclResult_t wrapCuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
-ncclResult_t wrapCuIpcCloseMemHandle(CUdeviceptr dptr);
-
 #endif // End include guard
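
Note on PATCH 1/3 (illustrative sketch, not part of the patch): the change drops the dlopen-ed CUDA driver-API IPC wrappers and calls the equivalent CUDA runtime IPC functions directly in core.cu. The sketch below shows that runtime pattern in isolation. The exportDeviceBuffer/importDeviceBuffer names, the CHECK macro, and the out-of-band handle transport are assumptions made for the example; the cudaIpc* calls and the cudaIpcMemLazyEnablePeerAccess flag are the standard runtime API the patch switches to.

// Illustrative only. Compile as a .cu / C++ translation unit.
#include <cuda_runtime.h>
#include <stdio.h>

#define CHECK(cmd) do {                                           \
  cudaError_t e = (cmd);                                          \
  if (e != cudaSuccess) {                                         \
    fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
            cudaGetErrorString(e), __FILE__, __LINE__);           \
    return;                                                       \
  }                                                               \
} while (0)

// Process A: allocate device memory and export an IPC handle for it.
void exportDeviceBuffer(size_t bytes, cudaIpcMemHandle_t* handle) {
  void* devPtr = NULL;
  CHECK(cudaMalloc(&devPtr, bytes));
  CHECK(cudaIpcGetMemHandle(handle, devPtr));  // runtime analog of cuIpcGetMemHandle
  // ... ship *handle to the peer process out of band (pipe, socket, shm) ...
}

// Process B: map the exporter's buffer into this address space, use it, unmap.
void importDeviceBuffer(cudaIpcMemHandle_t handle) {
  void* remotePtr = NULL;
  CHECK(cudaIpcOpenMemHandle(&remotePtr, handle,
                             cudaIpcMemLazyEnablePeerAccess));
  // remotePtr is now a valid device pointer in this process.
  CHECK(cudaIpcCloseMemHandle(remotePtr));     // mirrors the CLEANUP_CUIPC path above
}
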
From aa8f669a3da902c2feb9eb3ca5e0af9ab8e5b713 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev
Date: Mon, 13 Jun 2016 01:48:59 -0700
Subject: [PATCH 2/3] Updating for .deb rebuild

---
 Makefile                | 2 +-
 debian/changelog        | 8 ++++++++
 debian/libnccl1.install | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 90bb40f..6cd8bfb 100644
--- a/Makefile
+++ b/Makefile
@@ -70,7 +70,7 @@ LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduc
 LIBNAME   := libnccl.so
 VER_MAJOR := 1
 VER_MINOR := 2
-VER_PATCH := 2
+VER_PATCH := 3

 INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
diff --git a/debian/changelog b/debian/changelog
index 0d193ea..84060d7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,11 @@
+nccl (1.2.3-1+cuda7.5) trusty; urgency=medium
+
+  * Make NCCL collectives work on communicators with only one rank
+  * Changed CURAND generator to work on a wider set of platforms.
+  * Only call the CUDA runtime.
+
+ -- Sylvain Jeaugey  Mon, 13 Jun 2016 1:45:01 -0800
+
 nccl (1.2.2-1+cuda7.5) trusty; urgency=medium

   * Gencodes changed to NV recommended
diff --git a/debian/libnccl1.install b/debian/libnccl1.install
index 0e2d651..363708d 100644
--- a/debian/libnccl1.install
+++ b/debian/libnccl1.install
@@ -1,2 +1,2 @@
 lib/libnccl.so.1 /usr/lib/x86_64-linux-gnu
-lib/libnccl.so.1.2.2 /usr/lib/x86_64-linux-gnu
+lib/libnccl.so.1.2.3 /usr/lib/x86_64-linux-gnu
From 5d4716a8a38da8563911d5a2a8f0bbbad19b7ca5 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Wed, 15 Jun 2016 10:53:43 -0700
Subject: [PATCH 3/3] Include link to blog post in README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index e8c1e95..05ca7ae 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ Optimized primitives for collective multi-GPU communication.
 ## Introduction

 NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
+[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.

 ## What's inside
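
Appendix (illustrative sketch, not part of the patches): the README paragraph quoted above describes NCCL's collectives for single- and multi-process use. A minimal single-process all-reduce might look like the code below. It assumes two GPUs and the NCCL 1.x API as declared in nccl.h of this era (notably an int element count for ncclAllReduce); the NCCLCHECK macro and buffer sizes are invented for the example, so check the header of the release you build against.

// Illustrative only. Single process driving two GPUs through NCCL 1.x.
#include <nccl.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define NCCLCHECK(cmd) do {                                       \
  ncclResult_t r = (cmd);                                         \
  if (r != ncclSuccess) {                                         \
    fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                                           \
  }                                                               \
} while (0)

int main() {
  const int nDev = 2;           // assumption: two visible GPUs
  const int count = 1 << 20;    // elements reduced per rank
  int devs[nDev] = {0, 1};

  // One communicator per device, all created by a single thread.
  ncclComm_t comms[nDev];
  NCCLCHECK(ncclCommInitAll(comms, nDev, devs));

  float* sendbuff[nDev];
  float* recvbuff[nDev];
  cudaStream_t streams[nDev];
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc((void**)&sendbuff[i], count * sizeof(float));
    cudaMalloc((void**)&recvbuff[i], count * sizeof(float));
    cudaMemset(sendbuff[i], 1, count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Enqueue the collective on each device's stream; calls return asynchronously.
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    NCCLCHECK(ncclAllReduce(sendbuff[i], recvbuff[i], count,
                            ncclFloat, ncclSum, comms[i], streams[i]));
  }

  // Wait for completion, then clean up.
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}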