Add support for inter-node communication using sockets and InfiniBand/RoCE.
Improve latency.
Add support for aggregation.
Improve LL/regular tuning.
Remove tests as those are now at github.com/nvidia/nccl-tests.
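To put the inter-node item in context: with socket and InfiniBand/RoCE transports, a single communicator can span processes running on different nodes, created from a unique id shared out-of-band. The sketch below is illustrative only and not part of this commit; it assumes MPI is available for bootstrapping and that each process sees exactly one GPU.

```c
/* Illustrative only (not part of this commit): one process per GPU, ranks
 * possibly spread across several nodes. MPI is used solely to broadcast the
 * NCCL unique id; the collective itself runs over NCCL's own transports. */
#include <mpi.h>
#include <nccl.h>
#include <cuda_runtime.h>

int main(int argc, char* argv[]) {
  int rank, nranks;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);                  /* rank 0 creates the id */
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  cudaSetDevice(0);                                     /* assume one visible GPU per process */
  size_t count = 1 << 20;
  float* buf;
  cudaMalloc((void**)&buf, count * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, rank);            /* connects ranks, intra- and inter-node */

  ncclAllReduce(buf, buf, count, ncclFloat, ncclSum, comm, stream);  /* in-place all-reduce */
  cudaStreamSynchronize(stream);

  ncclCommDestroy(comm);
  cudaFree(buf);
  MPI_Finalize();
  return 0;
}
```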
Sylvain Jeaugey 2018-09-24 16:06:59 -07:00
parent 286916a1a3
commit f93fe9bfd9
132 changed files with 12424 additions and 9415 deletions

2
.gitignore vendored

@@ -1,2 +1,4 @@
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
/build
*.gcov
/coverage/


@@ -1,5 +1,5 @@
Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions

250
Makefile

@@ -1,236 +1,30 @@
#
# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
# See LICENSE.txt for license information
#
.PHONY : all clean
CUDA_HOME ?= /usr/local/cuda
PREFIX ?= /usr/local
VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
PROFAPI ?= 0
BUILDDIR ?= build
BUILDDIR := $(abspath $(BUILDDIR))
default : src.build
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg
clean: ${TARGETS:%=%.clean}
test.build: src.build
LICENSE_FILES := LICENSE.txt
LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
lic: $(LICENSE_TARGETS)
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
NVCC ?= $(CUDA_HOME)/bin/nvcc
${BUILDDIR}/%.txt: %.txt
@printf "Copying %-35s > %s\n" $< $@
mkdir -p ${BUILDDIR}
cp $< $@
NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60\
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_60,code=compute_60
src.%:
${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
# Use addprefix so that we can specify more than one path
LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS += -O3
else
NVCUFLAGS += -O0 -G
CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
CXXFLAGS += -Wall -Wextra
else
.SILENT:
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif
NCCL_MAJOR := 1
NCCL_MINOR := 3
NCCL_PATCH := 5
CXXFLAGS += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
.PHONY : all lib staticlib clean test mpitest install deb debian debclean forlib fortest forclean
.DEFAULT : all
INCEXPORTS := nccl.h
LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
LIBNAME := libnccl.so
STATICLIBNAME := libnccl_static.a
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
STATICLIBTARGET := $(STATICLIBNAME)
LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME))
LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
all : lib staticlib
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
staticlib : $(INCTARGETS) $(LIBDIR)/$(STATICLIBTARGET)
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
ar cr $@ $(LIBOBJ)
$(INCDIR)/%.h : src/%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : src/%.cu
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@rm -f $(@:%.o=%.d.tmp)
clean :
rm -rf $(BUILDDIR)
install : lib
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
#### TESTS ####
TEST_ONLY ?= 0
# Tests depend on lib, except in TEST_ONLY mode.
ifeq ($(TEST_ONLY), 0)
TSTDEP = $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
endif
NCCL_LIB ?= $(LIBDIR)
NCCL_INC ?= $(INCDIR)
MPI_HOME ?= /usr
MPI_INC ?= $(MPI_HOME)/include
MPI_LIB ?= $(MPI_HOME)/lib
MPIFLAGS := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
TESTS := all_gather_test all_gather_scan \
all_reduce_test all_reduce_scan \
broadcast_test broadcast_scan \
reduce_test reduce_scan \
reduce_scatter_test reduce_scatter_scan
MPITESTS := mpi_test
TSTINC := -I$(NCCL_INC) -Itest/include
TSTLIB := -L$(NCCL_LIB) $(LIBLINK) $(LDFLAGS)
TSTDIR := $(BUILDDIR)/test/single
MPITSTDIR := $(BUILDDIR)/test/mpi
TESTBINS := $(patsubst %, $(TSTDIR)/%, $(TESTS))
MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
test : $(TESTBINS)
$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(TSTDIR)
$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@rm -f $(@:%=%.d.tmp)
mpitest : $(MPITESTBINS)
$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(MPITSTDIR)
$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@rm -f $(@:%=%.d.tmp)
#### PACKAGING ####
DEBIANDIR := $(BUILDDIR)/debian
DEBGEN_IN := $(shell (cd debian ; ls *.in))
DEBGEN := $(DEBGEN_IN:.in=)
DEBFILES := compat copyright libnccl-dev.install libnccl-dev.manpages nccl.7 rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBIANDIR)/%, $(DEBFILES))
DEB_REVISION ?= 1
DEB_TIMESTAMP := $(shell date -R)
DEB_ARCH ?= amd64
debian : $(DEBTARGETS)
deb : lib debian
@printf "Building Debian package\n"
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
mkdir -p $(BUILDDIR)/deb/
mv $(BUILDDIR)/../libnccl*.deb $(BUILDDIR)/deb/
debclean :
rm -Rf $(DEBIANDIR)
$(DEBIANDIR)/% : debian/%.in
@printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${deb:Revision}/$(DEB_REVISION)/g" \
-e "s/\$${deb:Timestamp}/$(DEB_TIMESTAMP)/g" \
-e "s/\$${deb:Arch}/$(DEB_ARCH)/g" \
$< > $@
$(DEBIANDIR)/% : debian/%
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBIANDIR)
cp -f $< $@
#### FORTRAN BINDINGS ####
export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
forlib : lib
$(MAKE) -C fortran lib
fortest : forlib
$(MAKE) -C fortran test
forclean :
$(MAKE) -C fortran clean
pkg.%:
${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
pkg.debian.prep: lic
pkg.txz.prep: lic

144
README.md

@@ -1,128 +1,84 @@
**IMPORTANT NOTE**
**NCCL1 is no longer maintained/updated and has been replaced by NCCL2, available at**
**http://developer.nvidia.com/nccl.**
# NCCL
Optimized primitives for collective multi-GPU communication.
## Introduction
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
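As a quick orientation (an illustrative sketch, not an excerpt from the documentation linked above), a single process can drive all local GPUs by creating one communicator per device with `ncclCommInitAll` and grouping the per-device calls:

```c
/* Minimal single-process sketch (illustrative; error checking and cleanup of
 * host allocations omitted): one communicator per local GPU, one in-place
 * all-reduce per device. */
#include <nccl.h>
#include <cuda_runtime.h>
#include <stdlib.h>

int main(void) {
  int nGpus = 0;
  cudaGetDeviceCount(&nGpus);

  int* devs = malloc(sizeof(int) * nGpus);
  for (int i = 0; i < nGpus; i++) devs[i] = i;

  ncclComm_t* comms = malloc(sizeof(ncclComm_t) * nGpus);
  ncclCommInitAll(comms, nGpus, devs);            /* one communicator per device */

  size_t count = 1 << 20;
  float** buf = malloc(sizeof(float*) * nGpus);
  cudaStream_t* streams = malloc(sizeof(cudaStream_t) * nGpus);
  for (int i = 0; i < nGpus; i++) {
    cudaSetDevice(i);
    cudaMalloc((void**)&buf[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  /* Group the per-device calls: needed when a single thread launches
   * collectives on several devices. */
  ncclGroupStart();
  for (int i = 0; i < nGpus; i++)
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nGpus; i++) cudaStreamSynchronize(streams[i]);
  for (int i = 0; i < nGpus; i++) ncclCommDestroy(comms[i]);
  return 0;
}
```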
## What's inside
At present, the library implements the following collectives:
At present, the library implements the following collective operations:
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
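As a hedged illustration of the aggregation path (the helper name below is hypothetical), several small operations submitted on the same communicator between `ncclGroupStart` and `ncclGroupEnd` are issued together, amortizing launch latency:

```c
#include <nccl.h>
#include <cuda_runtime.h>

/* Hypothetical helper: submit several small all-reduces as one aggregated
 * NCCL launch instead of paying per-operation latency for each of them. */
static void allreduce_many(float** send, float** recv, const size_t* counts,
                           int ntensors, ncclComm_t comm, cudaStream_t stream) {
  ncclGroupStart();
  for (int i = 0; i < ntensors; i++)
    ncclAllReduce(send[i], recv[i], counts[i], ncclFloat, ncclSum, comm, stream);
  ncclGroupEnd();   /* all queued operations are issued here */
}
```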
## Requirements
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
## Build
## Build & run
To build the library and tests.
To build the library:
```shell
$ cd nccl
$ make CUDA_HOME=<cuda install path> test
$ make -j src.build
```
Test binaries are located in the subdirectories nccl/build/test/{single,mpi}.
If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with:
```shell
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./build/lib
$ ./build/test/single/all_reduce_test
Error: must specify at least data size in bytes!
Tests nccl AllReduce with user supplied arguments.
Usage: all_reduce_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
$ ./build/test/single/all_reduce_test 10000000
# Using devices
# Device 0 -> 0 [0x0a] GeForce GTX TITAN X
# Device 1 -> 1 [0x09] GeForce GTX TITAN X
# Device 2 -> 2 [0x06] GeForce GTX TITAN X
# Device 3 -> 3 [0x05] GeForce GTX TITAN X
# out-of-place in-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00
10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00
10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00
10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00
10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00
10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00
10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00
10000000 5000000 half sum 1.617 6.18 9.28 4e-03 1.636 6.11 9.17 4e-03
10000000 5000000 half prod 1.618 6.18 9.27 1e-03 1.657 6.03 9.05 1e-03
10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00
10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 float sum 1.618 6.18 9.27 5e-07 1.622 6.17 9.25 5e-07
10000000 2500000 float prod 1.614 6.20 9.29 1e-07 1.628 6.14 9.21 1e-07
10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00
10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double prod 1.619 6.18 9.26 2e-16 1.628 6.14 9.21 2e-16
10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00
10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00
$ make src.build CUDA_HOME=<path to cuda install>
```
To install, run `make PREFIX=<install dir> install` and add `<instal dir>/lib` to your `LD_LIBRARY_PATH`.
NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
## Usage
NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single-process machine, all GPUs can be conveniently initialized using `ncclCommInitAll`. For multi-process applications (e.g., with MPI), `ncclCommInitRank` must be called for each GPU. Internally `ncclCommInitRank` invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single-process example follows, for an MPI example see test/mpi/mpi_test.cu. For details about the API see nccl.h.
```c
#include <nccl.h>
typedef struct {
double* sendBuff;
double* recvBuff;
int size;
cudaStream_t stream;
} PerThreadData;
int main(int argc, char* argv[])
{
int nGPUs;
cudaGetDeviceCount(&nGPUs);
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nGPUs);
ncclCommInitAll(comms, nGPUs); // initialize communicator
// One communicator per process
PerThreadData* data;
... // Allocate data and issue work to each GPU's
// perDevStream to populate the sendBuffs.
for(int i=0; i<nGPUs; ++i) {
cudaSetDevice(i); // Correct device must be set
// prior to each collective call.
ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
ncclDouble, ncclSum, comms[i], data[i].stream);
}
... // Issue work into data[*].stream to consume buffers, etc.
}
By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform:
```shell
$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
```
## Copyright and License
## Install
NCCL is provided under the [BSD licence](LICENSE.txt). All source code and
accompanying documentation is copyright (c) 2015-2016, NVIDIA CORPORATION. All
rights reserved.
To install NCCL on the system, create a package, then install it as root.
Debian/Ubuntu:
```shell
$ make pkg.debian.build
$ ls build/pkg/deb/
```
RedHat/CentOS:
```shell
$ make pkg.redhat.build
$ ls build/pkg/rpm/
```
OS-agnostic tarball:
```shell
$ make pkg.txz.build
$ ls build/pkg/txz/
```
## Tests
Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
```shell
$ git clone https://github.com/NVIDIA/nccl-tests.git
$ cd nccl-tests
$ make
$ ./build/allreduce_perf -b 8 -e 256M -f 2 -g <ngpus>
```
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.

5
debian/changelog.in vendored

@@ -1,5 +0,0 @@
nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}-${deb:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
* Automatic Debian package from build
-- cudatools <cudatools@nvidia.com> ${deb:Timestamp}

1
debian/copyright vendored

@@ -1 +0,0 @@
../LICENSE.txt


@@ -1,2 +0,0 @@
include/nccl.h usr/include
lib/libnccl.so /usr/lib/x86_64-linux-gnu


@@ -1 +0,0 @@
debian/nccl.7


@@ -1,2 +0,0 @@
lib/libnccl.so.${nccl:Major} /usr/lib/x86_64-linux-gnu
lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/x86_64-linux-gnu

139
debian/nccl.7 vendored

@@ -1,139 +0,0 @@
.TH NCCL
.SH NAME
.PP
nccl \- Optimized primitives for collective multi\-GPU communication.
.SH Introduction
.PP
NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications.
.SH What's inside
.PP
At present, the library implements the following collectives:
\- all\-reduce
\- all\-gather
\- reduce\-scatter
\- reduce
\- broadcast
.PP
These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
.SH Requirements
.PP
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported.
.PP
Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
.SH Build & run
.PP
To build the library and tests.
.PP
.RS
.nf
$ cd nccl
$ make CUDA\_HOME=<cuda install path> test
.fi
.RE
.PP
Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest.
.PP
.RS
.nf
$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib
$ ./build/test/all\_reduce\_test
Error: must specify at least data size in bytes!
Tests nccl AllReduce with user supplied arguments.
Usage: all\_reduce\_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
$ ./build/test/all\_reduce\_test 10000000
# Using devices
# Device 0 \-> 0 [0x0a] GeForce GTX TITAN X
# Device 1 \-> 1 [0x09] GeForce GTX TITAN X
# Device 2 \-> 2 [0x06] GeForce GTX TITAN X
# Device 3 \-> 3 [0x05] GeForce GTX TITAN X
# out\-of\-place in\-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00
10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00
10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00
10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00
10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00
10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00
10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00
10000000 5000000 half sum 1.617 6.18 9.28 4e\-03 1.636 6.11 9.17 4e\-03
10000000 5000000 half prod 1.618 6.18 9.27 1e\-03 1.657 6.03 9.05 1e\-03
10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00
10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 float sum 1.618 6.18 9.27 5e\-07 1.622 6.17 9.25 5e\-07
10000000 2500000 float prod 1.614 6.20 9.29 1e\-07 1.628 6.14 9.21 1e\-07
10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00
10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double prod 1.619 6.18 9.26 2e\-16 1.628 6.14 9.21 2e\-16
10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00
10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00
.fi
.RE
.PP
To install, run \fB\fCmake PREFIX=<install dir> install\fR and add \fB\fC<instal dir>/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR.
.SH Usage
.PP
NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally \fB\fCncclCommInitRank\fR invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single\-process example follows, for an MPI example see src/mpi\_test.cu. For details about the API see nccl.h.
.PP
.RS
.nf
#include <nccl.h>
typedef struct \{
double* sendBuff;
double* recvBuff;
int size;
cudaStream\_t stream;
\} PerThreadData;
int main(int argc, char* argv[])
\{
int nGPUs;
cudaGetDeviceCount(\&nGPUs);
ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs);
ncclCommInitAll(comms, nGPUs); // initialize communicator
// One communicator per process
PerThreadData* data;
... // Allocate data and issue work to each GPU's
// perDevStream to populate the sendBuffs.
for(int i=0; i<nGPUs; ++i) \{
cudaSetDevice(i); // Correct device must be set
// prior to each collective call.
ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
ncclDouble, ncclSum, comms[i], data[i].stream);
\}
... // Issue work into data[*].stream to consume buffers, etc.
\}
.fi
.RE
.SH Copyright
.PP
All source code and accompanying documentation is copyright (c) 2015\-2016, NVIDIA CORPORATION. All
rights reserved.


@@ -1 +0,0 @@
libcudart ${cuda:Major}.${cuda:Minor} cuda-cudart-${cuda:Major}-${cuda:Minor}


@@ -1,81 +0,0 @@
FC := gfortran
FCNAME := $(notdir $(FC))
BUILDDIR ?= ../build
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
LIBNAME := libncclfor.so
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME))
LIBCUDAFOR := libcudafor.so
ifneq ($(filter pgf%, $(FCNAME)), )
# PGI compiler (pgfortran, pgf90, pgf95)
FCMODFLAGS := -module $(INCDIR)
FCPREFLAGS := -Mpreprocess
FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
FCFLAGS := -fast -O3
else
# non-PGI compilers do not have CUDA support, compile our own CUDA lib
CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR)
CUDALINK := -L$(CUDA_LIB) -lcudart
CUDAFORLINK := -lcudafor
ifeq ($(FCNAME), gfortran)
FCMODFLAGS := -J$(INCDIR)
FCPREFLAGS += -cpp
FCFLAGS += -ffree-line-length-none
else ifeq ($(FCNAME), ifort)
FCMODFLAGS := -module $(INCDIR)
FCPREFLAGS += -fpp
endif
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
lib: $(CUDAFORDEP)
$(MAKE) $(LIBDIR)/$(LIBTARGET)
$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
@printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@
mkdir -p $(LIBDIR)
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
$(OBJDIR)/%.o: src/%.f90
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
mkdir -p $(INCDIR)
$(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
ifneq ($(filter pgf%, $(FCNAME)), )
TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
endif
TESTDIR := $(BUILDDIR)/test/fortran
TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
test: lib $(TESTBINS)
$(TESTDIR)/%: test/%.f90 lib
@printf "Building %-35s > %s\n" $< $@
@mkdir -p $(TESTDIR)
$(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
clean:
rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
rm -rf $(TESTDIR)/


@@ -1,171 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
#ifndef _CUDA
!Start cudaFor module
module cudaFor
use iso_c_binding
implicit none
private
public :: c_devptr
public :: cudaMemcpyKind, &
cudaMemcpyHostToHost, &
cudaMemcpyHostToDevice, &
cudaMemcpyDeviceToHost, &
cudaMemcpyDeviceToDevice, &
cudaMemcpyDefault
public :: cuda_stream_kind
public :: cudaGetDeviceCount
public :: cudaSetDevice
public :: cudaMalloc
public :: cudaMemcpy
public :: cudaFree
public :: cudaStreamCreate
public :: cudaStreamSynchronize
public :: cudaStreamDestroy
!Start types
!Start c_devptr
type, bind(c) :: c_devptr
type(c_ptr) :: member
end type c_devptr
!End c_devptr
!Start cudaMemcpyKind
type, bind(c) :: cudaMemcpyKind
integer(c_int) :: member
end type cudaMemcpyKind
type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), &
cudaMemcpyHostToDevice = cudaMemcpyKind(1), &
cudaMemcpyDeviceToHost = cudaMemcpyKind(2), &
cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
cudaMemcpyDefault = cudaMemcpyKind(4)
!End cudaMemcpyKind
!Start cuda_stream_kind
integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
!End cuda_stream_kind
!End types
!Start interfaces
!Start cudaGetDeviceCount
interface cudaGetDeviceCount
integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
import :: c_int
implicit none
integer(c_int) :: count
end function cudaGetDeviceCount
end interface cudaGetDeviceCount
!End cudaGetDeviceCount
!Start cudaSetDevice
interface cudaSetDevice
integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
import :: c_int
implicit none
integer(c_int), value :: device
end function cudaSetDevice
end interface cudaSetDevice
!End cudaSetDevice
!Start cudaMalloc
interface cudaMalloc
integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
import :: c_int, c_size_t
import :: c_devptr
implicit none
type(c_devptr) :: devPtr
integer(c_size_t), value :: size
end function cudaMalloc
end interface cudaMalloc
!End cudaMalloc
!Start cudaMemcpy
interface cudaMemcpy
!Start cudaMemcpyH2D
integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
import :: c_ptr, c_int, c_size_t
import :: c_devptr, cudaMemcpyKind
implicit none
type(c_devptr), value :: dst
type(c_ptr), value :: src
integer(c_size_t), value :: count
type(cudaMemcpyKind), value :: kind
end function cudaMemcpyH2D
!End cudaMemcpyH2D
!Start cudaMemcpyD2H
integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
import :: c_ptr, c_int, c_size_t
import :: c_devptr, cudaMemcpyKind
implicit none
type(c_ptr), value :: dst
type(c_devptr), value :: src
integer(c_size_t), value :: count
type(cudaMemcpyKind), value :: kind
end function cudaMemcpyD2H
!End cudaMemcpyD2H
end interface cudaMemcpy
!End cudaMemcpy
!Start cudaFree
interface cudaFree
integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
import :: c_int
import :: c_devptr
implicit none
type(c_devptr), value :: devPtr
end function cudaFree
end interface cudaFree
!End cudaFree
!Start cudaStreamCreate
interface cudaStreamCreate
integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind) :: pStream
end function cudaStreamCreate
end interface cudaStreamCreate
!End cudaStreamCreate
!Start cudaStreamSynchronize
interface cudaStreamSynchronize
integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind), value :: stream
end function cudaStreamSynchronize
end interface cudaStreamSynchronize
!End cudaStreamSynchronize
!Start cudaStreamDestroy
interface cudaStreamDestroy
integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind), value :: stream
end function cudaStreamDestroy
end interface cudaStreamDestroy
!End cudaStreamDestroy
!End interfaces
end module cudaFor
!End cudaFor module
#endif


@@ -1,312 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
!Start defines
#define NCCL_UNIQUE_ID_BYTES 128
!End defines
!Start ncclFor module
module ncclFor
use iso_c_binding
use cudaFor
implicit none
private
public :: ncclUniqueId
public :: ncclComm
public :: ncclResult, &
ncclSuccess, &
ncclUnhandledCudaError, &
ncclSystemError, &
ncclInternalError, &
ncclInvalidDevicePointer, &
ncclInvalidRank, &
ncclUnsupportedDeviceCount, &
ncclDeviceNotFound, &
ncclInvalidDeviceIndex, &
ncclLibWrapperNotSet, &
ncclCudaMallocFailed, &
ncclRankMismatch, &
ncclInvalidArgument, &
ncclInvalidType, &
ncclInvalidOperation, &
nccl_NUM_RESULTS
public :: ncclDataType, &
ncclChar, &
ncclInt, &
#ifdef CUDA_HAS_HALF
ncclHalf, &
#endif
ncclFloat, &
ncclDouble, &
ncclInt64, &
ncclUInt64, &
nccl_NUM_TYPES
public :: ncclRedOp, &
ncclSum, &
ncclProd, &
ncclMax, &
ncclMin, &
nccl_NUM_OPS
public :: ncclGetUniqueId
public :: ncclCommInitRank
public :: ncclCommInitAll
public :: ncclCommCuDevice
public :: ncclCommUserRank
public :: ncclCommCount
public :: ncclCommDestroy
public :: ncclReduce
public :: ncclAllReduce
public :: ncclReduceScatter
public :: ncclBcast
public :: ncclAllGather
!Start types
!Start ncclUniqueId
type, bind(c) :: ncclUniqueId
character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
end type ncclUniqueId
!End ncclUniqueId
!Start ncclComm
type, bind(c) :: ncclComm
type(c_ptr) :: member
end type ncclComm
!End ncclComm
!Start ncclResult
type, bind(c) :: ncclResult
integer(c_int) :: member
end type ncclResult
type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), &
ncclUnhandledCudaError = ncclResult( 1), &
ncclSystemError = ncclResult( 2), &
ncclInternalError = ncclResult( 3), &
ncclInvalidDevicePointer = ncclResult( 4), &
ncclInvalidRank = ncclResult( 5), &
ncclUnsupportedDeviceCount = ncclResult( 6), &
ncclDeviceNotFound = ncclResult( 7), &
ncclInvalidDeviceIndex = ncclResult( 8), &
ncclLibWrapperNotSet = ncclResult( 9), &
ncclCudaMallocFailed = ncclResult(10), &
ncclRankMismatch = ncclResult(11), &
ncclInvalidArgument = ncclResult(12), &
ncclInvalidType = ncclResult(13), &
ncclInvalidOperation = ncclResult(14), &
nccl_NUM_RESULTS = ncclResult(15)
!End ncclResult
!Start ncclDataType
type, bind(c) :: ncclDataType
integer(c_int) :: member
end type ncclDataType
type(ncclDataType), parameter :: ncclChar = ncclDataType(0), &
ncclInt = ncclDataType(1), &
#ifdef CUDA_HAS_HALF
ncclHalf = ncclDataType(2), &
#endif
ncclFloat = ncclDataType(3), &
ncclDouble = ncclDataType(4), &
ncclInt64 = ncclDataType(5), &
ncclUInt64 = ncclDataType(6), &
nccl_NUM_TYPES = ncclDataType(7)
!End ncclDataType
!Start ncclRedOp
type, bind(c) :: ncclRedOp
integer(c_int) :: member
end type ncclRedOp
type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), &
ncclProd = ncclRedOp(1), &
ncclMax = ncclRedOp(2), &
ncclMin = ncclRedOp(3), &
nccl_NUM_OPS = ncclRedOp(4)
!End ncclRedOp
!End types
!Start interfaces
!Start ncclGetUniqueId
interface ncclGetUniqueId
type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
import :: ncclResult, ncclUniqueId
implicit none
type(ncclUniqueId) :: uniqueId
end function ncclGetUniqueId
end interface ncclGetUniqueId
!End ncclGetUniqueId
!Start ncclCommInitRank
interface ncclCommInitRank
type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
import :: c_int
import :: ncclResult, ncclUniqueId, ncclComm
implicit none
type(ncclComm) :: comm(*)
integer(c_int), value :: ndev
type(ncclUniqueId), value :: commId
integer(c_int), value :: rank
end function ncclCommInitRank
end interface ncclCommInitRank
!End ncclCommInitRank
!Start ncclCommInitAll
interface ncclCommInitAll
type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm) :: comm(*)
integer(c_int), value :: ndev
integer(c_int) :: devlist(*)
end function ncclCommInitAll
end interface ncclCommInitAll
!End ncclCommInitAll
!Start ncclCommCuDevice
interface ncclCommCuDevice
type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: devid
end function ncclCommCuDevice
end interface ncclCommCuDevice
!End ncclCommCuDevice
!Start ncclCommUserRank
interface ncclCommUserRank
type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: rank
end function ncclCommUserRank
end interface ncclCommUserRank
!End ncclCommUserRank
!Start ncclCommCount
interface ncclCommCount
type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: count
end function ncclCommCount
end interface ncclCommCount
!End ncclCommCount
!Start ncclCommDestroy
interface ncclCommDestroy
subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
import :: ncclComm
implicit none
type(ncclComm), value :: comm
end subroutine ncclCommDestroy
end interface ncclCommDestroy
!End ncclCommDestroy
!Start ncclReduce
interface ncclReduce
type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
integer(c_int), value :: root
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclReduce
end interface ncclReduce
!End ncclReduce
!Start ncclAllReduce
interface ncclAllReduce
type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclAllReduce
end interface ncclAllReduce
!End ncclAllReduce
!Start ncclReduceScatter
interface ncclReduceScatter
type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: recvcount
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclReduceScatter
end interface ncclReduceScatter
!End ncclReduceScatter
!Start ncclBcast
interface ncclBcast
type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType
implicit none
type(c_devptr), value :: buff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
integer(c_int), value :: root
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclBcast
end interface ncclBcast
!End ncclBcast
!Start ncclAllGather
interface ncclAllGather
type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType
implicit none
type(c_devptr), value :: sendbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(c_devptr), value :: recvbuff
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclAllGather
end interface ncclAllGather
!End ncclAllGather
!End interfaces
end module ncclFor
!End nccl module


@@ -1,162 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 1))
call random_number(hostBuff)
print "(a)", "before allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl * nDev))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
hostBuff(:, i) = recvBuff
end do
print "(a)", ""
print "(a)", "after allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
end do
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a)", ""
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,171 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 1))
call random_number(hostBuff)
print "(a)", "before allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
allocate(hostBuffPtr(nDev))
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
end do
print "(a)", ""
print "(a)", "after allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
end do
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a)", ""
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,165 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before allreduce:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after allreduce:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,166 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before allreduce:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after allreduce:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,137 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: devBuff(:)
type(c_devptr), allocatable :: devBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 1))
call random_number(hostBuff(:, 1:nDev))
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
print "(a)", "before broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
allocate(devBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(devBuff(nEl))
devBuffPtr(i) = c_devloc(devBuff)
devBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
hostBuff(:, i) = devBuff
end do
print "(a)", ""
print "(a)", "after broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
deallocate(devBuff)
end do
deallocate(devBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,142 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: devBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 1))
call random_number(hostBuff(:, 1:nDev))
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
print "(a)", "before broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
allocate(hostBuffPtr(nDev))
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(devBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
end do
print "(a)", ""
print "(a)", "after broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(devBuffPtr(i))
end do
deallocate(devBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,164 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
stat = cudaSetDevice(devList(root + 1))
call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
print "(a)", ""
print "(a)", "after reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,165 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
stat = cudaSetDevice(devList(root + 1))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
print "(a)", ""
print "(a)", "after reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,165 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reducescatter:"
do i = 1, nDev
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl * nDev))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after reducescatter:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,174 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reducescatter:"
do i = 1, nDev
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after reduceScatter:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

87
makefiles/common.mk Normal file

@ -0,0 +1,87 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
CUDA_HOME ?= /usr/local/cuda
PREFIX ?= /usr/local
VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
# Better define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
# Include Volta support if we're using CUDA9 or above
ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
########## GCOV ##########
GCOV ?= 0 # disabled by default
GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # coverage flags only when GCOV=1 and DEBUG=1
CXXFLAGS += ${GCOV_FLAGS}
NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
LDFLAGS += ${GCOV_FLAGS}
NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
# $(warning GCOV_FLAGS=${GCOV_FLAGS})
########## GCOV ##########
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS += -O3 -g
else
NVCUFLAGS += -O0 -G -g
CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
CXXFLAGS += -Wall -Wextra
else
.SILENT:
endif
ifneq ($(TRACE), 0)
CXXFLAGS += -DENABLE_TRACE
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif

33
makefiles/formatting.mk Normal file

@ -0,0 +1,33 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
# As this file defines a new target (format), it should only be included after the definition of the
# default target.
ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
ASTYLEDIR := $(BUILDDIR)/contrib
ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
ASTYLEVER := 3.1
ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
$(ASTYLEDIR) :
@mkdir -p $(ASTYLEDIR)
$(ASTYLETAR) : $(ASTYLEDIR)
@wget -q -O $(ASTYLETAR) $(ASTYLEURL)
$(ASTYLEBLD) : $(ASTYLETAR)
@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
$(ASTYLEBIN) : $(ASTYLEBLD)
${MAKE} -C $(ASTYLEBLD)
.PHONY : format
format : $(ASTYLEBIN)
@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)

6
makefiles/version.mk Normal file

@ -0,0 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 3
NCCL_PATCH := 5
NCCL_SUFFIX :=
PKG_REVISION := 5

26
pkg/Makefile Normal file

@ -0,0 +1,26 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
.PHONY : all clean
default : build
build : debian.build txz.build
BUILDDIR ?= $(abspath ../build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := debian txz
all: ${TARGETS:%=%.build}
prep: ${TARGETS:%=%.prep}
build: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.prep:
${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
%.build:
${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
%.clean:
${MAKE} -C $* clean

58
pkg/debian/Makefile Normal file

@ -0,0 +1,58 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
DEBPREPDIR := $(BUILDDIR)/debian
PKGDIR := $(BUILDDIR)/pkg/deb/
DEBGEN_IN := $(wildcard *.in)
DEBGEN := $(DEBGEN_IN:.in=)
DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
ifeq ($(PKG_MULTIARCH),)
# Hardwire the PKG_MULTIARCH directory, as the distribution-agnostic compiler shipped for RHEL6 (gcc 4.8.3) doesn't set it
PKG_MULTIARCH := $(ARCH)-linux-gnu
endif
prep : $(DEBTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building Debian package\n"
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
clean:
rm -Rf $(DEBPREPDIR) $(PKGDIR)
$(DEBPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(DEBPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
-e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
$< > $@
$(DEBPREPDIR)/% : %
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBPREPDIR)
cp -f $< $@

5
pkg/debian/changelog.in Normal file

@ -0,0 +1,5 @@
nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
* Automatic Debian package from build
-- cudatools <cudatools@nvidia.com> ${pkg:Timestamp}


@ -7,22 +7,24 @@ Standards-Version: 3.9.5
Package: libnccl${nccl:Major}
Section: libs
Architecture: ${deb:Arch}
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
applications.
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
Package: libnccl-dev
Section: libdevel
Architecture: ${deb:Arch}
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
applications.
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
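
The updated package description above lists the collectives the runtime library implements (all-reduce, all-gather, reduce, broadcast, reduce-scatter). As a rough usage sketch of what the libnccl/libnccl-dev packages install (illustrative only, not part of this commit; error checking is omitted and the fixed 8-GPU arrays are an assumption of the example), a single-process all-reduce across the visible GPUs could look like this:

// Minimal single-process, multi-GPU all-reduce sketch (illustrative only).
#include <cuda_runtime.h>
#include <nccl.h>

int main() {
  int nDev = 0;
  cudaGetDeviceCount(&nDev);
  if (nDev > 8) nDev = 8;                    // keep the fixed-size arrays of this example safe

  ncclComm_t comms[8];
  cudaStream_t streams[8];
  float* buf[8];
  int devs[8];
  const size_t count = 1 << 20;              // element count, not bytes

  for (int i = 0; i < nDev; i++) {
    devs[i] = i;
    cudaSetDevice(i);
    cudaMalloc(&buf[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }
  ncclCommInitAll(comms, nDev, devs);        // one communicator (rank) per GPU

  ncclGroupStart();                          // group the per-GPU calls issued from one thread
  for (int i = 0; i < nDev; i++)
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);       // collectives are asynchronous on their stream
  }
  for (int i = 0; i < nDev; i++) {
    ncclCommDestroy(comms[i]);
    cudaFree(buf[i]);
    cudaStreamDestroy(streams[i]);
  }
  return 0;
}

The ncclGroupStart/ncclGroupEnd pair is used because a single thread drives several communicators here; each collective call only enqueues work on its stream.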

1
pkg/debian/copyright Symbolic link

@ -0,0 +1 @@
../../LICENSE.txt

9
pkg/debian/gbp.conf Normal file

@ -0,0 +1,9 @@
[DEFAULT]
debian-branch = master
upstream-branch = master
ignore-new = True
[git-buildpackage]
no-purge = True


@ -0,0 +1,3 @@
include/nccl.h /usr/include
lib/libnccl.so /usr/lib/${pkg:MultiArch}
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}


@ -0,0 +1,2 @@
lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}

62
pkg/redhat/Makefile Normal file

@ -0,0 +1,62 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
RPMPREPDIR := $(BUILDDIR)/redhat
PKGDIR := $(BUILDDIR)/pkg/rpm/
RPMGEN_IN := $(wildcard *.in)
RPMGEN := $(RPMGEN_IN:.in=)
RPMFILES := $(RPMGEN)
RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
PKG_ARCH ?= $(shell uname -m)
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
ifeq ($(PKG_MULTIARCH),)
# Hardwire the PKG_MULTIARCH directory, as the distribution-agnostic compiler shipped for RHEL6 (gcc 4.8.3) doesn't set it
PKG_MULTIARCH := $(ARCH)-linux-gnu
endif
prep : $(RPMTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
@printf "Building Redhat package\n"
mkdir -p $(PKGDIR)
rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
--define "_rpmdir $(PKGDIR)" \
--define "_builddir $(PKGDIR)/build/" \
--define "_buildrootdir $(PKGDIR)/buildroot/" \
-bb $(BUILDDIR)/redhat/nccl.spec
clean:
rm -Rf $(RPMPREPDIR) $(PKGDIR)
$(RPMPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(RPMPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
-e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
$< > $@
$(RPMPREPDIR)/% : %
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(RPMPREPDIR)
cp -f $< $@

73
pkg/redhat/nccl.spec.in Normal file

@ -0,0 +1,73 @@
Name: libnccl
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
Release: ${pkg:Revision}
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Group: Development/Libraries
License: BSD
URL: http://developer.nvidia.com/nccl
Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
%description
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
%package devel
Summary: NVIDIA Collectives Communication Library (NCCL) Development Files
Group: Development/Libraries
%description devel
NCCL development files
%package static
Summary: NVIDIA Collectives Communication Library (NCCL) Static Library
Group: Development/Libraries
%description static
NCCL static library
%define debug_package %{nil}
%prep
%setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
%build
%install
rm -rf $RPM_BUILD_ROOT
install -m 755 -d $RPM_BUILD_ROOT
install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
# devel
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
%clean
rm -rf $RPM_BUILD_ROOT
%files devel
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_includedir}/nccl.h
%{_libdir}/libnccl.so
%files static
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_libdir}/libnccl_static.a
%files
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_libdir}/libnccl.so.${nccl:Major}
%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
%changelog

39
pkg/srctxz/Makefile Normal file

@ -0,0 +1,39 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
TXZPREPDIR := $(BUILDDIR)/srctxz
PKGDIR := $(BUILDDIR)/pkg/srctxz/
TXZGEN_IN := $(wildcard *.in)
TXZGEN := $(TXZGEN_IN:.in=)
TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
PKG_REVISION ?= 3
PKG_ARCH := $(shell uname -m)
prep: $(TXZTARGETS)
build: prep
$(MAKE) -C ../../src clean
@printf "Building source tar.xz package\n"
(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
clean:
rm -Rf $(TXZPREPDIR) $(PKGDIR)
$(TXZPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(TXZPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
$< > $@


@ -0,0 +1,34 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# To run from $BUILDDIR/
cd ..
NCCLDIR=`basename $PWD`
echo "Checking for unclean directory ..."
git clean -x -i
echo "Clean done."
echo "Checking for uncommited files ..."
if [ "`git status -s | wc -l`" != "0" ]; then
git status -s
echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
read
fi
cd ..
NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
tar --exclude build \
--exclude ".git*" \
--exclude pkg/srctxz \
--transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR

43
pkg/txz/Makefile Normal file

@ -0,0 +1,43 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
TXZPREPDIR := $(BUILDDIR)/txz
PKGDIR := $(BUILDDIR)/pkg/txz/
TXZGEN_IN := $(wildcard *.in)
TXZGEN := $(TXZGEN_IN:.in=)
TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
PKG_ARCH := $(shell uname -m)
prep: $(TXZTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
build: prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building tar.xz package\n"
(cd $(BUILDDIR); bash txz/create_txz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
clean:
rm -Rf $(TXZPREPDIR) $(PKGDIR)
$(TXZPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(TXZPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
$< > $@

24
pkg/txz/create_txz.sh.in Normal file

@ -0,0 +1,24 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# To run from $BUILDDIR/
BUILDDIR=`basename $PWD`
cd ..
NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
CUDA_MAJOR=${cuda:Major}
CUDA_MINOR=${cuda:Minor}
PKG_REVISION=${pkg:Revision}
PKG_ARCH=${pkg:Arch}
NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt

106
src/Makefile Normal file

@ -0,0 +1,106 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../makefiles/common.mk
include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
##### lib files
LIBNAME := libnccl.so
STATICLIBNAME := libnccl_static.a
##### dirs
BUILDDIR ?= $(abspath ../build)
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
##### target files
INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
##### rules
build : lib staticlib
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
devicelib: nccl.h
$(MAKE) -C collectives/device
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
nccl.h : nccl.h.in
# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
$< > $@
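
For reference, the version code stamped into the generated header follows the NCCL_VERSION(X,Y,Z) = X*1000 + Y*100 + Z scheme noted in the comment above, so 2.3.5 encodes to 2305, which is exactly what printf "%d%d%02d" 2 3 5 prints. A consumer could gate on that code at compile time roughly as in the sketch below, which assumes the generated nccl.h defines NCCL_VERSION_CODE (the substituted value) and the NCCL_VERSION macro:

/* Hypothetical consumer-side version gate against the generated nccl.h.
 * For 2.3.5: NCCL_VERSION(2,3,5) = 2*1000 + 3*100 + 5 = 2305. */
#include <nccl.h>

#if defined(NCCL_VERSION_CODE) && NCCL_VERSION_CODE >= NCCL_VERSION(2, 3, 0)
#define HAVE_NCCL_2_3 1   /* safe to rely on behavior introduced with this release */
#else
#define HAVE_NCCL_2_3 0
#endif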
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
$(eval TMP := $(shell mktemp -d))
cp $(LIBOBJ) $(TMP)
cd $(TMP) && ar x $(DEVICELIB) && cd -
ar cr $@ $(LIBOBJ) $(TMP)/*.o
rm -Rf $(TMP)
$(INCDIR)/%.h : %.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cu
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -I. -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -I. -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@rm -f $(@:%.o=%.d.tmp)
clean :
rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR} nccl.h
$(MAKE) -C collectives/device clean
install : lib
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.
include ../makefiles/formatting.mk


@ -1,202 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "primitives.h"
#define NUM_SUBSTEPS 2
#define NUM_BUFCHUNKS 2
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
template<int THREADS, int UNROLL, class FUNC, typename T>
__launch_bounds__(THREADS+WARP_SIZE, 1)
__global__ void AllGatherKernel(const KernelArgs<T> args) {
const int tid = threadIdx.x;
__shared__ T* sharedNextOutput;
__shared__ DevRing<T> ring;
bool pushrecv = args.pushrecv;
LoadRing<THREADS>(args.ring, &ring);
__syncthreads();
if (tid == 0) {
WaitFlag prevCommOp(ring.prevOpCounter, 0);
WaitFlag nextCommOp(ring.nextOpCounter, 0);
prevCommOp.wait(args.opIndex);
nextCommOp.wait(args.opIndex);
if (pushrecv) {
*ring.sendPtrToPrev = (T*)args.ThisOutput;
Wait([=] {
return *ring.recvPtrFromNext != nullptr;
});
sharedNextOutput = *ring.recvPtrFromNext;
*ring.recvPtrFromNext = nullptr;
}
}
__syncthreads();
WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
PostFlag postReadyToNext(ring.sendFlagToNext, 0);
typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
const int size = args.N;
const int nranks = args.nRanks;
const int buffSize = args.buffSize / sizeof(T);
const int sliceSize = buffSize / NUM_BUFCHUNKS;
int step = 0;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = args.ThisInput;
T * __restrict__ thisOutput = args.ThisOutput;
T * __restrict__ prevInput = ring.recvBuffer;
T * __restrict__ nextOutput = ring.sendBuffer;
for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
/////////////// begin AllGather steps ///////////////
int offset;
int maxOffset = size-chunkOffset;
int rankDest;
// step 0: push data to next GPU
rankDest = ring.userRank[0];
offset = chunkOffset + rankDest * size;
if (thisInput == thisOutput) {
Prims::Copy(
thisInput + offset,
pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(
thisInput + chunkOffset,
thisOutput + offset,
pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: copy to next GPU
if (pushrecv) {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring.userRank[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Copy(
thisOutput + offset,
sharedNextOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
} else {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring.userRank[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::DoubleCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
rankDest = ring.userRank[1];
offset = chunkOffset + rankDest * size;
// Here we need to copy from buffer to this output.
Prims::Copy(
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
}
// wait for the last data to be pushed to us
if (tid == 0) {
// Wait for last update from next then reset the flag
waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
*ring.recvFlagFromNext = 0;
// Wait for last update from prev then reset the flag
waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
*ring.recvFlagFromPrev = 0;
incrementOpCounter(&args);
}
}
#define THREADS 512
#define UNROLL 8
template<class FUNC, typename T>
ncclResult_t RingAllGather(const void* sendbuff, void* recvbuff,
const int count, ncclComm* comm, cudaStream_t stream) {
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
} else {
KernelArgs<T> args;
ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
LAUNCH_KERNEL(AllGatherKernel, THREADS, UNROLL, FUNC, T, args, stream);
}
return ncclSuccess;
}
template<typename T, template<typename> class RedOp>
class AllGather {
public:
static ncclResult_t entry(const void* sendbuff, void* recvbuff,
int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
return RingAllGather<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
}
};
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, int count, ncclDataType_t datatype,
void* recvbuff, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
void* recvbuff, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, "AllGather"));
return enqueue<AllGather, FuncNull>(sendbuff, recvbuff, count, datatype, 0, comm, stream);
}


@ -1,234 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "primitives.h"
#define NUM_SUBSTEPS 2
#define NUM_BUFCHUNKS 2
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
template<int THREADS, int UNROLL, class FUNC, typename T>
__launch_bounds__(THREADS+WARP_SIZE, 1)
__global__ void AllReduceKernel(const KernelArgs<T> args) {
const int tid = threadIdx.x;
__shared__ T* sharedNextOutput;
__shared__ DevRing<T> ring;
bool pushrecv = args.pushrecv;
LoadRing<THREADS>(args.ring, &ring);
__syncthreads();
if (tid == 0) {
WaitFlag prevCommOp(ring.prevOpCounter, 0);
WaitFlag nextCommOp(ring.nextOpCounter, 0);
prevCommOp.wait(args.opIndex);
nextCommOp.wait(args.opIndex);
if (pushrecv) {
*ring.sendPtrToPrev = (T*)args.ThisOutput;
Wait([=] {
return *ring.recvPtrFromNext != nullptr;
});
sharedNextOutput = *ring.recvPtrFromNext;
*ring.recvPtrFromNext = nullptr;
}
}
__syncthreads();
WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
PostFlag postReadyToNext(ring.sendFlagToNext, 0);
typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
const int size = args.N;
const int nranks = args.nRanks;
const int buffSize = args.buffSize / sizeof(T);
const int sliceSize = buffSize / NUM_BUFCHUNKS;
int step = 0;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = args.ThisInput;
T * __restrict__ thisOutput = args.ThisOutput;
T * __restrict__ prevInput = ring.recvBuffer;
T * __restrict__ nextOutput = ring.sendBuffer;
for (int chunkOffset = 0; chunkOffset < size; chunkOffset += nranks*sliceSize) {
/////////////// begin AllReduce steps ///////////////
int offset;
int maxOffset;
int slice;
int chunkSize = min(sliceSize, DIVUP(size-chunkOffset,nranks));
ALIGN_SIZE(chunkSize, THREADS*UNROLL);
// step 0: push data to next GPU
slice = ring.userRank[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring.userRank[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Reduce(
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring.userRank[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::ReduceCopy(
prevInput + poffset,
thisInput + offset,
pushrecv ? (sharedNextOutput + offset) : (nextOutput + noffset),
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
if (pushrecv) {
// k-2 steps: copy result to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring.userRank[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(
thisOutput + offset,
sharedNextOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
} else {
// k-2 steps: copy result to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring.userRank[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::DoubleCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
slice = ring.userRank[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
Prims::Copy(
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
}
// wait for the last data to be pushed to us
if (tid == 0) {
// Wait for last update from next then reset the flag
waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
*ring.recvFlagFromNext = 0;
// Wait for last update from prev then reset the flag
waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
*ring.recvFlagFromPrev = 0;
incrementOpCounter(&args);
}
}
#define THREADS 512
#define UNROLL 8
template<class FUNC, typename T>
ncclResult_t RingAllReduce(const void* sendbuff, void* recvbuff,
const int count, ncclComm* comm, cudaStream_t stream) {
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
} else {
KernelArgs<T> args;
ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
LAUNCH_KERNEL(AllReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
}
return ncclSuccess;
}
template<typename T, template <typename> class RedOp>
class AllReduce {
public:
static ncclResult_t entry(const void* sendbuff, void* recvbuff,
int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
return RingAllReduce<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
}
};
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, int count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, 0, comm, "AllReduce"));
return enqueue<AllReduce>(sendbuff, recvbuff, count, datatype, op, 0, comm, stream);
}

292
src/bootstrap.cu Normal file

@ -0,0 +1,292 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
#include <unistd.h>
#include <sys/types.h>
// Always use sockets for bootstrap
ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
// Additional sync functions based on async + test for bootstrap, using host ptrs.
static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
void* request;
NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
int done = 0;
while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
return ncclSuccess;
}
static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
void* request;
NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
int done = 0;
while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
return ncclSuccess;
}
struct extId {
ncclNetHandle_t extHandle;
void* extListenComm;
uint64_t hostHash;
pid_t pid;
int fd;
pthread_t bootstrapThread;
};
struct bootstrapOp {
int op;
int size;
};
struct extInfo {
int rank;
int nranks;
ncclNetHandle_t extHandle;
};
enum {
BOOTSTRAP_ALLGATHER = 1,
BOOTSTRAP_RINGEXCHANGE,
};
#include <sys/resource.h>
static ncclResult_t setFilesLimit() {
struct rlimit filesLimit;
SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
filesLimit.rlim_cur = filesLimit.rlim_max;
SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
return ncclSuccess;
}
static void *bootstrapRoot(void* commId) {
struct extInfo info;
struct extId* id = (struct extId*)commId;
struct bootstrapOp bop;
void **extSendComm = NULL;
void **extRecvComm = NULL;
int size, alloc_size = 0;
char* data = NULL;
ncclResult_t res;
setFilesLimit();
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
void* tmpRecvComm;
NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpRecvComm), res, out);
NCCLCHECKGOTO(bootstrapRecv(tmpRecvComm, &info, sizeof(info)), res, out);
if (!c) {
extSendComm = (void**)calloc(info.nranks, sizeof(void*));
extRecvComm = (void**)calloc(info.nranks, sizeof(void*));
if (extSendComm == NULL || extRecvComm == NULL) {
WARN("Bootstrap thread : failed to allocate memory");
goto out;
}
nranks = info.nranks;
}
if (nranks != info.nranks) {
WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
goto out;
}
extRecvComm[info.rank] = tmpRecvComm;
NCCLCHECKGOTO(bootstrapConnect(0, info.extHandle, extSendComm+info.rank), res, out);
c++;
} while (c < nranks);
do {
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[0], &bop, sizeof(struct bootstrapOp)), res, out);
if (bop.size == -1) {
break;
} else {
size = bop.size;
if (size*nranks*2 > alloc_size) {
if (data) free(data); data = NULL;
NCCLCHECKGOTO(ncclCalloc(&data, size*nranks*2), res, out);
alloc_size = size*nranks*2;
}
}
if (bop.op == BOOTSTRAP_ALLGATHER) {
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], data+size*r, size), res, out);
}
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data, size*nranks), res, out);
}
} else if (bop.op == BOOTSTRAP_RINGEXCHANGE) {
// Receive from all and build total table
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], data+r*2*size, 2*size), res, out);
}
// Get prev/next request from everyone and answer.
for (int r=0; r<nranks; r++) {
int offset;
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], &offset, sizeof(int)), res, out);
NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data+offset, size), res, out);
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], &offset, sizeof(int)), res, out);
NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data+offset, size), res, out);
}
} else {
WARN("Bootstrap Root : invalid op type received %d", bop.op);
break;
}
} while (1);
out:
bootstrapCloseListen(id->extListenComm);
for (int r=0; r<nranks; r++) {
if (extSendComm[r]) bootstrapCloseSend(extSendComm[r]);
if (extRecvComm[r]) bootstrapCloseRecv(extRecvComm[r]);
}
free(commId);
if (data) free(data);
if (extSendComm) free(extSendComm);
if (extRecvComm) free(extRecvComm);
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
struct extId* id = (struct extId*)commId;
id->hostHash = getHostHash();
NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandle, &id->extListenComm));
ncclUniqueId* threadIdCopy;
NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
pthread_create(&id->bootstrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
static_assert(sizeof(extId) < sizeof(ncclUniqueId), "extId does not fit inside ncclUniqueId");
extId* id = (extId*)out;
char* env = getenv("NCCL_COMM_ID");
if (env) {
if (ncclSocketCreateHandle(&id->extHandle, env) != 0) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
id->pid = -1;
} else {
id->pid = getpid();
NCCLCHECK(bootstrapCreateRoot(out, false));
}
return ncclSuccess;
}
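// Illustrative only (not part of this commit's call sites): a launcher can force
// the bootstrap root address through NCCL_COMM_ID instead of letting rank 0
// create one; the accepted formats are the ones listed in the WARN above.
// The addresses and port below are placeholders.
//   setenv("NCCL_COMM_ID", "192.168.1.1:12345", 1);  // <ipv4>:<port>
//   setenv("NCCL_COMM_ID", "[::1]:12345", 1);        // [<ipv6>]:<port>
//   setenv("NCCL_COMM_ID", "node0:12345", 1);        // <hostname>:<port>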
struct extState {
void* extRecvComm;
void* extSendComm;
int rank;
int nranks;
};
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
struct extId* id = (struct extId*)commId;
bool idFromEnv = id->pid < 0;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
*commState = state;
struct extInfo info;
info.rank = rank;
info.nranks = nranks;
void* tmpListenComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandle, &id->extHandle, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info ('findSubnetIf' indicates that the net device is unknown)
int dev = idFromEnv ? findSubnetIf : 0;
NCCLCHECK(bootstrapListen(dev, &info.extHandle, &tmpListenComm));
NCCLCHECK(bootstrapConnect(dev, id->extHandle, &state->extSendComm));
NCCLCHECK(bootstrapSend(state->extSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapAccept(tmpListenComm, &state->extRecvComm));
NCCLCHECK(bootstrapCloseListen(tmpListenComm));
return ncclSuccess;
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
struct extState* state = (struct extState*)commState;
char* data = (char*)allData;
struct bootstrapOp bop;
bop.op = BOOTSTRAP_ALLGATHER;
bop.size = size;
if (!state->rank) {
NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
}
NCCLCHECK(bootstrapSend(state->extSendComm, data+state->rank*size, size));
NCCLCHECK(bootstrapRecv(state->extRecvComm, data, size*state->nranks));
return ncclSuccess;
}
ncclResult_t bootstrapRingExchange(void* commState, void* prevNextData, int prev, int next, int size) {
struct extState* state = (struct extState*)commState;
char* mydata = (char*)prevNextData;
int prev_offset = prev*2*size+size, next_offset = next*2*size;
struct bootstrapOp bop;
bop.op = BOOTSTRAP_RINGEXCHANGE;
bop.size = size;
if (!state->rank) {
NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
}
// Send data to root
NCCLCHECK(bootstrapSend(state->extSendComm, mydata, 2*size));
// Receive prev and next data
NCCLCHECK(bootstrapSend(state->extSendComm, &prev_offset, sizeof(int)));
NCCLCHECK(bootstrapRecv(state->extRecvComm, mydata, size));
NCCLCHECK(bootstrapSend(state->extSendComm, &next_offset, sizeof(int)));
NCCLCHECK(bootstrapRecv(state->extRecvComm, mydata+size, size));
return ncclSuccess;
}
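// Worked example of the offset arithmetic above (illustrative values only):
// with size = 64 and this rank's prev = 3, next = 5, the root stores rank r's
// 2*size bytes at data + r*2*size, so
//   prev_offset = 3*2*64 + 64 = 448  -> the "next" half of rank 3's entry
//   next_offset = 5*2*64      = 640  -> the "prev" half of rank 5's entry
// i.e. each rank reads back the halves its ring neighbors published facing it.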
ncclResult_t bootstrapClose(void* commState) {
struct extState* state = (struct extState*)commState;
struct bootstrapOp bop;
bop.size = -1;
if (!state->rank) {
NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
}
NCCLCHECK(bootstrapCloseSend(state->extSendComm));
NCCLCHECK(bootstrapCloseRecv(state->extRecvComm));
free(state);
return ncclSuccess;
}
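// Minimal end-to-end sketch of this bootstrap API (hypothetical caller, not part
// of this commit; error unwinding elided). Each rank contributes one fixed-size
// record and gets every rank's record back through the root.
struct exampleRecord { int rank; int listenPort; };  // assumed payload
static ncclResult_t exampleBootstrapExchange(ncclUniqueId* id, int rank, int nranks, int myPort) {
  void* commState;
  NCCLCHECK(bootstrapInit(id, rank, nranks, &commState));
  struct exampleRecord* all;
  NCCLCHECK(ncclCalloc(&all, nranks));
  all[rank].rank = rank;
  all[rank].listenPort = myPort;
  // 'size' is the per-rank contribution; the buffer holds nranks entries.
  NCCLCHECK(bootstrapAllGather(commState, all, sizeof(struct exampleRecord)));
  // all[0..nranks-1] now holds every rank's record, in rank order.
  NCCLCHECK(bootstrapClose(commState));
  free(all);
  return ncclSuccess;
}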
@@ -1,164 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "primitives.h"
#define NUM_SUBSTEPS 4
#define NUM_BUFCHUNKS 2
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
template<int THREADS, int UNROLL, class FUNC, typename T>
__launch_bounds__(THREADS+WARP_SIZE, 1)
__global__ void BroadcastKernel(const KernelArgs<T> args) {
const int tid = threadIdx.x;
__shared__ T* sharedNextOutput;
__shared__ DevRing<T> ring;
bool pushrecv = args.pushrecv;
LoadRing<THREADS>(args.ring, &ring);
__syncthreads();
if (tid == 0) {
WaitFlag prevCommOp(ring.prevOpCounter, 0);
WaitFlag nextCommOp(ring.nextOpCounter, 0);
prevCommOp.wait(args.opIndex);
nextCommOp.wait(args.opIndex);
if (pushrecv) {
*ring.sendPtrToPrev = (T*)args.ThisOutput;
Wait([=] {
return *ring.recvPtrFromNext != nullptr;
});
sharedNextOutput = *ring.recvPtrFromNext;
*ring.recvPtrFromNext = nullptr;
}
}
__syncthreads();
WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
PostFlag postReadyToNext(ring.sendFlagToNext, 0);
typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
const int size = args.N;
const int rank = ring.userRank[0];
const int nextRank = ring.userRank[1];
const int root = args.root;
const int buffSize = args.buffSize / sizeof(T);
const int sliceSize = buffSize / NUM_BUFCHUNKS;
int step = 0;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = args.ThisInput;
T * __restrict__ thisOutput = args.ThisOutput;
T * __restrict__ prevInput = ring.recvBuffer;
T * __restrict__ nextOutput = ring.sendBuffer;
for (int offset = 0; offset < size; offset += sliceSize) {
int maxOffset = size-offset;
if (rank == root) {
Prims::Copy(
thisInput + offset,
pushrecv ? sharedNextOutput + offset : nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else if (nextRank == root) {
if (pushrecv) maxOffset = 0; // Only wait for signals
Prims::Copy(
prevInput + boffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
if (pushrecv) {
Prims::Copy(
thisOutput + offset,
sharedNextOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(
prevInput + boffset,
thisOutput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
}
NEXT_STEP; // Increases step, boffset
}
// wait for the last data to be pushed to us
if (tid == 0) {
if (nextRank != root) {
// Wait for last update from next then reset the flag
waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
*ring.recvFlagFromNext = 0;
}
if (rank != root) {
// reset the flag
*ring.recvFlagFromPrev = 0;
}
incrementOpCounter(&args);
}
}
#define THREADS 256
#define UNROLL 8
template<class FUNC, typename T>
ncclResult_t RingBroadcast(void* buff, const int count, const int root,
ncclComm* comm, cudaStream_t stream) {
if (comm->nRanks != 1) {
KernelArgs<T> args;
ArgsSetup(&args, buff, buff, root, count, comm);
LAUNCH_KERNEL(BroadcastKernel, THREADS, UNROLL, FUNC, T, args, stream);
}
return ncclSuccess;
}
template<typename T, template<typename> class RedOp>
class Broadcast {
public:
static ncclResult_t entry(const void* sendbuff, void* recvbuff,
int count, int root, ncclComm* comm, cudaStream_t stream) {
return RingBroadcast<RedOp<T>, T>(recvbuff, count, root, comm, stream);
}
};
NCCL_API(ncclResult_t, ncclBcast, void* buff, int count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ArgsCheck(buff, buff, count, datatype, ncclSum, root, comm, "Bcast"));
return enqueue<Broadcast, FuncNull>(nullptr, buff, count, datatype, root, comm, stream);
}
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
ncclSum, 0, comm, stream);
}
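// Usage sketch from the application side (illustrative; assumes 'comm' and
// 'stream' were created elsewhere, e.g. with ncclCommInitRank, and that every
// rank passes the same sendcount).
static void exampleAllGather(ncclComm_t comm, cudaStream_t stream, int nranks) {
  const size_t sendcount = 1024;  // per-rank element count (assumed)
  float *sendbuff, *recvbuff;
  cudaMalloc((void**)&sendbuff, sendcount*sizeof(float));
  cudaMalloc((void**)&recvbuff, (size_t)nranks*sendcount*sizeof(float));
  // Each rank contributes sendcount floats; recvbuff ends up holding
  // nranks*sendcount floats ordered by rank.
  ncclAllGather(sendbuff, recvbuff, sendcount, ncclFloat, comm, stream);
  cudaStreamSynchronize(stream);
  cudaFree(sendbuff); cudaFree(recvbuff);
}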
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
op, 0, comm, stream);
}
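// Usage sketch (illustrative): an in-place allreduce, i.e. sendbuff == recvbuff,
// which is supported; the single-rank shortcut above then skips the copy as well.
static void exampleAllReduceInPlace(ncclComm_t comm, cudaStream_t stream) {
  const size_t count = 1 << 20;  // assumed element count
  float* buff;
  cudaMalloc((void**)&buff, count*sizeof(float));
  ncclAllReduce(buff, buff, count, ncclFloat, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);
  cudaFree(buff);
}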
@@ -0,0 +1,42 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
}
return ncclSuccess;
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
ncclSum, root, comm, stream);
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
ncclSum, root, comm, stream);
}
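// Usage sketch (illustrative): the two entry points differ only in buffer
// handling. ncclBroadcast takes distinct buffers (sendbuff only matters on the
// root); the deprecated ncclBcast broadcasts in place.
static void exampleBroadcast(ncclComm_t comm, cudaStream_t stream, int root) {
  const size_t count = 4096;  // assumed element count
  float *sendbuff, *recvbuff;
  cudaMalloc((void**)&sendbuff, count*sizeof(float));
  cudaMalloc((void**)&recvbuff, count*sizeof(float));
  ncclBroadcast(sendbuff, recvbuff, count, ncclFloat, root, comm, stream);
  ncclBcast(recvbuff, count, ncclFloat, root, comm, stream);  // legacy in-place form
  cudaStreamSynchronize(stream);
  cudaFree(sendbuff); cudaFree(recvbuff);
}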
@@ -0,0 +1,66 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
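// Worked example (enum values assumed from nccl.h: ncclNumOps = 4,
// ncclNumTypes = 9, ncclSum = 0, ncclFloat32 = 7; ncclCollAllReduce = 4 above):
//   FUNC_INDEX(ncclCollAllReduce, ncclSum, ncclFloat32, /*ll=*/1)
//     = (((4*4 + 0)*9) + 7)*2 + 1 = 303
// which selects the LL variant of AllReduce/sum/f32 in the ncclFuncs table.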
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
#define NCCL_KERN_NAME(coll, op, dtype) \
coll##Kernel_##op##_##dtype
/* Declare all collective operations */
#define DECL_COLL4(coll, op, dtype) \
extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll);
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##LL, op, dtype) \
DECL_COLL4(coll, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
DECL_COLL3(coll, op, u8) \
DECL_COLL3(coll, op, i32) \
DECL_COLL3(coll, op, u32) \
DECL_COLL3(coll, op, i64) \
DECL_COLL3(coll, op, u64) \
DECL_COLL3(coll, op, f16) \
DECL_COLL3(coll, op, f32) \
DECL_COLL3(coll, op, f64)
#define DECL_COLL(coll) \
DECL_COLL2(coll, sum) \
DECL_COLL2(coll, prod) \
DECL_COLL2(coll, min) \
DECL_COLL2(coll, max)
#define DECL_ALL_COLLS \
DECL_COLL2(ncclBroadcast, copy) \
DECL_COLL(ncclReduce) \
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce)
DECL_ALL_COLLS
#define ALLREDUCE_SUBSTEPS 2
#define ALLREDUCE_BUFCHUNKS 2
#define ALLGATHER_SUBSTEPS 2
#define ALLGATHER_BUFCHUNKS 2
#define REDUCESCATTER_SUBSTEPS 2
#define REDUCESCATTER_BUFCHUNKS 2
#define BROADCAST_SUBSTEPS 8
#define BROADCAST_BUFCHUNKS 2
#define REDUCE_SUBSTEPS 8
#define REDUCE_BUFCHUNKS 2
#endif
@@ -0,0 +1,86 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../../makefiles/common.mk
include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
$(OBJDIR)/functions.o
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
DEPENDFILES := $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
NVCUFLAGS += -I. -I.. -I../.. -I../../include --compiler-options "-fPIC -fvisibility=hidden"
all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@printf "Archiving %-35s > %s\n" objects $@
ar cr $@ $^
# We do not want make to rebuild the *.d files when running make clean.
# So we only provide targets for the .dep files; each .dep rule produces both
# the .dep and the .d file. Only the .d files are included, while the .dep
# files keep track of what needs to be regenerated.
$(OBJDIR)/%.dep : %.cu
@mkdir -p $(OBJDIR)
@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $@
@rm -f $@.tmp
@cp $@ $(@:.dep=.d)
# Compiled kernels and collectives with relocatable device code ...
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
clean:
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "all_gather.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
#endif
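// Note: AllGather performs no reduction, so only the NCCL_OP == 0 pass of the
// four per-op compilations in the device Makefile instantiates it, and only for
// int8_t; the host side always passes byte counts (ncclInt8) for this collective.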
@@ -0,0 +1,269 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
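// With ALLGATHER_BUFCHUNKS == 2, the transfer buffer holds two slices of
// sliceSize elements; noffset ping-pongs between 0 and sliceSize while poffset
// trails it by one step, so one slice can be filled while the previous is drained.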
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = ring->recv.conn.direct;
int nextdirect = ring->send.conn.direct;
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
if (prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (*ptr == nullptr);
sharedNextOutput = (T*)*ptr;
*ptr = nullptr;
}
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
Prims::Copy(tid, nthreads,
thisInput + chunkOffset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else {
Prims::DoubleCopy(tid, nthreads,
thisInput + chunkOffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
}
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: copy to next GPU
if (prevdirect) {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
Prims::Copy(tid, nthreads,
NULL,
NULL,
0, 0,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::DoubleCopy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Here we need to copy from buffer to this output.
Prims::Copy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
}
if (tid == 0) {
waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
*ring->send.conn.head = 0ULL;
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LL::ReduceCopy(
thisInput + chunkOffset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
} else {
LL::ReduceCopy(
thisInput + chunkOffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
}
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
maxOffset, pflag, llNthreads);
ACK_PREV;
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
@@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "all_reduce.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
#endif
@@ -0,0 +1,332 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = ring->recv.conn.direct;
int nextdirect = ring->send.conn.direct;
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
if (prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (*ptr == nullptr);
sharedNextOutput = (T*)*ptr;
*ptr = nullptr;
}
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int maxOffset;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::ReduceCopy(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
// k-2 steps: copy to next GPU
if (prevdirect) {
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
Prims::Copy(tid, nthreads,
NULL,
NULL,
0, 0,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::DoubleCopy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
Prims::Copy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
}
if (tid == 0) {
// Wait for next to have consumed all data before we reset the flag
waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
*ring->send.conn.head = 0ULL;
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*nranks*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int maxOffset;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
maxOffset, pflag, llNthreads);
ACK_PREV;
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "broadcast.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
#endif
@@ -0,0 +1,228 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = ring->recv.conn.direct;
int nextdirect = ring->send.conn.direct;
WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
const ssize_t size = args->N;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
if (nextRank != root) {
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
if (rank != root && prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextRank != root && nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (*ptr == nullptr);
sharedNextOutput = (T*)*ptr;
*ptr = nullptr;
}
}
__syncthreads();
uint64_t step = 0ULL;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
Prims::Copy(tid, nthreads,
thisInput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else {
Prims::DoubleCopy(tid, nthreads,
thisInput + offset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
}
} else if (nextRank == root) {
if (prevdirect) maxOffset = 0; // Only wait for signals
Prims::Copy(tid, nthreads,
prevInput + boffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
if (prevdirect) {
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(tid, nthreads,
prevInput + boffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
}
NEXT_STEP; // Increases step, boffset
}
if (tid == 0) {
if (nextRank != root) {
// Wait for next to have consumed data before resetting the flag
waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
*ring->send.conn.head = 0ULL;
}
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
boffset += NCCL_LL_SLICE_LINES; \
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
flag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int rank = comm->rank;
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t flag = step + 1;
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (rank == root) {
WAIT_NEXT;
if (thisInput == thisOutput) {
LL::ReduceCopy(
thisInput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
} else {
LL::ReduceCopy(
thisInput + offset,
thisOutput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
}
POST_SIZE;
NEXT_STEP_LL;
} else if (nextRank == root) {
LL::ReduceCopy(
prevInput + boffset,
thisOutput + offset,
maxOffset, flag, llNthreads);
NEXT_STEP_LL;
ACK_PREV;
} else {
WAIT_NEXT;
LL::ReduceCopy(
prevInput + boffset,
thisOutput + offset,
nextOutput + boffset,
maxOffset, flag, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
ACK_PREV;
}
}
// We need everyone to acknowledge data even if they didn't receive anything
// so that the next collective can start right away.
ACK_PREV;
FIFO_CLEANING_AND_SAVE_STEP(flag);
}
@@ -0,0 +1,90 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
#include "core.h"
#include "nccl.h"
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
extern __device__ ncclKern_t ncclFuncs[];
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
__syncthreads();
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
__syncthreads();
}
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
if (tid == 0) hostColl->active = 0;
}
/* Functions for aggregation case */
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
}
/* Kernels with the first operation inlined */
#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
\
struct ncclComm* comm = firstColl.args.comm; \
struct ncclRing* ring = comm->rings+bid; \
struct ncclColl* c; \
if (bid == 0) { \
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
c = &firstColl; \
} else { \
c = &localColl; \
load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
} \
while (1) { \
if (tid < c->nThreads) { \
if (c->funcIndex == fIndex) { \
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
ncclFuncs[c->funcIndex](&c->args); \
} \
} \
int nextIndex = c->nextIndex; \
if (tid == 0) ring->collFifoHead = nextIndex; \
\
if (c->active == 2) { \
return; \
} \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
load_coll(c, ring->devCollectives+nextIndex, tid); \
} \
}
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0))
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
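// For reference, one instantiation such as
//   IMPL_COLL3(ncclAllReduce, sum, FuncSum, f32, float, ncclCollAllReduce, ncclSum, ncclFloat32)
// expands into four definitions: the __device__ functions ncclAllReduce_sum_f32
// and ncclAllReduceLL_sum_f32, plus the __global__ kernels
// ncclAllReduceKernel_sum_f32 and ncclAllReduceLLKernel_sum_f32, the latter two
// with their first operation inlined through the funcIndex comparison above.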
#endif
@@ -0,0 +1,372 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "core.h"
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>
// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
typedef uint64_t PackType;
// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const;
};
template<class FUNC>
struct MULTI<FUNC, int8_t> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of uint32_t.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
// for char, we do these as vector ops
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, uint8_t> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of uint32_t.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
// for char, we do these as vector ops
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, int32_t> {
static_assert(sizeof(PackType) == 2 * sizeof(int32_t),
"PackType must be twice the size of int.");
union converter {
PackType storage;
struct {
int32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, uint32_t> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of int.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, half> {
static_assert(sizeof(PackType) == 4 * sizeof(half),
"PackType must be four times the size of half.");
struct PackHalf2 {
half2 a, b;
};
__device__ PackType operator()(const PackType x, const PackType y) const {
struct PackHalf2 cx, cy, cr;
cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return *(reinterpret_cast<PackType*>(&cr));
}
};
template<class FUNC>
struct MULTI<FUNC, float> {
static_assert(sizeof(PackType) == 2 * sizeof(float),
"PackType must be twice the size of float.");
union converter {
PackType storage;
struct {
float a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, double> {
static_assert(sizeof(PackType) == sizeof(double),
"PackType must be the same size as double.");
__device__ PackType operator()(const PackType x, const PackType y) const {
double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
return __double_as_longlong(rv);
}
};
template<class FUNC>
struct MULTI<FUNC, uint64_t> {
static_assert(sizeof(PackType) == sizeof(uint64_t),
"PackType must be the same size as uint64_t.");
__device__ PackType operator()(const PackType x, const PackType y) const {
uint64_t rv = FUNC()(x, y);
return rv;
}
};
template<class FUNC>
struct MULTI<FUNC, int64_t> {
static_assert(sizeof(PackType) == sizeof(int64_t),
"PackType must be the same size as int64_t.");
__device__ PackType operator()(const PackType x, const PackType y) const {
int64_t rv = FUNC()((int64_t)x, (int64_t)y);
return rv;
}
};
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
size_t ptrval = reinterpret_cast<size_t>(ptr);
return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}
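// Example: ALIGNUP(13, 8) == 16 and ALIGNUP(16, 8) == 16; AlignUp() applies the
// same rounding to a pointer value, here used with alignof(Pack128) == 16.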
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
}
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#if CUDART_VERSION < 9000
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r.x = ptr->x;
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
ptr->x = val.x;
}
#else
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r = ((half*)ptr)[0];
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
((half*)ptr)[0] = val;
}
#endif
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__device__ inline void ReduceCopy(
const int tid, const int nthreads,
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int N) {
for (int idx = tid; idx < N; idx += nthreads) {
T val = vFetch(src0+idx);
if (TWO_INPUTS) {
val = FUNC()(val, vFetch(src1+idx));
}
vStore(dest0+idx, val);
if (TWO_OUTPUTS) {
vStore(dest1+idx, val);
}
}
}
typedef ulong2 Pack128;
template<class FUNC, typename T>
struct MULTI128 {
__device__ void operator()(Pack128& x, Pack128& y) {
x.x = MULTI<FUNC, T>()(x.x, y.x);
x.y = MULTI<FUNC, T>()(x.y, y.y);
}
};
inline __device__ void Fetch128(Pack128& v, Pack128* p) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
}
inline __device__ void Store128(Pack128* p, Pack128& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
}
#define WARP_SIZE 32
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
const int N) {
Pack128 t0[UNROLL];
Pack128 t1[UNROLL];
const Pack128* src0_end = src0 + N;
const int inc = nw * UNROLL * WARP_SIZE;
const int offset = w * UNROLL * WARP_SIZE + t;
src0 += offset; if (TWO_INPUTS) src1 += offset;
dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
while (src0 < src0_end) {
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
Fetch128(t0[u], src0+u*WARP_SIZE);
if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
}
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
Store128(dest0+u*WARP_SIZE, t0[u]);
if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
}
src0 += inc; if (TWO_INPUTS) src1 += inc;
dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
}
}
template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
int N) {
int Nrem = N;
if (Nrem <= 0) return;
int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
// stage 0: check if we'll be able to use the fast, 128-bit aligned path.
// If not, we'll just use the slow preamble path for the whole operation
bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
(!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
(!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
if (!alignable) {
Npreamble = Nrem;
}
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
int w = tid / WARP_SIZE; // Warp number
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
const int PackFactor = sizeof(Pack128) / sizeof(T);
// stage 2a: main loop
int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
* (UNROLL * nthreads); // round down
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
int Ndone2a = Nalign2a * PackFactor;
Nrem -= Ndone2a;
if (Nrem == 0) return;
dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
// stage 2b: slightly less optimized for section when we don't have full
// UNROLLs
int Nalign2b = Nrem / PackFactor;
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
int Ndone2b = Nalign2b * PackFactor;
Nrem -= Ndone2b;
if (Nrem == 0) return;
dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
// stage 2c: tail
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
}
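// Worked example (assumed: T = float, UNROLL = 4, nthreads = 256, N = 10003,
// all pointers already 16-byte aligned): PackFactor = 16/4 = 4, so the preamble
// handles 0 elements, stage 2a handles 2048 Pack128 = 8192 elements, stage 2b
// handles 452 Pack128 = 1808 elements, and the tail (2c) copies the remaining 3.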
#endif // NCCL_COMMON_KERNEL_H_
@@ -0,0 +1,64 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "collectives.h"
#include "common.h"
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype) \
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Must be consistent with ncclColl_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
};
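The table is laid out exactly as the macro nesting implies: collectives outermost, then reduction ops, then data types, then the regular/LL pair emitted by NCCL_FUNC4. A flattened lookup therefore follows ((coll*numOps + op)*numTypes + type)*2 + ll. The small host-side helper below illustrates that ordering; the helper name and the numOps/numTypes defaults are illustrative, not taken from the NCCL sources.

// Illustrative sketch, not part of the NCCL sources: the index order implied
// by the macro nesting above (collective, then op, then type, then LL bit).
#include <cstdio>

static int funcIndex(int coll, int op, int type, int ll,
                     int numOps = 4, int numTypes = 9) {
  return ((coll * numOps + op) * numTypes + type) * 2 + ll;
}

int main() {
  // e.g. ncclAllReduce (coll 4), sum (op 0), f32 (type 7), LL variant
  printf("index = %d\n", funcIndex(4, 0, 7, 1));
  return 0;
}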

View File

@ -0,0 +1,154 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_LL_KERNEL_H_
#define NCCL_LL_KERNEL_H_
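// Each ncclLLFifoLine packs 16 bytes as {data1, flag1, data2, flag2}: two
// 32-bit payload words, each paired with a 32-bit flag. readLL() spins until
// both flags equal the expected step value, so a single 128-bit volatile load
// serves as both the data transfer and the arrival check, which is what makes
// the LL ("low latency") path cheap for small messages.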
static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
uint32_t data1, flag1, data2, flag2;
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
return val64;
}
static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
}
// Using memcpy handles misaligned pointers.
static __device__ uint64_t readAL(uint64_t* src) {
uint64_t val;
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
return val;
}
static __device__ void storeAL(uint64_t* dst, uint64_t val) {
memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
}
template <typename T, class FUNC>
class LLPrimitives {
private:
template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
if (size <= 0) return;
size_t size64 = size * sizeof(T) / sizeof(uint64_t);
uint64_t* src1A = (uint64_t*)src1;
uint64_t* dst1A = (uint64_t*)dst1;
int offset = threadIdx.x;
// Do multiples of 64 bits
#pragma unroll 1
for (; offset < size64; offset += nthreads) {
uint64_t val;
if (HAS_SRC1) {
val = readAL(src1A+offset);
if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
} else if (HAS_SRC2) {
val = readLL(src2+offset, iflag);
}
if (HAS_DST1) storeAL(dst1A+offset, val);
if (HAS_DST2) storeLL(dst2+offset, val, oflag);
}
// Finish last word
int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
int sizeRem = size - sizeDone;
if (threadIdx.x == 0 && sizeRem) {
const T* src1B = src1 + sizeDone;
T* dst1B = dst1 + sizeDone;
uint64_t lastVal;
T* vals = (T*)&lastVal;
if (HAS_SRC2) {
uint64_t lastVal2 = readLL(src2+size64, iflag);
T* src2B = (T*)&lastVal2;
for (int offset = 0; offset < sizeRem; offset++) {
vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
}
} else if (HAS_SRC1) {
for (int offset = 0; offset < sizeRem; offset++) {
vals[offset] = src1B[offset];
}
}
if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
if (HAS_DST1) {
for (int offset = 0; offset < sizeRem; offset++) {
dst1B[offset] = vals[offset];
}
}
}
}
public:
static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
}
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
}
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
}
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
}
static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
}
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
}
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
}
};
// Common macros
#define STEP_TO_SLOT(step) \
(step % NCCL_LL_CHUNKS)
#define WAIT_NEXT \
if (tid == 0) { \
while (sendHead + NCCL_LL_CHUNKS <= step) { \
sendHead = sendHeadPtr[0]; \
} \
} \
asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
#define POST_SIZE \
if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
#define ACK_PREV \
asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
if (tid == 0) recvHeadPtr[0] = step;
#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
/* Reset all flags */ \
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
} \
__threadfence_system(); \
/* Restart from the same slot, only make sure sender waits for data to be reset */ \
step += NCCL_LL_CHUNKS; \
ACK_PREV; \
while (sendHeadPtr[0] < step); \
if (tid == 0) ring->send.conn.llLastCleaning = step; \
} \
ring->send.conn.llStep = step; \
} while (0);
#endif
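The macros above manage a small ring of NCCL_LL_CHUNKS slots inside the LL buffer: each step maps to one slot, writes its lines starting at NCCL_LL_SLICE_LINES * slot, and tags them with flag = step + 1 so stale data from an earlier pass around the ring is never mistaken for fresh data. The host-side sketch below walks through that mapping using placeholder constants (plain C++, not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: step -> slot/offset/flag.
// kChunks and kSliceLines are placeholders for NCCL_LL_CHUNKS / NCCL_LL_SLICE_LINES.
#include <cstdio>
#include <cstdint>

int main() {
  const int kChunks = 8, kSliceLines = 128;
  for (uint64_t step = 0; step < 10; ++step) {
    int slot = (int)(step % kChunks);     // STEP_TO_SLOT(step)
    int boffset = kSliceLines * slot;     // first fifo line used by this step
    uint32_t flag = (uint32_t)step + 1;   // value written next to each data word
    printf("step %llu -> slot %d, offset %d, flag %u\n",
           (unsigned long long)step, slot, boffset, flag);
  }
  return 0;
}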

View File

@ -0,0 +1,226 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PRIMITIVES_H_
#define NCCL_PRIMITIVES_H_
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
*
* To reduce the repetition of template arguments, the operations
* are bundled as static methods of the Primitives class.
*
* Each primitive operation copies/reduces a contiguous buffer and syncs
* an optional set of flags against a sub-step counter. The sync value is
* based on the step parameter. Sync flags must be of type WaitFlag or
* PostFlag. The primitive routines wait for all WaitFlag args to attain
* at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
* corresponding substep by previous step) before executing the transfer.
* After each substep is transferred, all PostFlag arguments get updated to
* the value SUBSTEPS*step+substep+1.
*/
class WaitFlag {
volatile uint64_t * const flag;
const int shift;
public:
__device__ __forceinline__
WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
__device__ __forceinline__
void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; }
};
class PostFlag {
volatile uint64_t * const flag;
const int shift;
volatile int * const fifo;
const int fifo_size;
public:
__device__ __forceinline__
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
__device__ __forceinline__
void post(uint64_t val) { *flag = (val - shift); }
__device__ __forceinline__
void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; };
};
// Helper to check if any argument is of type T.
// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
template<typename T> __device__ __forceinline__
bool AnyAre() { return false; }
template<typename T, typename FIRST_T, typename... TAIL_Ts>
__device__ __forceinline__
bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
}
// Wait on all WaitFlags, ignore PostFlags
__device__ __forceinline__
void WaitOnFlags(uint64_t val) { }
template <typename... TAIL_Ts> __device__ __forceinline__
void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
flag.wait(val);
WaitOnFlags(val, tail...);
}
template <typename... TAIL_Ts> __device__ __forceinline__
void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
WaitOnFlags(val, tail...);
}
// Post all PostFlags, ignore WaitFlags
__device__ __forceinline__
void PostToFlags(uint64_t val) { }
template <typename... TAIL_Ts> __device__ __forceinline__
void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
PostToFlags(val, tail...);
}
template <typename... TAIL_Ts> __device__ __forceinline__
void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
flag.post(val);
PostToFlags(val, tail...);
}
// Post sizes for PostFlags, ignore WaitFlags
__device__ __forceinline__
void PostSizeToFlags(uint64_t step, int size) { }
template <typename... TAIL_Ts> __device__ __forceinline__
void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
PostSizeToFlags(step, size, tail...);
}
template <typename... TAIL_Ts> __device__ __forceinline__
void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
flag.postSize(step, size);
PostSizeToFlags(step, size, tail...);
}
// Create pointer arithmetic syntax that doesn't break for nullptr_t
template <typename Tptr> __device__ __forceinline__
Tptr ptradd(Tptr ptr, int i) {
return ptr + i;
}
__device__ __forceinline__
nullptr_t ptradd(nullptr_t ptr, int i) {
return nullptr;
}
// Implementation of primitive types
template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
class Primitives {
private:
template <typename SRC2_T, // either T* or nullptr_t
typename DST2_T, // either T* or nullptr_t
typename... SYNC_Ts> // either WaitFlag or PostFlag
static __device__ __forceinline__ void
GenericOp(const int tid, const int nthreads,
const T* src1,
const SRC2_T src2,
T* dst1,
DST2_T dst2,
int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
enum { noSrc2 = std::is_same<SRC2_T, nullptr_t>::value };
enum { noDst2 = std::is_same<DST2_T, nullptr_t>::value };
static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
"src2 must be of type T* or nullptr_t");
static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
"dst2 must be of type T* or nullptr_t");
using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
int sliceSize = len / SUBSTEPS;
int sliceOffset = 0;
#pragma unroll 1
for (int sub=0; sub<SUBSTEPS; ++sub) {
int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
if (tid < nthreads) {
if (AnyAre<WaitFlag>(flags...)) {
if (tid == 0) {
WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
}
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
ReduceOrCopy
<
UNROLL,
OpType,
T,
!std::is_same<DST2_T, nullptr_t>::value, // HAS_DEST1
!std::is_same<SRC2_T, nullptr_t>::value // HAS_SRC1
>
(
tid, nthreads,
ptradd(dst1, sliceOffset),
ptradd(dst2, sliceOffset),
ptradd(src1, sliceOffset),
ptradd(src2, sliceOffset),
realSize
);
if (AnyAre<PostFlag>(flags...)) {
__syncthreads();
}
} else {
if (AnyAre<PostFlag>(flags...)) {
__syncthreads();
PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
__threadfence_system();
PostToFlags(SUBSTEPS*step + sub + 1, flags...);
}
}
sliceOffset += sliceSize;
}
}
public:
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
Copy(const int tid, const int nthreads, const T* src, T* dst,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
}
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
}
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
}
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
}
};
#endif // end include guard
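The flags implement a simple credit scheme: before substep sub of step step, a producer waits until the consumer's counter plus the constant shift passed at construction reaches SUBSTEPS*step + sub + 1, and once the transfer is done the consumer posts that same value (minus its own shift). Choosing the shift as (BUFCHUNKS-1)*SUBSTEPS lets the producer run ahead by BUFCHUNKS-1 buffer chunks before it ever blocks. A simplified host-side sketch of that arithmetic, with placeholder constants (not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: the WaitFlag/PostFlag
// credit arithmetic, with the consumer assumed to catch up after every substep.
#include <cstdio>
#include <cstdint>

int main() {
  const int SUBSTEPS = 4, BUFCHUNKS = 2;
  const int shift = (BUFCHUNKS - 1) * SUBSTEPS;    // credit handed to the producer
  uint64_t consumerFlag = 0;                       // what the consumer has posted
  for (uint64_t step = 0; step < 2; ++step) {
    for (int sub = 0; sub < SUBSTEPS; ++sub) {
      uint64_t need = SUBSTEPS * step + sub + 1;   // value WaitOnFlags() requires
      bool block = (consumerFlag + shift) < need;  // WaitFlag::wait() condition
      printf("step %llu sub %d: need %llu, have %llu+%d -> %s\n",
             (unsigned long long)step, sub, (unsigned long long)need,
             (unsigned long long)consumerFlag, shift, block ? "wait" : "go");
      consumerFlag = need;                         // pretend the consumer posts completion
    }
  }
  return 0;
}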

View File

@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "reduce.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
#endif

View File

@ -0,0 +1,190 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
if (rank != root) {
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
}
__syncthreads();
uint64_t step = 0ULL;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (prevRank == root) {
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else if (rank == root) {
Prims::Reduce(tid, nthreads,
prevInput + boffset,
thisInput + offset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
Prims::Reduce(tid, nthreads,
prevInput + boffset,
thisInput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
NEXT_STEP; // Increases step, boffset
}
if (tid == 0) {
if (rank != root) {
// Wait for next to have consumed data before resetting the flag
waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
*ring->send.conn.head = 0ULL;
}
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
boffset += NCCL_LL_SLICE_LINES; \
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
flag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int nranks = comm->nRanks;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t flag = step + 1;
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (prevRank == root) {
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
} else if (rank == root) {
LL::ReduceCopy(
thisInput + offset,
prevInput + boffset,
thisOutput + offset,
maxOffset, flag, llNthreads);
NEXT_STEP_LL;
ACK_PREV;
} else {
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + boffset,
nextOutput + boffset,
maxOffset, flag, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
ACK_PREV;
}
}
// We need everyone to acknowledge data even if they didn't receive anything
// so that the next collective can start right away.
ACK_PREV;
FIFO_CLEANING_AND_SAVE_STEP(flag);
}
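In the ring reduce above each rank plays one of three roles: the rank sitting right after the root (prevRank == root) only pushes its own input into the next rank's buffer, intermediate ranks add their input to the partial sum arriving from the previous rank and forward it, and the root adds its input last and writes the final result to its output. A host-side simulation of one chunk travelling around a 4-rank ring; the values and the identity ring order are illustrative, not from the NCCL sources:

// Illustrative sketch, not part of the NCCL sources: per-rank roles in the
// ring reduce, simulated on the host for a single chunk.
#include <cstdio>

int main() {
  const int nranks = 4, root = 2;
  int input[nranks] = {1, 2, 3, 4};
  int carry = 0;
  // walk the ring starting at the rank after root and ending at root
  for (int i = 0; i < nranks; ++i) {
    int rank = (root + 1 + i) % nranks;
    int prevRank = (rank + nranks - 1) % nranks;
    if (prevRank == root) {
      carry = input[rank];          // Prims::Copy: push own data only
    } else if (rank == root) {
      carry += input[rank];         // Prims::Reduce into thisOutput
      printf("root %d holds the full sum %d\n", rank, carry);
    } else {
      carry += input[rank];         // Prims::Reduce into nextOutput
    }
  }
  return 0;
}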

View File

@ -0,0 +1,364 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_REDUCE_KERNEL_H_
#define NCCL_REDUCE_KERNEL_H_
#include "common_kernel.h"
#include <limits>
template<typename T>
struct FuncNull {
__device__ T operator()(const T x, const T y) const {
return 0;
}
};
template<typename T>
struct FuncSum {
__device__ T operator()(const T x, const T y) const {
return x + y;
}
};
template<typename T>
struct FuncProd {
__device__ T operator()(const T x, const T y) const {
return x * y;
}
};
template<typename T>
struct FuncMax {
__device__ T operator()(const T x, const T y) const {
return (x < y) ? y : x;
}
};
template<typename T>
struct FuncMin {
__device__ T operator()(const T x, const T y) const {
return (x < y) ? x : y;
}
};
template<>
struct FuncSum<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x + cy.a.x;
cr.a.y = cx.a.y + cy.a.y;
cr.a.z = cx.a.z + cy.a.z;
cr.a.w = cx.a.w + cy.a.w;
return cr.storage;
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return x+y;
}
};
template<>
struct FuncSum<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x + cy.a.x;
cr.a.y = cx.a.y + cy.a.y;
cr.a.z = cx.a.z + cy.a.z;
cr.a.w = cx.a.w + cy.a.w;
return cr.storage;
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return x+y;
}
};
static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
/* This can be used both for signed and unsigned 8-bit multiplication */
#if (__CUDA_ARCH__ >= 300)
uint32_t rv;
asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
" vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
" vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
" shl.b32 t3, t3, 16;\n\t"
" shl.b32 t2, t2, 16;\n\t"
" vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
" shl.b32 t1, t1, 8;\n\t"
" vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
" and.b32 t1, t1, 0xff00ff00;\n\t"
" and.b32 t0, t0, 0x00ff00ff;\n\t"
" or.b32 %0, t0, t1;\n\t"
"}" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
union converter { uint32_t storage; char4 a; };
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x * cy.a.x;
cr.a.y = cx.a.y * cy.a.y;
cr.a.z = cx.a.z * cy.a.z;
cr.a.w = cx.a.w * cy.a.w;
return cr.storage;
#endif
}
template<>
struct FuncProd<int8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
return mulChar4(x, y);
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return x*y;
}
};
template<>
struct FuncProd<uint8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
return mulChar4(x, y);
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return x*y;
}
};
template<>
struct FuncMax<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = max(cx.a.x, cy.a.x);
cr.a.y = max(cx.a.y, cy.a.y);
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return (x>y) ? x : y;
}
};
template<>
struct FuncMax<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = max(cx.a.x, cy.a.x);
cr.a.y = max(cx.a.y, cy.a.y);
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return (x>y) ? x : y;
}
};
template<>
struct FuncMin<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = min(cx.a.x, cy.a.x);
cr.a.y = min(cx.a.y, cy.a.y);
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return (x<y) ? x : y;
}
};
template<>
struct FuncMin<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = min(cx.a.x, cy.a.x);
cr.a.y = min(cx.a.y, cy.a.y);
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return (x<y) ? x : y;
}
};
template<>
struct FuncSum<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hadd2(x, y);
#else
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fx.x + fy.x;
fr.y = fx.y + fy.y;
return __float22half2_rn(fr);
#endif
}
__device__ half operator()(const half x, const half y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hadd(x, y);
#else
return __float2half( __half2float(x) + __half2float(y) );
#endif
}
};
template<>
struct FuncProd<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hmul2(x, y);
#else
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fx.x * fy.x;
fr.y = fx.y * fy.y;
return __float22half2_rn(fr);
#endif
}
__device__ half operator()(const half x, const half y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hmul(x, y);
#else
return __float2half( __half2float(x) * __half2float(y) );
#endif
}
};
template<>
struct FuncMax<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fmaxf(fx.x, fy.x);
fr.y = fmaxf(fx.y, fy.y);
return __float22half2_rn(fr);
}
__device__ half operator()(const half x, const half y) const {
float fx, fy, fm;
fx = __half2float(x);
fy = __half2float(y);
fm = fmaxf(fx, fy);
return __float2half(fm);
}
};
template<>
struct FuncMin<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fminf(fx.x, fy.x);
fr.y = fminf(fx.y, fy.y);
return __float22half2_rn(fr);
}
__device__ half operator()(const half x, const half y) const {
float fx, fy, fm;
fx = __half2float(x);
fy = __half2float(y);
fm = fminf(fx, fy);
return __float2half(fm);
}
};
#endif // REDUCE_KERNEL_H_
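For 8-bit integer types the specializations above process four lanes per 32-bit register: architectures from sm_30 up to sm_70 use the PTX video instructions (packed vadd4/vmax4/vmin4 below sm_50, their per-byte variants from sm_50 to sm_70), and everything else falls back to unpacking through a small union. The host-side sketch below reproduces that fallback for packed unsigned 8-bit addition (plain C++, not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: the union-based fallback,
// reproduced on the host for packed unsigned 8-bit addition.
#include <cstdio>
#include <cstdint>

static uint32_t add4u8(uint32_t x, uint32_t y) {
  union { uint32_t storage; uint8_t a[4]; } cx, cy, cr;
  cx.storage = x;
  cy.storage = y;
  for (int i = 0; i < 4; ++i)
    cr.a[i] = (uint8_t)(cx.a[i] + cy.a[i]);  // per-lane add, wraps modulo 256
  return cr.storage;
}

int main() {
  printf("0x%08x\n", add4u8(0x01020304u, 0x10203040u));  // expected 0x11223344
  return 0;
}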

View File

@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "reduce_scatter.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
#endif

View File

@ -0,0 +1,217 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this rank's output
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
thisOutput + chunkOffset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
if (tid == 0) {
waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
*ring->send.conn.head = 0ULL;
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
thisOutput + chunkOffset,
maxOffset, pflag, llNthreads);
ACK_PREV;
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
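The loop above sends each contribution once around the ring: step 0 pushes the chunk owned by devUserRanks[nranks-1], the k-2 middle steps reduce-and-forward the chunks owned by devUserRanks[nranks-j], and the final step reduces into this rank's own output chunk, devUserRanks[0]. The host-side sketch below prints that visiting order for one rank; an identity ring order is assumed for illustration and is not from the NCCL sources:

// Illustrative sketch, not part of the NCCL sources: the order in which one
// rank visits chunk owners (rankDest) in the reduce-scatter loop above.
#include <cstdio>

int main() {
  const int nranks = 4, rank = 1;
  int devUserRanks[nranks];
  for (int i = 0; i < nranks; ++i) devUserRanks[i] = (rank + i) % nranks;
  printf("rank %d: push chunk of rank %d", rank, devUserRanks[nranks - 1]);
  for (int j = 2; j < nranks; ++j)
    printf(", reduce+forward chunk of rank %d", devUserRanks[nranks - j]);
  printf(", finish chunk of rank %d (own output)\n", devUserRanks[0]);
  return 0;
}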

src/collectives/reduce.cu (new file, 33 lines)
View File

@ -0,0 +1,33 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
op, root, comm, stream);
}
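For reference, a minimal single-process use of the entry point above might look as follows; it assumes two visible GPUs, uses ncclCommInitAll plus a grouped launch, and omits all error checking (illustrative only, not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: one ncclReduce per rank
// in a single process, with all partial sums reduced onto root 0.
#include <nccl.h>
#include <cuda_runtime.h>

int main() {
  const int nDev = 2, root = 0;
  const size_t count = 1 << 20;
  int devs[nDev] = {0, 1};
  ncclComm_t comms[nDev];
  ncclCommInitAll(comms, nDev, devs);

  float* sendbuff[nDev]; float* recvbuff[nDev]; cudaStream_t streams[nDev];
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc(&sendbuff[i], count * sizeof(float));
    cudaMalloc(&recvbuff[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // One call per communicator, grouped so the launches are treated as one op.
  ncclGroupStart();
  for (int i = 0; i < nDev; ++i)
    ncclReduce(sendbuff[i], recvbuff[i], count, ncclFloat, ncclSum,
               root, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
  }
  for (int i = 0; i < nDev; ++i) ncclCommDestroy(comms[i]);
  return 0;
}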

View File

@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
op, 0, comm, stream);
}
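Here recvcount is the per-rank output size, so each rank's sendbuff must hold recvcount * nranks elements laid out in rank order, which is why the proxy and kernel above are sized with nbytes*comm->nRanks. A short fragment continuing the ncclReduce example above (same comms, streams and buffers; illustrative only, not from the NCCL sources):

// Illustrative fragment, not part of the NCCL sources: assumes the comms,
// streams and buffers from the previous sketch; each rank receives recvcount
// elements, so sendbuff holds recvcount * nDev elements per rank.
const size_t recvcount = count / nDev;   // count assumed divisible by nDev
ncclGroupStart();
for (int i = 0; i < nDev; ++i)
  ncclReduceScatter(sendbuff[i], recvbuff[i], recvcount, ncclFloat, ncclSum,
                    comms[i], streams[i]);
ncclGroupEnd();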

View File

@ -1,115 +0,0 @@
/*************************************************************************
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_COLL_H_
#define COMMON_COLL_H_
#include "core.h"
static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
if (err != cudaSuccess || attr.devicePointer == NULL) {
WARN("%s : %s is not a valid pointer\n", opname, ptrname);
return ncclInvalidDevicePointer;
}
if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
WARN("%s : %s allocated on device %d mismatchs with NCCL device %d \n", opname, ptrname, attr.device, comm->cudaDev);
return ncclInvalidDevicePointer;
}
return ncclSuccess;
}
static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
if (ptr == NULL) {
WARN("%s : %s argument is NULL", opname, ptrname);
return ncclInvalidArgument;
}
return ncclSuccess;
}
static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, int count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
NCCLCHECK(PtrCheck(comm, opname, "comm"));
// First, the easy ones
if (root < 0 || root >= comm->nRanks) {
WARN("%s : invalid root %d (root should be in the 0..%d range)\n", opname, root, comm->nRanks);
return ncclInvalidRank;
}
if (type < 0 || type >= nccl_NUM_TYPES) {
WARN("%s : invalid type %d\n", opname, type);
return ncclInvalidType;
}
if (op < 0 || op >= nccl_NUM_OPS) {
WARN("%s : invalid reduction operation %d\n", opname, op);
return ncclInvalidOperation;
}
if (count < 0) {
WARN("%s : invalid count %d\n", opname, count);
return ncclInvalidArgument;
}
// Check pointers
NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname))
if (strcmp(opname, "Reduce") == 0 && comm->rank != root) {
// No need to check recvbuff pointer for non-root reduce
return ncclSuccess;
}
NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname))
return ncclSuccess;
}
// Kernel launch
template<typename T>
struct KernelArgs {
// general parameters
int nRanks;
int root;
int buffSize;
int N;
int opIndex;
volatile int * __restrict__ opCounter;
int * __restrict__ doneCount;
bool pushrecv;
// some pre-computed sizes
int SliceSize;
int SliceOffset;
int ChunkSize;
int NumChunks;
// local and remote input, output, and buffer
const T * __restrict__ ThisInput;
T * __restrict__ ThisOutput;
DevRing<char>* ring;
};
template<typename T>
void ArgsSetup(KernelArgs<T> *args, const void* sendbuff, void* recvbuff,
const int root, const int count, ncclComm *comm) {
args->nRanks = comm->nRanks;
args->root = root;
args->buffSize = comm->buffSize;
args->N = count;
args->opIndex = comm->opSched;
args->opCounter = comm->opCounter;
args->ThisInput = (const T*)sendbuff;
args->ThisOutput = (T*)recvbuff;
args->ring = comm->devRing;
args->pushrecv = comm->globalMemSpace;
}
#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \
args, stream) do { \
dim3 grid(1, 1, 1); \
dim3 block(THREADS+1, 1, 1); \
void* argptrs[] = {&args}; \
CUDACHECK(cudaLaunchKernel( \
(void*)K<THREADS, UNROLL, FUNC, T>, \
grid, block, argptrs, 0, stream), ncclUnhandledCudaError); \
} while (0)
#endif

View File

@ -1,362 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_KERNEL_H_
#define COMMON_KERNEL_H_
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>
// BAR macro and helpers
#define WARP_SIZE 32
#define ROUNDUP(x, y) \
(((((x) + (y) - 1) / (y))) * (y))
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define BAR_EXEC(type, barid, nthreads) \
asm("bar." #type " " #barid ", " #nthreads ";\n\t")
#define BAR_EXPAND(type, barid, nthreads) \
BAR_EXEC(type, barid, (nthreads))
// Named barrier macro.
// Expands to asm("bar.type barid, nthreads") where
// nthreads has been rounded up to WARP_SIZE.
#define BAR(type, barid, nthreads) \
BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
}
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#ifdef CUDA_HAS_HALF
#if CUDART_VERSION < 9000
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r.x = ptr->x;
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
ptr->x = val.x;
}
#else
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
return *((half*)ptr);
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
*((half*)ptr) = val;
}
#endif
#endif
__device__ unsigned int spinct;
// Spin wait until func evaluates to true
template<typename FUNC>
__device__ inline void Wait(const FUNC& func) {
while (!func()) {
// waste time
atomicInc(&spinct, 10);
}
}
typedef uint64_t PackType;
// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const;
};
template<class FUNC>
struct MULTI<FUNC, char> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of uint32_t.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
// for char, we do these as vector ops
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, int> {
static_assert(sizeof(PackType) == 2 * sizeof(int),
"PackType must be twice the size of int.");
union converter {
PackType storage;
struct {
int a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
#ifdef CUDA_HAS_HALF
template<class FUNC>
struct MULTI<FUNC, half> {
static_assert(sizeof(PackType) == 4 * sizeof(half),
"PackType must be four times the size of half.");
struct PackHalf2 {
half2 a, b;
};
__device__ PackType operator()(const PackType x, const PackType y) const {
struct PackHalf2 cx, cy, cr;
cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return *(reinterpret_cast<PackType*>(&cr));
}
};
#endif
template<class FUNC>
struct MULTI<FUNC, float> {
static_assert(sizeof(PackType) == 2 * sizeof(float),
"PackType must be twice the size of float.");
union converter {
PackType storage;
struct {
float a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, double> {
static_assert(sizeof(PackType) == sizeof(double),
"PackType must be the same size as double.");
__device__ PackType operator()(const PackType x, const PackType y) const {
double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
return __double_as_longlong(rv);
}
};
template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
static_assert(sizeof(PackType) == sizeof(unsigned long long),
"PackType must be the same size as unsigned long long.");
__device__ PackType operator()(const PackType x, const PackType y) const {
unsigned long long rv = FUNC()(x, y);
return rv;
}
};
template<class FUNC>
struct MULTI<FUNC, long long> {
static_assert(sizeof(PackType) == sizeof(long long),
"PackType must be the same size as long long.");
__device__ PackType operator()(const PackType x, const PackType y) const {
long long rv = FUNC()((long long)x, (long long)y);
return rv;
}
};
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__device__ inline void ReduceCopy(
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int idx) {
T val = vFetch(src0+idx);
if (TWO_INPUTS) {
val = FUNC()(val, vFetch(src1+idx));
}
vStore(dest0+idx, val);
if (TWO_OUTPUTS) {
vStore(dest1+idx, val);
}
}
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL, int THREADS>
__device__ inline void ReduceCopy64b(
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int offset) {
PackType t0[UNROLL];
PackType t1[UNROLL];
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
int idx = offset + u*THREADS;
t0[u] = (reinterpret_cast<const volatile PackType *>(src0))[idx];
if (TWO_INPUTS) {
t1[u] = (reinterpret_cast<const volatile PackType *>(src1))[idx];
}
}
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
int idx = offset + u*THREADS;
PackType val = TWO_INPUTS ? MULTI<FUNC, T>()(t0[u], t1[u]) : t0[u];
(reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
if (TWO_OUTPUTS) {
(reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
}
}
}
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
size_t ptrval = reinterpret_cast<size_t>(ptr);
return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid,
volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
int N) {
if (N<=0) {
return;
}
int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0;
// stage 0: check if we'll be able to use the fast, 64-bit aligned path.
// If not, we'll just use the slow preamble path for the whole operation
bool alignable = (((AlignUp(src0, alignof(PackType)) == src0 + Npreamble)) &&
(!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
(!HAS_SRC1 || (AlignUp(src1, alignof(PackType)) == src1 + Npreamble)));
if (!alignable) {
Npreamble = N;
}
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
for (int idx = tid; idx < Npreamble; idx += THREADS) {
// ought to be no way this is ever more than one iteration, except when
// alignable is false
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
}
// stage 2: fast path: use 64b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 64-bit alignable.
if (alignable) {
const int PackFactor = sizeof(PackType) / sizeof(T);
int Nrem = N - Npreamble;
dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
// stage 2a: main loop
int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS))
* (UNROLL * THREADS); // round down
#pragma unroll 1 // don't unroll this loop
for (int idx = tid; idx < Nalign2a; idx += UNROLL * THREADS) {
ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL, THREADS>(src0, src1, dest0, dest1, idx);
}
int Ndone2a = Nalign2a * PackFactor;
Nrem -= Ndone2a;
// stage 2b: slightly less optimized for section when we don't have full
// UNROLLs
int Nalign2b = Nrem / PackFactor;
#pragma unroll 4
for (int idx = Nalign2a + tid; idx < Nalign2a + Nalign2b; idx += THREADS) {
ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, 1, 0>(src0, src1, dest0, dest1, idx);
}
int Ndone2b = Nalign2b * PackFactor;
Nrem -= Ndone2b;
int Ndone2 = Ndone2a + Ndone2b;
dest0 += Ndone2; if (HAS_DEST1) { dest1 += Ndone2; }
src0 += Ndone2; if (HAS_SRC1) { src1 += Ndone2; }
// stage 2c: tail
for (int idx = tid; idx < Nrem; idx += THREADS) {
// never ought to make it more than one time through this loop. only a
// few threads should even participate
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
}
} // done fast path
}
template <typename T>
__device__ inline void incrementOpCounter(const KernelArgs<T> *args) {
// increment comm's operation counts
__threadfence_system(); // Technically need to ensure that cleared flags
// are visible before incrementing op counter.
*args->opCounter = args->opIndex+1;
}
template <int THREADS, typename T> __device__ __forceinline__
void LoadRing(const DevRing<char>* src, DevRing<T>* dst) {
enum { NUM_WORDS = sizeof(DevRing<char>) / sizeof(long long) };
static_assert(sizeof(DevRing<char>) % sizeof(long long) == 0, "Bad alignment");
static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing");
static_assert(sizeof(DevRing<char>) == sizeof(DevRing<T>), "DevRing size mismatch");
long long* lldst = reinterpret_cast<long long*>(dst);
const long long* llsrc = reinterpret_cast<const long long*>(src);
if (threadIdx.x < NUM_WORDS) {
lldst[threadIdx.x] = llsrc[threadIdx.x];
}
}
#endif // COMMON_KERNEL_H_

View File

@ -1,55 +0,0 @@
/*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COPY_KERNEL_H_
#define COPY_KERNEL_H_
#include "common_kernel.h"
template<typename T>
struct FuncPassA {
__device__ T operator()(const T x, const T y) const {
return x;
}
};
#ifdef CUDA_HAS_HALF
template <>
struct FuncPassA<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
return x;
}
__device__ half operator()(const half x, const half y) const {
return x;
}
};
#endif
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, typename T>
__device__ void Copy(volatile T * __restrict__ const dest,
const volatile T * __restrict__ const src, const int N) {
ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, false, false>(threadIdx.x,
dest, nullptr, src, nullptr, N);
}
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, typename T>
__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1,
const volatile T * __restrict__ const src, const int N) {
ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, true, false>(threadIdx.x,
dest0, dest1, src, nullptr, N);
}
#endif // COPY_KERNEL_H_

File diff suppressed because it is too large

View File

@ -1,162 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef CORE_H_
#define CORE_H_
#include "nccl.h"
#include <cstdio>
#include <cuda_runtime.h>
#define MAXRANKS 32
#define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21)
#define NCCL_MEM_PAD_ALIGN 65536
struct ncclMem {
union { // Pad this block so that devBuff is correctly aligned
struct {
int flags[2];
void* recvPtrs;
int opCounter; // Used to determine when remote Communicators are ready.
// Only used in host memory.
};
char pad[NCCL_MEM_PAD_ALIGN];
};
// devBuff will be bigger ; we only use its offset/address.
char buff[1];
};
template <typename T>
struct alignas(long long) DevRing {
volatile int* __restrict__ prevOpCounter;
volatile int* __restrict__ nextOpCounter;
volatile int* __restrict__ sendFlagToNext;
volatile int* __restrict__ sendFlagToPrev;
volatile int* __restrict__ recvFlagFromNext;
volatile int* __restrict__ recvFlagFromPrev;
T* volatile * __restrict__ recvPtrFromNext;
T* volatile * __restrict__ sendPtrToPrev;
T* __restrict__ recvBuffer;
T* __restrict__ sendBuffer;
int userRank[MAXRANKS];
};
struct NodeRef {
ncclMem* remote; // TODO: Verify if these
ncclMem* local; // are still needed.
enum {DEVICE, HOST} type;
ncclMem* devCleanup; // Used only when remote comm uses same process & GPU
ncclMem* hostCleanup; // Used whenever target is in different process
int* opCounter; // TODO: see if this can be removed too.
};
struct ncclComm {
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
// Device and Host allocated chunks. Stored here to correctly free() memory.
ncclMem* devMem;
ncclMem* hostMem;
int hostMemState;
int opSched; // Scheduling operation index
int* opCounter; // Counter of completed operations
cudaStream_t prevStream; // cache last used stream
cudaEvent_t doneEvent; // orders operations in different streams
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userFromRing;
// copy of the above stored on each device
int* devUserFromRing;
// Ring order
int* ncclFromRing; // TODO: REMOVE IF NOT NEEDED BEYOND CORE.CU
// Size of temp buffer in bytes.
size_t buffSize;
// Whether we have remote access to the recvbuff pointers passed from remote
// GPUs. In single process mode this can be used as long as QPI links are
// not present. In multi-process, we never push to a remote recvbuff.
int globalMemSpace;
// Device copy of the communicator
struct ncclComm *devComm; // TODO: Remove this if not useful
// Device-side ring view
DevRing<char>* devRing;
// Device-to-device communication structures to access remote or local device
// memory. Actual allocation larger than 1.
NodeRef ptrs[1];
};
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;
extern DebugLevel ncclDebugLevel;
#define WARN(...) do { \
if (ncclDebugLevel >= WARN) { \
printf("WARN %s:%d ", __FILE__, __LINE__); \
printf(__VA_ARGS__); \
printf("\n"); \
fflush(stdout); \
if (ncclDebugLevel >= ABORT) abort(); \
} \
} while(0)
#define INFO(...) do { \
if (ncclDebugLevel >= INFO) { \
printf("INFO "); printf(__VA_ARGS__); printf("\n"); \
fflush(stdout); \
} \
} while(0)
// Check CUDA calls
#define CUDACHECK(cmd, retcode) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'\n", cudaGetErrorString(e)); \
return retcode; \
} \
} while(false)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
return res; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((weak)) \
ret func(args)
#else
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
ret func(args)
#endif // end PROFAPI
#endif // end include guard


@@ -1,112 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef enqueue_h_
#define enqueue_h_
#include "core.h"
#include "reduce_kernel.h"
/* Synchronize previous collective (if in different stream) and enqueue
* collective. Work is performed asynchronously with the host thread.
* The ColFunc class should be templated on the datatype and reduction
* operator (if applicable) and define a static entry() method as
* follows.
* template <typename T, template <typename> class RedOp>
* class CollectiveFunctor {
* public:
* static ncclResult_t entry(const void* sendbuff, void* recvbuff, int count,
* int root, ncclComm* comm, cudaStream_t stream);
* };
* The entry() method can assume that the appropriate cuda device has been set. */
template< template<typename, template<typename> class> class ColFunc,
typename T,
template<typename> class Op >
ncclResult_t enqueue(const void* sendbuff,
void* recvbuff,
int count,
int root,
ncclComm_t comm,
cudaStream_t stream)
{
if (stream != comm->prevStream) { // sync required for calls in different streams
comm->prevStream = stream;
CUDACHECK(cudaStreamWaitEvent(stream, comm->doneEvent, 0), ncclUnhandledCudaError);
}
ncclResult_t ret;
ret = ColFunc<T, Op>::entry(sendbuff, recvbuff, count, root, comm, stream);
// Always have to record done event because we don't know what stream next
// collective will be in.
CUDACHECK(cudaEventRecord(comm->doneEvent, stream), ncclUnhandledCudaError);
comm->opSched += 1;
return ret;
}
// This version decodes type
template< template<typename, template<typename> class> class ColFunc,
template<typename> class Op >
ncclResult_t enqueue(const void* sendbuff,
void* recvbuff,
int count,
ncclDataType_t type,
int root,
ncclComm_t comm,
cudaStream_t stream)
{
switch(type) {
case ncclChar:
return enqueue<ColFunc, char, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclInt:
return enqueue<ColFunc, int, Op>(sendbuff, recvbuff, count, root, comm, stream);
#ifdef CUDA_HAS_HALF
case ncclHalf:
return enqueue<ColFunc, half, Op>(sendbuff, recvbuff, count, root, comm, stream);
#endif
case ncclFloat:
return enqueue<ColFunc, float, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclDouble:
return enqueue<ColFunc, double, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclInt64:
return enqueue<ColFunc, long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclUint64:
return enqueue<ColFunc, unsigned long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
default:
WARN("Invalid ncclType %d", type);
return ncclInvalidType;
}
}
// This version decodes both type and reduction op
template< template<typename, template<typename> class> class ColFunc>
ncclResult_t enqueue(const void* sendbuff,
void* recvbuff,
int count,
ncclDataType_t type,
ncclRedOp_t op,
int root,
ncclComm_t comm,
cudaStream_t stream)
{
switch(op) {
case ncclSum:
return enqueue<ColFunc, FuncSum>(sendbuff, recvbuff, count, type, root, comm, stream);
case ncclProd:
return enqueue<ColFunc, FuncProd>(sendbuff, recvbuff, count, type, root, comm, stream);
case ncclMax:
return enqueue<ColFunc, FuncMax>(sendbuff, recvbuff, count, type, root, comm, stream);
case ncclMin:
return enqueue<ColFunc, FuncMin>(sendbuff, recvbuff, count, type, root, comm, stream);
default:
WARN("Invalid ncclRedOp: %d", op);
return ncclInvalidOperation;
}
}
#endif // End include guard

18
src/include/bootstrap.h Normal file

@@ -0,0 +1,18 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_BOOTSTRAP_H_
#define NCCL_BOOTSTRAP_H_
#include "nccl.h"
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapRingExchange(void* commState, void* prevNextData, int prev, int next, int size);
ncclResult_t bootstrapClose(void* commState);
#endif
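
The bootstrap layer is only declared here; below is a hedged sketch of how an init path might drive it. The helper name exampleBootstrap, the 64-rank cap, and the assumption that bootstrapAllGather operates on nranks consecutive slots of 'size' bytes (each rank having filled its own slot beforehand) are illustrative, not taken from this header.

// Hypothetical usage sketch only; NCCLCHECK/CUDACHECK come from core.h.
#include "core.h"
#include "bootstrap.h"

static ncclResult_t exampleBootstrap(ncclUniqueId* id, int rank, int nranks) {
  void* state;
  NCCLCHECK(bootstrapInit(id, rank, nranks, &state));   // id produced earlier via bootstrapGetUniqueId on one rank

  int devs[64];                                         // assumes nranks <= 64 for this sketch
  CUDACHECK(cudaGetDevice(&devs[rank]));                // each rank fills its own slot...
  NCCLCHECK(bootstrapAllGather(state, devs, sizeof(int))); // ...and the all-gather fills the others (assumed layout)

  NCCLCHECK(bootstrapClose(state));
  return ncclSuccess;
}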

195
src/include/common_coll.h Normal file

@@ -0,0 +1,195 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_COLL_H_
#define COMMON_COLL_H_
#include "core.h"
#include "enqueue.h"
#include "collectives/collectives.h"
static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
if (err != cudaSuccess || attr.devicePointer == NULL) {
WARN("%s : %s is not a valid pointer", opname, ptrname);
return ncclInvalidArgument;
}
#if __CUDACC_VER_MAJOR__ >= 10
if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#else
if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#endif
WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
return ncclInvalidArgument;
}
return ncclSuccess;
}
static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
if (ptr == NULL) {
WARN("%s : %s argument is NULL", opname, ptrname);
return ncclInvalidArgument;
}
return ncclSuccess;
}
static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
NCCLCHECK(PtrCheck(comm, opname, "comm"));
// First, the easy ones
if (root < 0 || root >= comm->nRanks) {
WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
return ncclInvalidArgument;
}
if (type < 0 || type >= ncclNumTypes) {
WARN("%s : invalid type %d", opname, type);
return ncclInvalidArgument;
}
if (op < 0 || op >= ncclNumOps) {
WARN("%s : invalid reduction operation %d", opname, op);
return ncclInvalidArgument;
}
if (comm->checkPointers) {
// Check CUDA device pointers
if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
}
if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
}
}
return ncclSuccess;
}
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
// In : comm, nbytes ; Out : nrings, nthreads, ll
// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
// This ensures we don't use a large number of rings with a small number of threads
// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
// we use NCCL_THREAD_THRESHOLD when we reach the max
// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
*ll = 0;
int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
if (comm->llThreshold >= 0) { /* user sets total LL threshold */
if (nbytes > comm->llThreshold) { /* non-LL */
*nthreads = comm->nThreads+1;
*nrings = comm->nRings;
return;
} else {
llEnforced = 1; /* user wants to use LL */
}
}
int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
size_t nr;
int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
while (nt < ll_max_nthreads && *ll == 0) {
nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
if (nr <= maxRings) { /* avoid using few threads but many rings */
nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
*ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
}
if (*ll == 0) {
nt = nt << 1;
}
}
if (*ll == 1) {
*nthreads = nt;
*nrings = (int)nr;
return; /* we can use smaller number of threads to make LL work, stop here */
}
nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
*ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
*nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
*nrings = *ll ? (int)nr : comm->nRings;
}
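
A minimal, self-contained sketch of the thread-doubling search above follows. The constants kMaxRings and kPerThread are hypothetical stand-ins for the comm-dependent limits (comm->nRings, NCCL_LL_RING_THRESHOLD), and the final comparison against comm->threadThreshold is omitted for brevity; this is an illustration, not the shipped heuristic.

#include <cstdio>
#include <cstddef>

static void llHeuristicSketch(size_t nbytes, int nranks) {
  const int kMinThreads = 64, kMaxThreads = 256;  // NCCL_LL_MIN/MAX_NTHREADS
  const size_t kPerThread = 8;                    // per-thread LL threshold (NCCL_LL_RING_THRESHOLD)
  const size_t kMaxRings = 4;                     // hypothetical ring budget
  for (int nt = kMinThreads; nt <= kMaxThreads; nt <<= 1) {
    size_t nr = (nbytes + kPerThread*nt*nranks - 1) / (kPerThread*nt*nranks);  // DIVUP
    if (nr == 0) nr = 1;
    if (nr <= kMaxRings) {                        // size fits LL at this thread count
      printf("%zu bytes -> LL, %zu ring(s), %d thread(s)\n", nbytes, nr, nt);
      return;
    }
  }
  printf("%zu bytes -> regular (non-LL) protocol\n", nbytes);
}

With these stand-in values, 64 KiB across 8 ranks would need 16 rings at 64 threads and 8 at 128, but only 4 at 256, so the sketch settles on LL with 4 rings and 256 threads, mirroring the "double the threads before adding rings" intent of the comment above.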
static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {
int llMode, nBlocks, nThreads;
ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
if (comm->userStreamSet == false) {
comm->userStream = stream;
comm->userStreamSet = true;
} else if (stream != comm->userStream) {
WARN("Error : mixing different streams within a group call is not supported.");
return ncclInvalidUsage;
}
int lastChunkSize = 0;
if (llMode == 1) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
}
for (int bid=0; bid<nBlocks; bid++) {
struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
if (ring->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
comm->myParams->gridDim.x++;
int opIndex = ring->collFifoTail;
struct ncclColl* c = ring->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (activePtr[0] != 0) sched_yield();
struct CollectiveArgs* args = &c->args;
args->root = root;
args->N = count;
args->ThisInput = sendbuff;
args->ThisOutput = recvbuff;
args->comm = comm->devComm;
args->opCount = comm->opCount;
args->bid = bid;
args->nRings = nBlocks;
args->nThreads = nThreads;
args->lastChunkSize = lastChunkSize;
c->nThreads = nThreads;
c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
ring->collFifoTail = opIndex;
ring->collCount++;
}
/*if (llMode == 0)*/ comm->opCount++;
return ncclSuccess;
}
extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
#endif

385
src/include/core.h Normal file

@@ -0,0 +1,385 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
#include "nccl.h"
#include "transport.h"
#include "debug.h"
#include <cstdio>
#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#if __CUDACC_VER_MAJOR__ < 9
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXRINGS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
// Rings / LL tuning
#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 32 // Per thread size before we switch to non-LL
#define NCCL_LL_MAX_NTHREADS 256
#define NCCL_LL_MIN_NTHREADS 64
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
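
As a hedged illustration of the comment above, the device functions below (hypothetical names, not part of this header) show why each 32-bit datum is packed next to its flag: a reader that spins on both flags can never observe a flag without its data, provided 8-byte stores are performed atomically and the platform is little-endian, which are the assumptions this sketch makes.

// Sketch only: relies on the ncclLLFifoLine definition above and little-endian layout.
__device__ void llWriteLine(union ncclLLFifoLine* line, uint32_t d1, uint32_t d2, uint32_t flag) {
  volatile uint64_t* p = line->v;
  p[0] = (uint64_t)d1 | ((uint64_t)flag << 32);   // data1+flag1 in a single 8-byte store
  p[1] = (uint64_t)d2 | ((uint64_t)flag << 32);   // data2+flag2 in a single 8-byte store
}

__device__ void llReadLine(union ncclLLFifoLine* line, uint32_t flag, uint32_t* d1, uint32_t* d2) {
  volatile uint64_t* p = line->v;
  uint64_t v0, v1;
  do {                                            // spin until both halves carry the expected flag
    v0 = p[0]; v1 = p[1];
  } while ((uint32_t)(v0 >> 32) != flag || (uint32_t)(v1 >> 32) != flag);
  *d1 = (uint32_t)v0;
  *d2 = (uint32_t)v1;
}

The flag would typically carry a step or sequence number (compare the llStep field below), so stale lines left over from a previous slice are skipped rather than re-read.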
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCount; // Local for recv, remote for send
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
// Low latency mechanism
char *llBuff; // Local for recv, remote for send
uint64_t *llHead; // Local for send, remote for recv
int *llFifo; // LL Size fifo for proxy
uint64_t llStep; // Keep where we are
uint64_t llLastCleaning;
};
struct ncclConnector {
struct transportProxyInfo* proxyInfo;
struct ncclTransport* transport;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
};
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define SIZES_FIFO_SIZE 32
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
#define NCCL_LL_CHUNKS 8
#define NUM_LINES_PER_THREAD 2
#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 64K
#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t llHead;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[SIZES_FIFO_SIZE];
int llSizesFifo[SIZES_FIFO_SIZE];
};
char pad5[MEM_ALIGN];
};
char llBuff[NCCL_LL_BUFF_SIZE];
char buff[1]; // Actually larger than that
};
struct ncclRing {
union {
struct {
int id;
int nthreads;
// Per ring resources
struct ncclSendMem* devMemSend; // CUDA-side resources
struct ncclRecvMem* devMemRecv; // CUDA-side resources
int buffSize;
int devMemSendSize; // Keep the size for IPCs
int devMemRecvSize; // Keep the size for IPCs
struct ncclConnector send;
struct ncclConnector recv;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nRings;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t nThreads;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclComm {
struct ncclRing rings[MAXRINGS];
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Rings for collectives
int nRings;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
// Device copy of the communicator
struct ncclComm *devComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
};
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int ret = -1; \
while (ret == -1) { \
SYSCHECKVAL(call, name, ret); \
if (ret == -1) { \
INFO(ALL,"Got %s, retrying", strerror(errno)); \
}\
} \
} while (0);
#define SYSCHECKVAL(call, name, retval) do { \
retval = call; \
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define SYSCHECKNTIMES(call, name, times, usec, exptype) do { \
int ret = -1; \
int count = 0; \
while (ret == -1 && count < times) { \
SYSCHECKVALEXP(call, name, ret, exptype); \
count++; \
if (ret == -1) { \
usleep(usec); \
}\
} \
if (ret == -1) { \
WARN("Call to " name " timeout : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define SYSCHECKVALEXP(call, name, retval, exptype) do { \
retval = call; \
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && errno != exptype) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
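
To make the intended call pattern concrete, here is a hypothetical helper (not part of this header) built on the macros above; it assumes <fcntl.h> for open() and, for brevity, lets error paths skip close().

static ncclResult_t exampleReadFile(const char* path, char* buf, size_t len) {
  int fd, n;
  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);  // captures the fd, returns ncclSystemError on hard failure
  SYSCHECKVAL(read(fd, buf, len), "read", n);     // same pattern for the byte count
  SYSCHECK(close(fd), "close");                   // retrying variant when no value is needed
  return ncclSuccess;
}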
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((weak)) \
ret func(args)
#else
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
ret func(args)
#endif // end PROFAPI
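
The expansion is easiest to see on a concrete entry point. ncclFoo below is hypothetical and used only to illustrate how a PROFAPI build gets a strong pnccl* alias next to a weak, interposable public symbol.

NCCL_API(ncclResult_t, ncclFoo, ncclComm_t comm);
ncclResult_t ncclFoo(ncclComm_t comm) {
  (void)comm;
  return ncclSuccess;
}
// With PROFAPI defined, the NCCL_API line expands (roughly) to:
//   __attribute__((visibility("default"))) __attribute__((alias("ncclFoo")))
//     ncclResult_t pncclFoo(ncclComm_t comm);
//   extern "C" __attribute__((visibility("default"))) __attribute__((weak))
//     ncclResult_t ncclFoo(ncclComm_t comm);
// so a profiling shim can override the weak ncclFoo and still reach the library through pncclFoo.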
int ncclCudaCompCap();
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
return ncclSuccess;
}
#endif // end include guard
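
A short, hypothetical helper showing how the allocation wrappers and NCCLCHECKGOTO above are meant to compose; the function name and the cleanup policy are illustrative only.

static ncclResult_t exampleMirrorToDevice(const int* src, int** devDst, size_t nelem) {
  ncclResult_t res = ncclSuccess;
  int* host = NULL;
  *devDst = NULL;
  NCCLCHECKGOTO(ncclCalloc(&host, nelem), res, cleanup);          // zeroed host staging buffer
  memcpy(host, src, nelem*sizeof(int));
  NCCLCHECKGOTO(ncclCudaCalloc(devDst, nelem), res, cleanup);     // zeroed device buffer
  NCCLCHECKGOTO(ncclCudaMemcpy(*devDst, host, nelem), res, cleanup);
  free(host);
  return ncclSuccess;
cleanup:
  free(host);                     // free(NULL) is a no-op
  if (*devDst) cudaFree(*devDst);
  *devDst = NULL;
  return res;
}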

179
src/include/debug.h Normal file

@@ -0,0 +1,179 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#include <pthread.h>
#include <stdio.h>
#include <chrono>
#include <unistd.h>
#include <sys/syscall.h>
#include <limits.h>
#include <string.h>
#include "nccl.h"
#define gettid() (pid_t) syscall(SYS_gettid)
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
extern DebugLevel ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
#define WARN(...) do { \
if (ncclDebugLevel >= WARN) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
fprintf(ncclDebugFile,__VA_ARGS__); \
fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
} \
} while(0)
#define INFO(FLAGS, ...) do { \
if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
} \
} while(0)
#ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) do { \
if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
} \
} while(0)
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#else
#define TRACE(...)
#endif
#include <stdlib.h>
static inline void initDebug() {
const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) {
ncclDebugLevel = NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
ncclDebugLevel = WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
ncclDebugLevel = INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
ncclDebugLevel = ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
ncclDebugLevel = TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
if (nccl_debug_subsys != NULL) {
char *subsys = strtok(nccl_debug_subsys, ",");
while (subsys != NULL) {
int invert = 0;
uint64_t mask = 0;
if (subsys[0] == '^') { invert = 1; subsys++; }
if (strcasecmp(subsys, "INIT") == 0) {
mask = INIT;
} else if (strcasecmp(subsys, "COLL") == 0) {
mask = COLL;
} else if (strcasecmp(subsys, "P2P") == 0) {
mask = P2P;
} else if (strcasecmp(subsys, "SHM") == 0) {
mask = SHM;
} else if (strcasecmp(subsys, "NET") == 0) {
mask = NET;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = ALL;
}
if (mask) {
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
}
subsys = strtok(NULL, ",");
}
}
/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
int c = 0;
char debug_fn[PATH_MAX+1] = "";
char *dfn = debug_fn;
while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
if (nccl_debug_file[c++] != '%') {
*dfn++ = nccl_debug_file[c-1];
continue;
}
switch (nccl_debug_file[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024);
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = nccl_debug_file[c-1];
break;
}
}
*dfn = '\0';
if (debug_fn[0] != '\0') {
FILE *file = fopen(debug_fn, "w");
if (file != NULL) {
INFO(ALL,"DEBUG file is '%s'", debug_fn);
ncclDebugFile = file;
}
}
}
pthread_mutex_init(&ncclDebugOutputLock, NULL);
#ifdef ENABLE_TRACE
ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
}
#endif
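
Typical use, as a hedged sketch: the level comes from NCCL_DEBUG, the subsystem mask from NCCL_DEBUG_SUBSYS, and NCCL_DEBUG_FILE may embed %h/%p as parsed above, e.g. NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,NET NCCL_DEBUG_FILE=nccl.%h.%p.log. The helper below is hypothetical.

static void exampleLogging(int dev) {
  initDebug();                                   // parses NCCL_DEBUG, NCCL_DEBUG_SUBSYS, NCCL_DEBUG_FILE
  INFO(INIT|NET, "using device %d", dev);        // printed only if INIT or NET is in the mask
  if (dev < 0) WARN("invalid device %d", dev);   // printed whenever the level is WARN or higher
}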

26
src/include/enqueue.h Normal file

@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
#include "core.h"
#include "group.h"
typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
ncclResult_t ncclBarrierEnqueue(ncclComm_t comm);
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm);
ncclResult_t ncclEnqueueEvents(ncclComm_t comm);
#endif // End include guard
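
As a sketch of how a collective entry point is expected to route through ncclEnqueueCheck, the pair below uses a hypothetical collective "ExampleColl"; the real work of building a ncclColl entry (see saveKernel in common_coll.h) is elided, and the division of labor described in the comments is an assumption based on the declarations above.

static ncclResult_t ncclExampleCollImpl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  // ...would compute nbytes and hand the operation to saveKernel() here...
  return ncclSuccess;
}

ncclResult_t ncclExampleColl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  // ncclEnqueueCheck is the common entry path; argument checks and group handling
  // are assumed to live there before the impl callback runs.
  return ncclEnqueueCheck(ncclExampleCollImpl, "ExampleColl", sendbuff, recvbuff,
      count, type, op, root, comm, stream);
}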

24
src/include/group.h Normal file

@@ -0,0 +1,24 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GROUP_H_
#define NCCL_GROUP_H_
#include "nccl.h"
#include "core.h"
bool ncclAsyncMode();
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAsyncColl(ncclComm_t comm);
#endif

1109
src/include/ibvwrap.h Normal file

File diff suppressed because it is too large

64
src/include/nccl_net.h Normal file

@@ -0,0 +1,64 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NET_H_
#define NCCL_NET_H_
#include "nccl.h"
#define NCCL_NET_MAJOR 1
#define NCCL_NET_MINOR 0
#define NCCL_NET_HANDLE_MAXSIZE 64
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_MAX_SCORE 0x7
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Return the number of network devices along with their scores relative to the
// current CUDA device. The per device score should be a value from 1-7 with a
// higher score representing a better choice for performance.
// This call should allocate the 'scores' array using malloc(3), and it
// will then be freed automatically by NCCL.
ncclResult_t (*devices)(int* ndev, int** scores);
// Return whether this device supports host pointers and/or CUDA pointers
// as data from the current GPU. Supported types should be composed with
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size);
// Test whether a request is complete and return the size received (can be less than requested).
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_t;
extern
#ifdef __cplusplus
"C"
#endif
ncclNet_t* ncclNet;
#endif // end include guard
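
A skeleton of what an implementation of this interface could look like follows. All names here (exampleNet, exampleDevices, examplePtrSupport) are hypothetical; only the ncclNet_t layout and the NCCL_PTR_*/NCCL_MAX_SCORE constants come from the header above. Only the first two callbacks are filled in; the rest are left NULL and would have to be implemented before the struct could actually be used.

#include <stdlib.h>
#include "nccl_net.h"

static ncclResult_t exampleDevices(int* ndev, int** scores) {
  *ndev = 1;
  *scores = (int*)malloc(sizeof(int));   // freed by NCCL, per the contract above
  if (*scores == NULL) return ncclSystemError;
  (*scores)[0] = NCCL_MAX_SCORE;
  return ncclSuccess;
}
static ncclResult_t examplePtrSupport(int dev, int* supportedTypes) {
  *supportedTypes = NCCL_PTR_HOST;       // host memory only in this sketch
  return ncclSuccess;
}
// listen/connect/accept/isend/irecv/flush/test/close* would follow the same pattern.
ncclNet_t exampleNet = {
  "Example",
  exampleDevices,
  examplePtrSupport,
  /* listen */ NULL, /* connect */ NULL, /* accept */ NULL,
  /* isend */ NULL, /* irecv */ NULL, /* flush */ NULL,
  /* test */ NULL, /* closeSend */ NULL, /* closeRecv */ NULL, /* closeListen */ NULL,
};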

40
src/include/net.h Normal file

@@ -0,0 +1,40 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INT_NET_H_
#define NCCL_INT_NET_H_
#include "nccl.h"
#include "nccl_net.h"
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
/* Socket Interface Selection type */
typedef enum { findSubnetIf = -1,
dontCareIf = -2
} ncclSocketIfSl_t;
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
extern bool ncclIbSupport();
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
#endif

155
src/include/nvlink.h Normal file

@@ -0,0 +1,155 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVLINK_H_
#define NCCL_NVLINK_H_
#include <sys/stat.h>
#include <fcntl.h>
#include "nvmlwrap.h"
#include "topo.h"
#define CONNECT_NVLINK 0x10
#define CONNECT_NVSWITCH 0x100
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
};
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
char* rPath = realpath(classPath, NULL);
int fd;
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
free(rPath);
char pciClass[9];
strncpy(pciClass, "0x000000", 9);
int len;
SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
SYSCHECK(close(fd), "close");
if (strcmp(pciClass, "0x068000") == 0) {
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
*type = ncclNvLinkDeviceSwitch;
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
*type = ncclNvLinkDeviceGpu;
} else {
// Ignore if we don't know what's on the other side.
return ncclSystemError;
}
return ncclSuccess;
}
/* Get the maximum number of NVLinks based on the GPU generation */
static ncclResult_t getMaxNvlinks(int* maxLinks) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
int ccMajor;
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
// 6 for Volta, 4 for Pascal
*maxLinks = (ccMajor > 6) ? 6 : 4;
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
return ncclSuccess;
}
static int getNvlinkGpu(const char* busId1, const char* busId2) {
// Determine if that connection is through NVLink
int links = 0;
int nvswitch_links = 0;
int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
nvmlDevice_t nvmlDev;
ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
if (res != ncclSuccess) return 0;
for(int l=0; l<maxNvLinks; ++l) {
// nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
// report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
// don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
// the POWER CPU case, so it seems best to check this as well.
unsigned canP2P;
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
// nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
// if the links don't exist, or are disabled. So checking for that return
// here would probably make the nvmlDeviceGetNvLinkCapability check above
// redundant. Presumably, we still need to check the P2P capability above,
// since even non-GPUs would possess PCI info.
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
// Old versions of NVML return a lowercase PCI ID
char* p = remoteProc.busId;
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
if (p[c] == 0) break;
p[c] = toupper(p[c]);
}
if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
links++;
} else {
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
char* p = remoteProc.busId;
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
if (p[c] == 0) break;
lowerId[c] = tolower(p[c]);
}
// Determine if the remote side is NVswitch
enum ncclNvLinkDeviceType type;
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
//TODO: we are making an assumption that all GPUs are connected to this switch
//This assumption may change for future architectures
nvswitch_links++;
}
}
}
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
}
static int getNumNvlinks(const char* busId) {
nvmlDevice_t nvmlDev;
ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
if (res != ncclSuccess) return 0;
int nvlinks = 0, nvswitch_links = 0;
int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
for(int l=0; l<maxNvLinks; ++l) {
unsigned canP2P;
nvmlEnableState_t isActive;
if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
nvlinks++;
} else {
continue;
}
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
char* p = remoteProc.busId;
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
if (p[c] == 0) break;
lowerId[c] = tolower(p[c]);
}
// Determine if the remote side is NVswitch
enum ncclNvLinkDeviceType type;
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
//TODO: we are making an assumption that all GPUs are connected to this switch
//This assumption may change for future architectures
nvswitch_links++;
}
}
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
}
#endif

149
src/include/nvmlwrap.h Normal file

@@ -0,0 +1,149 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVMLWRAP_H_
#define NCCL_NVMLWRAP_H_
#include "core.h"
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
#include "nvml.h"
#define NVMLCHECK(cmd) do { \
nvmlReturn_t e = cmd; \
if( e != NVML_SUCCESS ) { \
WARN("NVML failure '%s'", nvmlErrorString(e)); \
return ncclSystemError; \
} \
} while(false)
static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult) {
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
#else
// Dynamically handle dependencies on NVML
/* Extracted from nvml.h */
typedef struct nvmlDevice_st* nvmlDevice_t;
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
typedef enum nvmlEnableState_enum
{
NVML_FEATURE_DISABLED = 0, //!< Feature disabled
NVML_FEATURE_ENABLED = 1 //!< Feature enabled
} nvmlEnableState_t;
typedef enum nvmlNvLinkCapability_enum
{
NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported
NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
// should be last
NVML_NVLINK_CAP_COUNT
} nvmlNvLinkCapability_t;
typedef enum nvmlReturn_enum
{
NVML_SUCCESS = 0, //!< The operation was successful
NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
} nvmlReturn_t;
typedef struct nvmlPciInfo_st
{
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff
unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
unsigned int device; //!< The device's id on the bus, 0 to 31
unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
// Added in NVML 2.285 API
unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
// NVIDIA reserved for internal use only
unsigned int reserved0;
unsigned int reserved1;
unsigned int reserved2;
unsigned int reserved3;
} nvmlPciInfo_t;
/* End of nvml.h */
ncclResult_t wrapNvmlSymbols(void);
ncclResult_t wrapNvmlInit(void);
ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
#endif // NVML_DIRECT
#endif // End include guard

81
src/include/param.h Normal file

@@ -0,0 +1,81 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PARAM_H_
#define NCCL_PARAM_H_
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <pwd.h>
static const char* userHomeDir() {
struct passwd *pwUser = getpwuid(getuid());
return pwUser == NULL ? NULL : pwUser->pw_dir;
}
static void setEnvFile(const char* fileName) {
FILE * file = fopen(fileName, "r");
if (file == NULL) return;
char *line = NULL;
char envVar[1024];
char envValue[1024];
size_t n = 0;
ssize_t read;
while ((read = getline(&line, &n, file)) != -1) {
if (line[read-1] == '\n') line[read-1] = '\0';
int s=0; // Env Var Size
while (line[s] != '\0' && line[s] != '=') s++;
if (line[s] == '\0') continue;
strncpy(envVar, line, std::min(1024,s));
envVar[s] = '\0';
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);
}
static void initEnv() {
char confFilePath[1024];
const char * userDir = userHomeDir();
if (userDir) {
sprintf(confFilePath, "%s/.nccl.conf", userDir);
setEnvFile(confFilePath);
}
sprintf(confFilePath, "/etc/nccl.conf");
setEnvFile(confFilePath);
}
#define NCCL_PARAM(name, env, default_value) \
pthread_mutex_t ncclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
int64_t ncclParam##name() { \
static_assert(default_value != -1LL, "default value cannot be -1"); \
static int64_t value = -1LL; \
pthread_mutex_lock(&ncclParamMutex##name); \
if (value == -1LL) { \
value = default_value; \
char* str = getenv("NCCL_" env); \
if (str && strlen(str) > 0) { \
errno = 0; \
int64_t v = strtoll(str, NULL, 0); \
if (errno) { \
INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
} else { \
value = v; \
INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value); \
} \
} \
} \
pthread_mutex_unlock(&ncclParamMutex##name); \
return value; \
}
#endif
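
A hypothetical parameter definition makes the contract concrete; the name and default below are not taken from the code, and core.h is included first because the macro relies on INFO plus the pthread/errno headers it pulls in.

#include <stdint.h>
#include "core.h"
#include "param.h"

NCCL_PARAM(ExampleChunkSize, "EXAMPLE_CHUNK_SIZE", 1 << 20);  // reads NCCL_EXAMPLE_CHUNK_SIZE

void exampleUseParam() {
  int64_t chunk = ncclParamExampleChunkSize();  // first call parses the env var; later calls return the cached value
  (void)chunk;
}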

14
src/include/ring.h Normal file

@@ -0,0 +1,14 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_RING_H_
#define NCCL_RING_H_
#include "core.h"
ncclResult_t initRing(struct ncclComm* comm, int ringid);
ncclResult_t freeRing(struct ncclRing* ring);
#endif

17
src/include/rings.h Normal file

@@ -0,0 +1,17 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_RINGS_H_
#define NCCL_RINGS_H_
static int getDefaultThreads() {
// On Kepler, rings are doubled later.
return ncclCudaCompCap() == 3 ? 128 : 256;
}
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
#endif

76
src/include/shm.h Normal file

@@ -0,0 +1,76 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SHM_H_
#define NCCL_SHM_H_
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
*shmPtr = NULL;
int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1) {
WARN("shm_open failed to open %s : %s", shmname, strerror(errno));
return ncclSystemError;
}
if (create) {
int res = posix_fallocate(fd, 0, shmsize);
if (res != 0) {
WARN("Unable to allocate shared memory (%d bytes) : %s", shmsize, strerror(res));
shm_unlink(shmname);
close(fd);
return ncclSystemError;
}
}
void *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
close(fd);
if (ptr == MAP_FAILED) {
WARN("failure in mmap of %s (size %d) : %s", shmname, shmsize, strerror(errno));
shm_unlink(shmname);
return ncclSystemError;
}
if (create) {
memset(ptr, 0, shmsize);
}
cudaError_t e;
if ((e=cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped)) != cudaSuccess) {
WARN("failed to register host buffer %p : %s", ptr, cudaGetErrorString(e));
if (create) shm_unlink(shmname);
munmap(ptr, shmsize);
return ncclUnhandledCudaError;
}
if ((e=cudaHostGetDevicePointer(devShmPtr, ptr, 0)) != cudaSuccess) {
WARN("failed to get device pointer for local shmem %p : %s", ptr, cudaGetErrorString(e));
if (create) shm_unlink(shmname);
munmap(ptr, shmsize);
return ncclUnhandledCudaError;
}
*shmPtr = ptr;
return ncclSuccess;
}
static ncclResult_t shmUnlink(const char* shmname) {
if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
return ncclSuccess;
}
static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
CUDACHECK(cudaHostUnregister(shmPtr));
if (munmap(shmPtr, shmsize) != 0) {
WARN("munmap of shared memory failed");
return ncclSystemError;
}
return ncclSuccess;
}
#endif
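
A hedged usage sketch: two processes agree on a segment name (the one below is hypothetical), the creator opens with create=1 and the peer with create=0, and both unmap with shmClose. The sketch assumes the peer only calls shmOpen after the creator has, and that core.h (for NCCLCHECK) is included alongside this header.

static ncclResult_t exampleShmPair(int isCreator) {
  const char* name = "/nccl-example-shm";     // hypothetical name
  const int size = 1 << 20;
  void *hostPtr, *devPtr;
  NCCLCHECK(shmOpen(name, size, &hostPtr, &devPtr, isCreator));
  // ... exchange data through hostPtr (CPU side) or devPtr (mapped GPU pointer) ...
  NCCLCHECK(shmClose(hostPtr, devPtr, size));
  if (isCreator) NCCLCHECK(shmUnlink(name));  // remove the name once both sides are done
  return ncclSuccess;
}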

401
src/include/socket.h Normal file

@@ -0,0 +1,401 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SOCKET_H_
#define NCCL_SOCKET_H_
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <unistd.h>
#include <netdb.h>
#include <ifaddrs.h>
#include <net/if.h>
#include "utils.h"
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // sleep interval in usec
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
/* Format a string representation of a (struct sockaddr *) socket address using getnameinfo()
*
* Output: "IPv4/IPv6 address<port>"
*/
static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
if (buf == NULL || saddr == NULL) return NULL;
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
char host[NI_MAXHOST], service[NI_MAXSERV];
(void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
sprintf(buf, "%s<%s>", host, service);
return buf;
}
/* Allow the user to force the IPv4/IPv6 interface selection */
static inline int envSocketFamily(void) {
int family = -1; // Family selection is not forced, will use first one found
char* env = getenv("NCCL_SOCKET_FAMILY");
if (env == NULL)
return family;
if (strcmp(env, "AF_INET") == 0)
family = AF_INET; // IPv4
else if (strcmp(env, "AF_INET6") == 0)
family = AF_INET6; // IPv6
return family;
}
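/* Enumerate network interfaces matching a prefix list (a leading '^' negates the list),
 * optionally restricted to one address family. IPv6 loopback interfaces are skipped, and
 * an interface that shows up multiple times (e.g. with both an IPv4 and an IPv6 address)
 * is only reported once. Returns the number of interfaces stored in names/addrs. */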
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
char line[1024];
struct netIf userIfs[maxIfs];
bool searchNot = prefixList && prefixList[0] == '^';
int nUserIfs = parseStringList(prefixList, userIfs, maxIfs);
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
if (interface->ifa_addr == NULL) continue;
/* We only support IPv4 & IPv6 */
int family = interface->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6)
continue;
TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
/* Allow the caller to force the socket family type */
if (sock_family != -1 && family != sock_family)
continue;
/* We also need to skip IPv6 loopback interfaces */
if (family == AF_INET6) {
struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
}
// check against user specified interfaces
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
continue;
}
// Check that this interface has not already been saved
// getifaddrs() normal order appears to be: IPv4, IPv6 Global, IPv6 Link
bool duplicate = false;
for (int i = 0; i < found; i++) {
if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
}
if (!duplicate) {
// Store the interface name
strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
// Store the IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(addrs+found, interface->ifa_addr, salen);
INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
found++;
}
}
freeifaddrs(interfaces);
return found;
}
static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
/* Check family first */
int family = local_if.ifa_addr->sa_family;
if (family != remote.sa.sa_family) {
return false;
}
if (family == AF_INET) {
struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
struct sockaddr_in& remote_addr = remote.sin;
struct in_addr local_subnet, remote_subnet;
local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
} else if (family == AF_INET6) {
struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
struct sockaddr_in6& remote_addr = remote.sin6;
struct in6_addr& local_in6 = local_addr->sin6_addr;
struct in6_addr& mask_in6 = mask->sin6_addr;
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
bool same = true;
int len = 16; //IPv6 address is 16 unsigned char
for (int c = 0; c < len; c++) { //Network byte order is big-endian
char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
if (c1 ^ c2) {
same = false;
break;
}
}
// Finally, compare the scope id: two link-local addresses can have the same subnet
// address even though they are not in the same scope. For global addresses this field
// is 0, so the comparison is a no-op.
same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
return same;
} else {
WARN("Net : Unsupported address family type");
return false;
}
}
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
char line[1024], line_a[1024];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
if (interface->ifa_addr == NULL) continue;
/* We only support IPv4 & IPv6 */
int family = interface->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6)
continue;
// check against user specified interfaces
if (!matchSubnet(*interface, remoteAddr)) {
continue;
}
// Store the local IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(localAddrs+found, interface->ifa_addr, salen);
// Store the interface name
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
found++;
if (found == maxIfs) break;
}
if (found == 0) {
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
}
freeifaddrs(interfaces);
return found;
}
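/* Parse an <address>:<port> string (as passed through NCCL_COMM_ID) into a socketAddress.
 * Accepted forms: "<ipv4_or_hostname>:<port>", "[<ipv6>]:<port>" and, for link-local
 * addresses, "[<ipv6>%<ifname>]:<port>", e.g. "192.168.0.1:12345" or "[fe80::1%eth0]:12345". */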
static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
WARN("Net : string is null");
return ncclInvalidArgument;
}
bool ipv6 = ip_port_pair[0] == '[';
/* Construct the sockaddress structure */
if (!ipv6) {
struct netIf ni;
// parse <ip_or_hostname>:<port> string, expect one pair
if (parseStringList(ip_port_pair, &ni, 1) != 1) {
WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
return ncclInvalidArgument;
}
struct addrinfo hints, *p;
int rv;
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
return ncclInvalidArgument;
}
// use the first
if (p->ai_family == AF_INET) {
struct sockaddr_in& sin = ua->sin;
memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
sin.sin_family = AF_INET; // IPv4
//inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
sin.sin_port = htons(ni.port); // port
} else if (p->ai_family == AF_INET6) {
struct sockaddr_in6& sin6 = ua->sin6;
memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
sin6.sin6_family = AF_INET6; // IPv6
sin6.sin6_port = htons(ni.port); // port
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
sin6.sin6_scope_id = 0; // should be global scope, set to 0
} else {
WARN("Net : unsupported IP family");
return ncclInvalidArgument;
}
freeaddrinfo(p); // all done with this structure
} else {
int i, j = -1, len = strlen(ip_port_pair);
for (i = 1; i < len; i++) {
if (ip_port_pair[i] == '%') j = i;
if (ip_port_pair[i] == ']') break;
}
if (i == len) {
WARN("Net : No valid [IPv6]:port pair found");
return ncclInvalidArgument;
}
bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
memset(ip_str, '\0', sizeof(ip_str));
memset(port_str, '\0', sizeof(port_str));
memset(if_name, '\0', sizeof(if_name));
strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
strncpy(port_str, ip_port_pair+i+2, len-i-1);
int port = atoi(port_str);
if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
struct sockaddr_in6& sin6 = ua->sin6;
sin6.sin6_family = AF_INET6; // IPv6
inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
sin6.sin6_port = htons(port); // port
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
}
return ncclSuccess;
}
static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
int nIfs = 0;
// Allow user to force the INET socket family selection
int sock_family = envSocketFamily();
// User specified interface
char* env = getenv("NCCL_SOCKET_IFNAME");
if (env && strlen(env) > 1) {
// Specified by user : find or fail
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
} else {
// Try to automatically pick the right one
// Start with IB
nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// else see if we can get some hint from COMM ID
if (nIfs == 0) {
char* commId = getenv("NCCL_COMM_ID");
if (commId && strlen(commId) > 1) {
// Try to find interface that is in the same subnet as the IP in comm id
union socketAddress idAddr;
GetSocketAddrFromString(&idAddr, commId);
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
}
}
// Then look for anything else (but not docker or lo)
if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// Finally look for docker, then lo.
if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
}
return nIfs;
}
static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
/* IPv4/IPv6 support */
int family = localAddr->sa.sa_family;
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
/* Create socket and bind it to a port */
int sockfd = socket(family, SOCK_STREAM, 0);
if (sockfd == -1) {
WARN("Net : Socket creation failed : %s", strerror(errno));
return ncclSystemError;
}
int opt = 1;
SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
// localAddr port should be 0 (Any port)
SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
/* Get the assigned Port */
socklen_t size = salen;
SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
#ifdef ENABLE_TRACE
char line[1024];
TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
/* Put the socket in listen mode */
SYSCHECK(listen(sockfd, 128), "listen");
*fd = sockfd;
return ncclSuccess;
}
static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
/* IPv4/IPv6 support */
int family = remoteAddr->sa.sa_family;
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
/* Connect to a hostname / port */
*fd = socket(family, SOCK_STREAM, 0);
if (*fd == -1) {
WARN("Net : Socket creation failed : %s", strerror(errno));
return ncclSystemError;
}
const int one = 1;
SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
/* const int bufsize = 128*1024;
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
#ifdef ENABLE_TRACE
char line[1024];
TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
#endif
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
return ncclSuccess;
}
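// Blocking helpers: loop until exactly 'size' bytes have been received/sent on the socket,
// retrying on transient errors. A zero-byte recv() means the peer closed the connection.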
static ncclResult_t socketReceive(int fd, void* ptr, int size) {
char* data = (char*)ptr;
int offset = 0;
while (offset < size) {
int recvsize;
SYSCHECKVAL(recv(fd, data, size-offset, 0), "recv", recvsize);
if (recvsize == 0) {
WARN("Net : Connection closed by remote peer");
return ncclSystemError;
}
if (recvsize == -1) {
INFO(NET,"Recv : got retcode %d, retrying", errno);
continue;
}
data += recvsize;
offset += recvsize;
}
return ncclSuccess;
}
static ncclResult_t socketSend(int fd, void* ptr, int size) {
char* data = (char*)ptr;
int offset = 0;
while (offset < size) {
int sendsize;
SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
if (sendsize == -1) {
INFO(NET,"Send : got retcode %d, retrying", errno);
continue;
}
data += sendsize;
offset += sendsize;
}
return ncclSuccess;
}
#endif

src/include/topo.h (new file, 83 lines)
@@ -0,0 +1,83 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TOPO_H_
#define NCCL_TOPO_H_
#include "nccl.h"
#include <ctype.h>
#define MAXPATHSIZE 1024
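// Resolve the sysfs path of the PCI device backing a CUDA device by following
// /sys/class/pci_bus/<bus>/device and appending the device's bus id. The caller owns the
// returned string (allocated by realpath()).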
static ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[16];
CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/device";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
char* cudaRpath = realpath(busPath, NULL);
char pathname[MAXPATHSIZE];
strncpy(pathname, cudaRpath, MAXPATHSIZE);
strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
free(cudaRpath);
*path = realpath(pathname, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", pathname);
return ncclSystemError;
}
return ncclSuccess;
}
static ncclResult_t getMlxPath(char* ibName, char** path) {
char devicepath[MAXPATHSIZE];
snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
*path = realpath(devicepath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", devicepath);
return ncclSystemError;
}
return ncclSuccess;
}
static ncclResult_t getSockPath(char* ifName, char** path) {
char devicepath[MAXPATHSIZE];
snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
*path = realpath(devicepath, NULL);
if (*path == NULL) {
INFO(NET|INIT, "Could not find real path of %s", devicepath);
return ncclSystemError;
}
return ncclSuccess;
}
enum ncclIbPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_SOC = 3
};
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
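// Rough PCI distance between two sysfs device paths: count how many leading path
// components the two paths share. A very short common prefix (score 3, i.e. only
// /sys/devices) maps to SOC, one more shared level to PHB, sharing everything up to the
// leaf to PIX, and anything in between to PXB.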
static int pciDistance(char* path1, char* path2) {
int score = 0;
int depth = 0;
int same = 1;
for (int i=0; i<strlen(path1); i++) {
if (path1[i] != path2[i]) same = 0;
if (path1[i] == '/') {
depth++;
if (same == 1) score++;
}
}
if (score == 3) return PATH_SOC;
if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX;
return PATH_PXB;
}
#endif

src/include/transport.h (new file, 113 lines)
@@ -0,0 +1,113 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TRANSPORT_H_
#define NCCL_TRANSPORT_H_
#include "nccl.h"
#include <stdint.h>
#define NTRANSPORTS 3
extern struct ncclTransport ncclTransports[];
// Forward declarations
struct ncclRing;
struct ncclConnector;
struct ncclComm;
#define RANK_INFO_SIZE 64
typedef char ncclTinfo_t[RANK_INFO_SIZE];
struct ncclInfo {
ncclTinfo_t tinfo[NTRANSPORTS];
};
// Used to hold the transport connection values
typedef int64_t ncclTvalue_t;
#define CONNECT_SIZE 128
struct ncclConnect {
char data[CONNECT_SIZE];
};
struct ncclProxyArgs {
struct ncclRing* ring;
int substeps;
int nsteps;
uint64_t opCount;
int llMode;
bool needProxy;
int active; // add new fields before this line -- 'active' is intentionally left out during initialization
};
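// Each transport (e.g. P2P, shared memory, network) provides a send side and a receive
// side, each with its own setup/connect/free/proxy implementation, plus global hooks used
// during initialization: fillInfo (describe this rank), canConnect (score a peer pair) and
// getRings (propose ring orderings for this transport).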
struct ncclTransportComm {
ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
};
struct ncclTransport {
const char name[4];
ncclResult_t (*fillInfo)(ncclTinfo_t*, int);
ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
};
#include <pthread.h>
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
struct transportProxyInfo {
struct ncclComm* comm;
pthread_t thread;
threadFunc_t func;
volatile int proxyReady;
struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
volatile uint64_t argsFifoHead;
volatile uint64_t argsFifoTail;
pthread_cond_t cond;
pthread_mutex_t mutex;
};
ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
enum proxyMode {
proxyRing = 0,
proxyFrom = 1,
proxyTo = 2
};
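// The proxy pattern is packed into a single int: 0 means a symmetric ring pattern,
// 1+root encodes traffic flowing from 'root' (broadcast-like), and -1-root encodes traffic
// flowing to 'root' (reduce-like). proxyPatternMode()/proxyPatternRoot() decode it.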
static int proxyPatternRing = proxyRing;
static inline int proxyPatternFrom(int root) { return 1+root; }
static inline int proxyPatternTo(int root) { return -1-root; }
static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
ncclResult_t transportStartProxies(struct ncclComm* comm);
#include <unistd.h>
// Spin wait until func evaluates to true
template<typename FUNC>
inline void transportProxyWait(const FUNC& func) {
while (!func()) {
sched_yield();
}
}
inline void transportProxyIdle(int idle) {
sched_yield();
}
#endif

src/include/utils.h (new file, 25 lines)
@@ -0,0 +1,25 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_
#include "nccl.h"
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen);
uint64_t getHostHash();
uint64_t getPidHash();
struct netIf {
char prefix[64];
int port;
};
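// Parse a comma-separated list of "<prefix>[:<port>]" entries (the syntax accepted by
// NCCL_SOCKET_IFNAME and similar variables) into netIf structures; matchIfList() then
// checks a name/port pair against such a list.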
int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
#endif

src/init.cu (new file, 815 lines)
@@ -0,0 +1,815 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "ring.h"
#include "param.h"
#include "nvmlwrap.h"
#include "rings.h"
#include "bootstrap.h"
#include "transport.h"
#include "common_coll.h"
#include "group.h"
#include "utils.h"
#include "net.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
DebugLevel ncclDebugLevel;
uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
pthread_mutex_t ncclDebugOutputLock;
FILE *ncclDebugFile = stdout;
#ifdef ENABLE_TRACE
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2 and 10.x do not need an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0/9.1 need an internal CUDA stream
#endif
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
extern "C" __attribute__ ((visibility("default")))
ncclNet_t* ncclNet = NULL;
// We define this as weak to let tests redefine their own
#pragma weak ncclCudaCompCap
int ncclCudaCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
int ccMajor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
return ccMajor;
}
int ncclCudaFullCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
int ccMajor, ccMinor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
return ccMajor*10+ccMinor;
}
void initNet() {
if (ncclNet != NULL) {
INFO(INIT,"Using external Network %s", ncclNetName());
} else {
ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
INFO(INIT,"Using internal Network %s", ncclNetName());
}
}
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", NCCL_THREAD_THRESHOLD);
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static ncclResult_t ncclInit() {
if (initialized) return ncclSuccess;
pthread_mutex_lock(&initLock);
if (!initialized) {
initEnv();
initDebug();
initNet();
initialized = true;
}
pthread_mutex_unlock(&initLock);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
ncclResult_t ncclGetVersion(int* version) {
if (version == NULL) return ncclInvalidArgument;
*version = NCCL_VERSION_CODE;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
NCCLCHECK(ncclInit());
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
return bootstrapGetUniqueId(out);
}
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
CUDACHECK(cudaFree(comm->devComm));
for (int ring=0; ring<comm->nRings; ring++)
NCCLCHECK(freeRing(comm->rings+ring));
if (comm->doneEvent != NULL)
CUDACHECK(cudaEventDestroy(comm->doneEvent));
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(cudaStreamDestroy(comm->groupStream));
}
// Last rank frees shared resources between threads
int isLast;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
if (isLast) {
free(comm->intraBarrier);
free(comm->intraParams);
free(comm->intraCudaDevs);
free(comm->intraCGMode);
free(comm->intraCC);
}
free(comm);
return ncclSuccess;
}
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
if (ndev < 1) {
WARN("invalid device count (%d) requested", ndev);
return ncclInvalidArgument;
}
if (rank >= ndev || rank < 0) {
WARN("rank %d exceeds ndev=%d", rank, ndev);
return ncclInvalidArgument;
}
// Try to create a CUDA object right away. If there is something wrong with
// the device we're on (failure cause #1), better to know it early.
cudaEvent_t doneEvent;
CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
comm->rank = rank;
comm->nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
comm->threadThreshold = ncclParamThreadThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to override the default setting in older CUDA builds
comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif
comm->argsptr = &comm->args;
*comret = comm;
return ncclSuccess;
}
static ncclResult_t devCommSetup(ncclComm_t comm) {
// Fully duplicate the comm on the device
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
// Copy the comm on the device
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
// Copy userRanks
for (int r=0; r<comm->nRings; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
}
return ncclSuccess;
}
// Prebuild the version string so that running "strings" on the library quickly reveals the version.
#define STR2(v) #v
#define STR(v) STR2(v)
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
static void showVersion() {
static int shown = 0;
if (shown == 0 && ncclDebugLevel >= VERSION) {
printf("%s\n", VERSION_STRING);
fflush(stdout);
if (ncclDebugFile != stdout)
INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
shown = 1;
}
}
static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
for (int t=0; t<NTRANSPORTS; t++) {
NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank));
}
return ncclSuccess;
}
template <int type>
static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
for (int t=0; t<NTRANSPORTS; t++) {
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
ncclTvalue_t ret = 0;
NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
if (ret > 0) {
NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
*transportRet = transport;
return ncclSuccess;
}
}
WARN("No transport found !");
*transportRet = NULL;
return ncclInternalError;
}
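// Set up one ring: rotate ringRanks so that the local rank comes first, then pick a
// transport for the receive side (towards the previous rank) and the send side (towards
// the next rank), and create the associated proxies.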
static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
NCCLCHECK(initRing(comm, ringid));
struct ncclRing* ring = comm->rings+ringid;
// Reorganize ranks to start with rank.
int shift;
for (shift = 0; shift<nranks; shift++) {
if (ringRanks[shift] == rank) {
break;
}
}
for (int i=0; i<nranks; i++) {
ring->userRanks[i] = ringRanks[(i+shift)%nranks];
}
int prev = ring->userRanks[nranks-1];
int next = ring->userRanks[1];
NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
NCCLCHECK(transportCreateProxy(0, ring, comm));
NCCLCHECK(transportCreateProxy(1, ring, comm));
return ncclSuccess;
}
static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
for (int r=0; r<nranks; r++) {
connectTransport[r] = -1;
for (int t=0; t<NTRANSPORTS; t++) {
NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
if (connectValue[r] > 0) {
connectTransport[r] = t;
break;
}
}
}
return ncclSuccess;
}
static void swap(void* mem1, void* mem2, int size) {
char tmp[size];
memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
}
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+4*MAXWIDTH)
void dumpMatrix(int* connectMatrix, int nranks) {
char line[STRLENGTH+1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
INFO(INIT,"%s", line);
for (int i=0; i<nranks; i++) {
memset(line, ' ', STRLENGTH);
sprintf(line, "%3d ", i);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
INFO(INIT,"%s", line);
}
}
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH+1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
INFO(INIT,"%s", line);
}
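// Materialize each ring from the prev/next tables: starting from 'rank', follow next[]
// nranks times, then verify that the walk loops back to the start and that every rank
// appears in the ring.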
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for (int r=0; r<nrings; r++) {
char prefix[30];
/*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
dumpLine(next+r*nranks, nranks, prefix);*/
int current = rank;
for (int i=0; i<nranks; i++) {
rings[r*nranks+i] = current;
current = next[r*nranks+current];
}
sprintf(prefix, "Ring %02d : ", r);
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
if (current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
return ncclInternalError;
}
// Check that all ranks are there
for (int i=0; i<nranks; i++) {
int found = 0;
for (int j=0; j<nranks; j++) {
if (rings[r*nranks+j] == i) {
found = 1;
break;
}
}
if (found == 0) {
WARN("Error : ring %d does not contain rank %d", r, i);
return ncclInternalError;
}
}
}
return ncclSuccess;
}
void* waitForNonNullPtr(void* p) {
volatile void** ptr = (volatile void**) p;
while (*ptr == NULL) sched_yield();
return (void*)*ptr;
}
ncclResult_t initParams(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
params->args = &comm->argsptr;
params->stream = NULL;
params->sharedMem = 0;
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
return ncclSuccess;
}
// Allocate/Set Intra Process Structures and set CG options
ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
comm->intraRank = rank;
comm->intraRanks = ranks;
comm->intraPhase = 0;
// Alloc shared structures
if (rank == 0) {
assert(comm == comm0);
int* bar;
NCCLCHECK(ncclCalloc(&bar, 2));
bar[0] = bar[1] = 0;
comm->intraBarrier = bar;
NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
int* CGMode;
NCCLCHECK(ncclCalloc(&CGMode, 1));
*CGMode = 0x11;
comm->intraCGMode = CGMode;
int* CC;
NCCLCHECK(ncclCalloc(&CC, 1));
*CC = ncclCudaFullCompCap();
comm->intraCC = CC;
} else {
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
}
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
NCCLCHECK(initParams(comm));
int cgMdLaunch = 0;
// Set CG Mode
comm->launchMode = ncclComm::GROUP;
char* str = getenv("NCCL_LAUNCH_MODE");
if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
comm->launchMode = ncclComm::PARALLEL;
}
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
#if __CUDACC_VER_MAJOR__ >= 9
if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
// Check whether the GPU supports Cooperative Group Multi Device Launch
(void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
}
#endif
}
// Disable cgMdLaunch if any rank does not support it
if (cgMdLaunch == 0) {
*comm->intraCGMode = 0x10;
}
return ncclSuccess;
}
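// Multi-process initialization path: bootstrap an out-of-band network from the unique id,
// all-gather the per-rank transport information, compute which transport can connect each
// pair of ranks, derive the rings (agreeing on a common ring count and thread count across
// ranks), connect prev/next for every ring, and finally group the ranks sharing a process
// to set up the intra-process launch structures.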
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
int rank = comm->rank;
int nranks = comm->nRanks;
void* commState;
NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
struct ncclInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
NCCLCHECK(fillInfo(allInfo+rank, rank));
NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
//if (rank == 0) dumpMatrix(connectTransport, nranks);
//if (rank == 0) dumpMatrix(connectValue, nranks);
// Get my rings
int nrings;
int* prev, *next;
NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
comm->nThreads = getDefaultThreads();
NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
free(connectTransport);
free(connectValue);
// Find max nThreads
int allData[nranks];
allData[rank] = comm->nThreads;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
comm->nThreads = std::max(allData[i], comm->nThreads);
if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
// Determine the minimum CUDA Compute capability of all GPUs
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
allData[rank] = myCompCap;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
minCompCap = std::min(allData[i], minCompCap);
if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
// Find min nrings across ranks
allData[rank] = nrings;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
nrings = std::min(allData[i], nrings);
// Exchange data with others to build complete rings
comm->nRings = nrings;
for (int r=0; r<nrings; r++) {
NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
}
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
free(prev);
free(next);
// Connect with prev/next for each ring
for (int r=0; r<nrings; r++) {
int* ringRanks = rings+r*nranks;
struct ncclRing *ring = comm->rings+r;
struct ncclConnect connect[2];
NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connect));
NCCLCHECK(bootstrapRingExchange(commState, connect, ring->userRanks[nranks-1], ring->userRanks[1], sizeof(struct ncclConnect)));
NCCLCHECK(ring->send.transport->send.connect(connect+1, &ring->send));
NCCLCHECK(ring->recv.transport->recv.connect(connect+0, &ring->recv));
}
free(rings);
free(allInfo);
// Intra-process barrier setup
struct rankInfo {
uint64_t hostHash;
uint64_t pidHash;
struct ncclComm* comm;
} rankInfos[nranks];
rankInfos[rank].hostHash = getHostHash();
rankInfos[rank].pidHash = getPidHash();
rankInfos[rank].comm = comm;
NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
// Compute intra ranks
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int r=0; r<nranks; r++) {
if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
(rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
if (intraRanks == 0) intraRank0 = r;
if (r == rank) intraRank = intraRanks;
intraRanks++;
}
}
TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
// Barrier
bootstrapClose(commState);
return ncclSuccess;
}
bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
WARN("Failed to set CPU affinity");
return false;
}
return true;
}
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
cpu_set_t affinitySave;
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
// Make sure all host memory allocations are close to the GPU
int cudaDev;
nvmlDevice_t nvmlDevice;
CUDACHECK(cudaGetDevice(&cudaDev));
SetCpuAffinity(cudaDev, &nvmlDevice);
ncclResult_t res;
NCCLCHECKGOTO(commAlloc(newcomm, ndev, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
return ncclSuccess;
cleanup:
*newcomm = NULL;
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
char* env = getenv("NCCL_COMM_ID");
if (env && myrank == 0) {
NCCLCHECK(bootstrapCreateRoot(&commId, true));
}
NCCLCHECK(ncclInit());
if (myrank == 0) showVersion();
INFO(INIT,"rank %d nranks %d", myrank, nranks);
// Make sure the CUDA runtime is initialized.
CUDACHECK(cudaFree(NULL));
NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
WARN("Invalid rank requested : %d/%d", myrank, nranks);
return ncclInvalidArgument;
}
if (ncclAsyncMode()) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
} else {
return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
}
}
static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
struct ncclInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
NCCLCHECK(fillInfo(allInfo+rank, rank));
}
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
for (int rank=0; rank<nranks; rank++)
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
int* prev, *prevFinal, *next, *nextFinal;
NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
int nrings = MAXRINGS;
int nthreads=0;
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
int nringsRank;
int nthreadsRank = getDefaultThreads();
myCompCap = ncclCudaCompCap();
NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
nrings = std::min(nrings, nringsRank);
nthreads = std::max(nthreads, nthreadsRank);
minCompCap = std::min(minCompCap, myCompCap);
for (int ring=0; ring<nrings; ring++) {
int index = ring*nranks+rank;
prevFinal[index] = prev[index];
nextFinal[index] = next[index];
}
}
free(connectTransport);
free(connectValue);
free(prev);
free(next);
INFO(INIT,"Using %d threads", nthreads);
INFO(INIT,"Min Comp Cap %d", minCompCap);
int* rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
free(prevFinal);
free(nextFinal);
for (int rank=0; rank<nranks; rank++) {
comms[rank]->nRings = nrings;
comms[rank]->nThreads = nthreads;
}
for (int r=0; r<nrings; r++) {
struct ncclConnect connect[2*nranks];
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
}
// RingExchange connect information
for (int rank=0; rank<nranks; rank++) {
// Swap rank->prev and prevRank->next
struct ncclRing *ring = comms[rank]->rings+r;
int prevRank = ring->userRanks[nranks-1];
struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
struct ncclConnect* rankPrevConnect = connect+2*rank;
swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
}
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
struct ncclRing *ring = comms[rank]->rings+r;
NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
}
}
free(rings);
free(allInfo);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECK(ncclInit());
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
showVersion();
INFO(INIT,"nranks %d", ndev);
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
if (ndev < 1) {
WARN("Invalid device count requested : %d", ndev);
return ncclInvalidArgument;
}
ncclResult_t res;
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
nvmlDevice_t nvmlDevice;
int ncclDevList[ndev];
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
}
cudaGetDevice(&savedDevice);
for(rank=0; rank<ndev; ++rank)
comms[rank] = NULL;
cpu_set_t affinitySave;
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
for (rank=0; rank<ndev; ++rank) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
SetCpuAffinity(cudaDev, &nvmlDevice);
NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
comms[rank] = comm;
NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
}
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
for(rank=0; rank<ndev; ++rank) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
}
res = ncclSuccess;
goto final;
cleanup:
for(rank=0; rank<ndev; ++rank) {
if(comms[rank] != NULL) {
commFree(comms[rank]);
}
}
final:
if(wrapNvmlShutdown() != ncclSuccess)
INFO(INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice);
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
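/* Illustrative usage sketch of the single-process entry points above, assuming nccl.h is
 * included and four CUDA devices exist (the device list below is only a placeholder):
 *
 *   ncclComm_t comms[4];
 *   int devs[4] = {0, 1, 2, 3};
 *   if (ncclCommInitAll(comms, 4, devs) != ncclSuccess) exit(1);
 *   // ... enqueue collectives on each comm/stream ...
 *   for (int i = 0; i < 4; i++) ncclCommDestroy(comms[i]);
 */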
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
int savedDevice;
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
}
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
return ncclSuccess;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
switch (code) {
case ncclSuccess : return "no error";
case ncclUnhandledCudaError : return "unhandled cuda error";
case ncclSystemError : return "unhandled system error";
case ncclInternalError : return "internal error";
case ncclInvalidArgument : return "invalid argument";
case ncclInvalidUsage : return "invalid usage";
default : return "unknown result code";
}
}
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
*count = comm->nRanks;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
*devid = comm->cudaDev;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
*rank = comm->rank;
return ncclSuccess;
}

(deleted file, 155 lines)
@@ -1,155 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "libwrap.h"
#include <dlfcn.h>
#include "core.h"
int symbolsLoaded = 0;
static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
ncclResult_t wrapSymbols(void) {
if (symbolsLoaded)
return ncclSuccess;
static void* nvmlhandle = NULL;
void* tmp;
void** cast;
nvmlhandle=dlopen("libnvidia-ml.so", RTLD_NOW);
if (!nvmlhandle) {
nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
if (!nvmlhandle) {
WARN("Failed to open libnvidia-ml.so[.1]");
goto teardown;
}
}
#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlsym(handle, symbol); \
if (tmp == NULL) { \
WARN("dlsym failed on %s - %s", symbol, dlerror());\
goto teardown; \
} \
*cast = tmp; \
} while (0)
LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
symbolsLoaded = 1;
return ncclSuccess;
teardown:
nvmlInternalInit = NULL;
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
nvmlInternalDeviceSetCpuAffinity = NULL;
nvmlInternalDeviceClearCpuAffinity = NULL;
if (nvmlhandle != NULL) dlclose(nvmlhandle);
return ncclSystemError;
}
ncclResult_t wrapNvmlInit(void) {
if (nvmlInternalInit == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalInit();
if (ret != NVML_SUCCESS) {
WARN("nvmlInit() failed: %s",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlShutdown(void) {
if (nvmlInternalShutdown == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalShutdown();
if (ret != NVML_SUCCESS) {
WARN("nvmlShutdown() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
if (nvmlInternalDeviceGetIndex == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetIndex() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
if (nvmlInternalDeviceSetCpuAffinity == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
if (nvmlInternalInit == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}

Some files were not shown because too many files have changed in this diff.