Add support for inter-node communication using sockets and InfiniBand/RoCE.
Improve latency.
Add support for aggregation.
Improve LL/regular tuning.
Remove tests as those are now at github.com/nvidia/nccl-tests.
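To put the inter-node item in context: with socket and InfiniBand/RoCE transports, a single communicator can span processes running on different nodes, created from a unique id shared out-of-band. The sketch below is illustrative only and not part of this commit; it assumes MPI is available for bootstrapping and that each process sees exactly one GPU.

```c
/* Illustrative only (not part of this commit): one process per GPU, ranks
 * possibly spread across several nodes. MPI is used solely to broadcast the
 * NCCL unique id; the collective itself runs over NCCL's own transports. */
#include <mpi.h>
#include <nccl.h>
#include <cuda_runtime.h>

int main(int argc, char* argv[]) {
  int rank, nranks;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);                  /* rank 0 creates the id */
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  cudaSetDevice(0);                                     /* assume one visible GPU per process */
  size_t count = 1 << 20;
  float* buf;
  cudaMalloc((void**)&buf, count * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, rank);            /* connects ranks, intra- and inter-node */

  ncclAllReduce(buf, buf, count, ncclFloat, ncclSum, comm, stream);  /* in-place all-reduce */
  cudaStreamSynchronize(stream);

  ncclCommDestroy(comm);
  cudaFree(buf);
  MPI_Finalize();
  return 0;
}
```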
Sylvain Jeaugey 2018-09-24 16:06:59 -07:00
parent 286916a1a3
commit f93fe9bfd9
132 changed files with 12424 additions and 9415 deletions

2
.gitignore vendored

@@ -1,2 +1,4 @@
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
/build
*.gcov
/coverage/


@@ -1,5 +1,5 @@
Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions

250
Makefile

@@ -1,236 +1,30 @@
#
# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
# See LICENSE.txt for license information
#
.PHONY : all clean
CUDA_HOME ?= /usr/local/cuda
PREFIX ?= /usr/local
VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
PROFAPI ?= 0
BUILDDIR ?= build
BUILDDIR := $(abspath $(BUILDDIR))
default : src.build
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg
clean: ${TARGETS:%=%.clean}
test.build: src.build
LICENSE_FILES := LICENSE.txt
LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
lic: $(LICENSE_TARGETS)
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
NVCC ?= $(CUDA_HOME)/bin/nvcc
${BUILDDIR}/%.txt: %.txt
@printf "Copying %-35s > %s\n" $< $@
mkdir -p ${BUILDDIR}
cp $< $@
NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60\
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_60,code=compute_60
src.%:
${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
# Use addprefix so that we can specify more than one path
LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS += -O3
else
NVCUFLAGS += -O0 -G
CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
CXXFLAGS += -Wall -Wextra
else
.SILENT:
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif
NCCL_MAJOR := 1
NCCL_MINOR := 3
NCCL_PATCH := 5
CXXFLAGS += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
.PHONY : all lib staticlib clean test mpitest install deb debian debclean forlib fortest forclean
.DEFAULT : all
INCEXPORTS := nccl.h
LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
LIBNAME := libnccl.so
STATICLIBNAME := libnccl_static.a
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
STATICLIBTARGET := $(STATICLIBNAME)
LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME))
LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
all : lib staticlib
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
staticlib : $(INCTARGETS) $(LIBDIR)/$(STATICLIBTARGET)
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
ar cr $@ $(LIBOBJ)
$(INCDIR)/%.h : src/%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : src/%.cu
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@rm -f $(@:%.o=%.d.tmp)
clean :
rm -rf $(BUILDDIR)
install : lib
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
#### TESTS ####
TEST_ONLY ?= 0
# Tests depend on lib, except in TEST_ONLY mode.
ifeq ($(TEST_ONLY), 0)
TSTDEP = $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
endif
NCCL_LIB ?= $(LIBDIR)
NCCL_INC ?= $(INCDIR)
MPI_HOME ?= /usr
MPI_INC ?= $(MPI_HOME)/include
MPI_LIB ?= $(MPI_HOME)/lib
MPIFLAGS := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
TESTS := all_gather_test all_gather_scan \
all_reduce_test all_reduce_scan \
broadcast_test broadcast_scan \
reduce_test reduce_scan \
reduce_scatter_test reduce_scatter_scan
MPITESTS := mpi_test
TSTINC := -I$(NCCL_INC) -Itest/include
TSTLIB := -L$(NCCL_LIB) $(LIBLINK) $(LDFLAGS)
TSTDIR := $(BUILDDIR)/test/single
MPITSTDIR := $(BUILDDIR)/test/mpi
TESTBINS := $(patsubst %, $(TSTDIR)/%, $(TESTS))
MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
test : $(TESTBINS)
$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(TSTDIR)
$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@rm -f $(@:%=%.d.tmp)
mpitest : $(MPITESTBINS)
$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(MPITSTDIR)
$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@rm -f $(@:%=%.d.tmp)
#### PACKAGING ####
DEBIANDIR := $(BUILDDIR)/debian
DEBGEN_IN := $(shell (cd debian ; ls *.in))
DEBGEN := $(DEBGEN_IN:.in=)
DEBFILES := compat copyright libnccl-dev.install libnccl-dev.manpages nccl.7 rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBIANDIR)/%, $(DEBFILES))
DEB_REVISION ?= 1
DEB_TIMESTAMP := $(shell date -R)
DEB_ARCH ?= amd64
debian : $(DEBTARGETS)
deb : lib debian
@printf "Building Debian package\n"
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
mkdir -p $(BUILDDIR)/deb/
mv $(BUILDDIR)/../libnccl*.deb $(BUILDDIR)/deb/
debclean :
rm -Rf $(DEBIANDIR)
$(DEBIANDIR)/% : debian/%.in
@printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${deb:Revision}/$(DEB_REVISION)/g" \
-e "s/\$${deb:Timestamp}/$(DEB_TIMESTAMP)/g" \
-e "s/\$${deb:Arch}/$(DEB_ARCH)/g" \
$< > $@
$(DEBIANDIR)/% : debian/%
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBIANDIR)
cp -f $< $@
#### FORTRAN BINDINGS ####
export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
forlib : lib
$(MAKE) -C fortran lib
fortest : forlib
$(MAKE) -C fortran test
forclean :
$(MAKE) -C fortran clean
pkg.%:
${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
pkg.debian.prep: lic
pkg.txz.prep: lic

144
README.md

@@ -1,128 +1,84 @@
**IMPORTANT NOTE**
**NCCL1 is no longer maintained/updated and has been replaced by NCCL2, available at**
**http://developer.nvidia.com/nccl.**
# NCCL
Optimized primitives for collective multi-GPU communication.
## Introduction
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
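As a quick orientation (an illustrative sketch, not an excerpt from the documentation linked above), a single process can drive all local GPUs by creating one communicator per device with `ncclCommInitAll` and grouping the per-device calls:

```c
/* Minimal single-process sketch (illustrative; error checking and cleanup of
 * host allocations omitted): one communicator per local GPU, one in-place
 * all-reduce per device. */
#include <nccl.h>
#include <cuda_runtime.h>
#include <stdlib.h>

int main(void) {
  int nGpus = 0;
  cudaGetDeviceCount(&nGpus);

  int* devs = malloc(sizeof(int) * nGpus);
  for (int i = 0; i < nGpus; i++) devs[i] = i;

  ncclComm_t* comms = malloc(sizeof(ncclComm_t) * nGpus);
  ncclCommInitAll(comms, nGpus, devs);            /* one communicator per device */

  size_t count = 1 << 20;
  float** buf = malloc(sizeof(float*) * nGpus);
  cudaStream_t* streams = malloc(sizeof(cudaStream_t) * nGpus);
  for (int i = 0; i < nGpus; i++) {
    cudaSetDevice(i);
    cudaMalloc((void**)&buf[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  /* Group the per-device calls: needed when a single thread launches
   * collectives on several devices. */
  ncclGroupStart();
  for (int i = 0; i < nGpus; i++)
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nGpus; i++) cudaStreamSynchronize(streams[i]);
  for (int i = 0; i < nGpus; i++) ncclCommDestroy(comms[i]);
  return 0;
}
```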
## What's inside
At present, the library implements the following collectives:
At present, the library implements the following collective operations:
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
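As a hedged illustration of the aggregation path (the helper name below is hypothetical), several small operations submitted on the same communicator between `ncclGroupStart` and `ncclGroupEnd` are issued together, amortizing launch latency:

```c
#include <nccl.h>
#include <cuda_runtime.h>

/* Hypothetical helper: submit several small all-reduces as one aggregated
 * NCCL launch instead of paying per-operation latency for each of them. */
static void allreduce_many(float** send, float** recv, const size_t* counts,
                           int ntensors, ncclComm_t comm, cudaStream_t stream) {
  ncclGroupStart();
  for (int i = 0; i < ntensors; i++)
    ncclAllReduce(send[i], recv[i], counts[i], ncclFloat, ncclSum, comm, stream);
  ncclGroupEnd();   /* all queued operations are issued here */
}
```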
## Requirements
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
## Build
## Build & run
To build the library and tests.
To build the library:
```shell
$ cd nccl
$ make CUDA_HOME=<cuda install path> test
$ make -j src.build
```
Test binaries are located in the subdirectories nccl/build/test/{single,mpi}.
If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with:
```shell
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./build/lib
$ ./build/test/single/all_reduce_test
Error: must specify at least data size in bytes!
Tests nccl AllReduce with user supplied arguments.
Usage: all_reduce_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
$ ./build/test/single/all_reduce_test 10000000
# Using devices
# Device 0 -> 0 [0x0a] GeForce GTX TITAN X
# Device 1 -> 1 [0x09] GeForce GTX TITAN X
# Device 2 -> 2 [0x06] GeForce GTX TITAN X
# Device 3 -> 3 [0x05] GeForce GTX TITAN X
# out-of-place in-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00
10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00
10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00
10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00
10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00
10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00
10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00
10000000 5000000 half sum 1.617 6.18 9.28 4e-03 1.636 6.11 9.17 4e-03
10000000 5000000 half prod 1.618 6.18 9.27 1e-03 1.657 6.03 9.05 1e-03
10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00
10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 float sum 1.618 6.18 9.27 5e-07 1.622 6.17 9.25 5e-07
10000000 2500000 float prod 1.614 6.20 9.29 1e-07 1.628 6.14 9.21 1e-07
10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00
10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double prod 1.619 6.18 9.26 2e-16 1.628 6.14 9.21 2e-16
10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00
10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00
$ make src.build CUDA_HOME=<path to cuda install>
```
To install, run `make PREFIX=<install dir> install` and add `<instal dir>/lib` to your `LD_LIBRARY_PATH`.
NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
## Usage
NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single-process machine, all GPUs can be conveniently initialized using `ncclCommInitAll`. For multi-process applications (e.g., with MPI), `ncclCommInitRank` must be called for each GPU. Internally `ncclCommInitRank` invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single-process example follows, for an MPI example see test/mpi/mpi_test.cu. For details about the API see nccl.h.
```c
#include <nccl.h>
typedef struct {
double* sendBuff;
double* recvBuff;
int size;
cudaStream_t stream;
} PerThreadData;
int main(int argc, char* argv[])
{
int nGPUs;
cudaGetDeviceCount(&nGPUs);
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nGPUs);
ncclCommInitAll(comms, nGPUs); // initialize communicator
// One communicator per process
PerThreadData* data;
... // Allocate data and issue work to each GPU's
// perDevStream to populate the sendBuffs.
for(int i=0; i<nGPUs; ++i) {
cudaSetDevice(i); // Correct device must be set
// prior to each collective call.
ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
ncclDouble, ncclSum, comms[i], data[i].stream);
}
... // Issue work into data[*].stream to consume buffers, etc.
}
By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform:
```shell
$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
```
## Copyright and License
## Install
NCCL is provided under the [BSD licence](LICENSE.txt). All source code and
accompanying documentation is copyright (c) 2015-2016, NVIDIA CORPORATION. All
rights reserved.
To install NCCL on the system, create a package, then install it as root.
Debian/Ubuntu:
```shell
$ make pkg.debian.build
$ ls build/pkg/deb/
```
RedHat/CentOS:
```shell
$ make pkg.redhat.build
$ ls build/pkg/rpm/
```
OS-agnostic tarball:
```shell
$ make pkg.txz.build
$ ls build/pkg/txz/
```
## Tests
Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
```shell
$ git clone https://github.com/NVIDIA/nccl-tests.git
$ cd nccl-tests
$ make
$ ./build/allreduce_perf -b 8 -e 256M -f 2 -g <ngpus>
```
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.

5
debian/changelog.in vendored

@@ -1,5 +0,0 @@
nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}-${deb:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
* Automatic Debian package from build
-- cudatools <cudatools@nvidia.com> ${deb:Timestamp}

1
debian/copyright vendored

@@ -1 +0,0 @@
../LICENSE.txt


@@ -1,2 +0,0 @@
include/nccl.h usr/include
lib/libnccl.so /usr/lib/x86_64-linux-gnu


@@ -1 +0,0 @@
debian/nccl.7


@@ -1,2 +0,0 @@
lib/libnccl.so.${nccl:Major} /usr/lib/x86_64-linux-gnu
lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/x86_64-linux-gnu

139
debian/nccl.7 vendored

@@ -1,139 +0,0 @@
.TH NCCL
.SH NAME
.PP
nccl \- Optimized primitives for collective multi\-GPU communication.
.SH Introduction
.PP
NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications.
.SH What's inside
.PP
At present, the library implements the following collectives:
\- all\-reduce
\- all\-gather
\- reduce\-scatter
\- reduce
\- broadcast
.PP
These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
.SH Requirements
.PP
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported.
.PP
Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
.SH Build & run
.PP
To build the library and tests.
.PP
.RS
.nf
$ cd nccl
$ make CUDA\_HOME=<cuda install path> test
.fi
.RE
.PP
Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest.
.PP
.RS
.nf
$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib
$ ./build/test/all\_reduce\_test
Error: must specify at least data size in bytes!
Tests nccl AllReduce with user supplied arguments.
Usage: all\_reduce\_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
$ ./build/test/all\_reduce\_test 10000000
# Using devices
# Device 0 \-> 0 [0x0a] GeForce GTX TITAN X
# Device 1 \-> 1 [0x09] GeForce GTX TITAN X
# Device 2 \-> 2 [0x06] GeForce GTX TITAN X
# Device 3 \-> 3 [0x05] GeForce GTX TITAN X
# out\-of\-place in\-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00
10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00
10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00
10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00
10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00
10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00
10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00
10000000 5000000 half sum 1.617 6.18 9.28 4e\-03 1.636 6.11 9.17 4e\-03
10000000 5000000 half prod 1.618 6.18 9.27 1e\-03 1.657 6.03 9.05 1e\-03
10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00
10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00
10000000 2500000 float sum 1.618 6.18 9.27 5e\-07 1.622 6.17 9.25 5e\-07
10000000 2500000 float prod 1.614 6.20 9.29 1e\-07 1.628 6.14 9.21 1e\-07
10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00
10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00
10000000 1250000 double prod 1.619 6.18 9.26 2e\-16 1.628 6.14 9.21 2e\-16
10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00
10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00
.fi
.RE
.PP
To install, run \fB\fCmake PREFIX=<install dir> install\fR and add \fB\fC<instal dir>/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR.
.SH Usage
.PP
NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally \fB\fCncclCommInitRank\fR invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single\-process example follows, for an MPI example see src/mpi\_test.cu. For details about the API see nccl.h.
.PP
.RS
.nf
#include <nccl.h>
typedef struct \{
double* sendBuff;
double* recvBuff;
int size;
cudaStream\_t stream;
\} PerThreadData;
int main(int argc, char* argv[])
\{
int nGPUs;
cudaGetDeviceCount(\&nGPUs);
ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs);
ncclCommInitAll(comms, nGPUs); // initialize communicator
// One communicator per process
PerThreadData* data;
... // Allocate data and issue work to each GPU's
// perDevStream to populate the sendBuffs.
for(int i=0; i<nGPUs; ++i) \{
cudaSetDevice(i); // Correct device must be set
// prior to each collective call.
ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
ncclDouble, ncclSum, comms[i], data[i].stream);
\}
... // Issue work into data[*].stream to consume buffers, etc.
\}
.fi
.RE
.SH Copyright
.PP
All source code and accompanying documentation is copyright (c) 2015\-2016, NVIDIA CORPORATION. All
rights reserved.


@@ -1 +0,0 @@
libcudart ${cuda:Major}.${cuda:Minor} cuda-cudart-${cuda:Major}-${cuda:Minor}


@@ -1,81 +0,0 @@
FC := gfortran
FCNAME := $(notdir $(FC))
BUILDDIR ?= ../build
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
LIBNAME := libncclfor.so
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME))
LIBCUDAFOR := libcudafor.so
ifneq ($(filter pgf%, $(FCNAME)), )
# PGI compiler (pgfortran, pgf90, pgf95)
FCMODFLAGS := -module $(INCDIR)
FCPREFLAGS := -Mpreprocess
FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
FCFLAGS := -fast -O3
else
# non-PGI compilers do not have CUDA support, compile our own CUDA lib
CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR)
CUDALINK := -L$(CUDA_LIB) -lcudart
CUDAFORLINK := -lcudafor
ifeq ($(FCNAME), gfortran)
FCMODFLAGS := -J$(INCDIR)
FCPREFLAGS += -cpp
FCFLAGS += -ffree-line-length-none
else ifeq ($(FCNAME), ifort)
FCMODFLAGS := -module $(INCDIR)
FCPREFLAGS += -fpp
endif
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
lib: $(CUDAFORDEP)
$(MAKE) $(LIBDIR)/$(LIBTARGET)
$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
@printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@
mkdir -p $(LIBDIR)
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
$(OBJDIR)/%.o: src/%.f90
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
mkdir -p $(INCDIR)
$(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
ifneq ($(filter pgf%, $(FCNAME)), )
TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
endif
TESTDIR := $(BUILDDIR)/test/fortran
TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
test: lib $(TESTBINS)
$(TESTDIR)/%: test/%.f90 lib
@printf "Building %-35s > %s\n" $< $@
@mkdir -p $(TESTDIR)
$(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
clean:
rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
rm -rf $(TESTDIR)/


@@ -1,171 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
#ifndef _CUDA
!Start cudaFor module
module cudaFor
use iso_c_binding
implicit none
private
public :: c_devptr
public :: cudaMemcpyKind, &
cudaMemcpyHostToHost, &
cudaMemcpyHostToDevice, &
cudaMemcpyDeviceToHost, &
cudaMemcpyDeviceToDevice, &
cudaMemcpyDefault
public :: cuda_stream_kind
public :: cudaGetDeviceCount
public :: cudaSetDevice
public :: cudaMalloc
public :: cudaMemcpy
public :: cudaFree
public :: cudaStreamCreate
public :: cudaStreamSynchronize
public :: cudaStreamDestroy
!Start types
!Start c_devptr
type, bind(c) :: c_devptr
type(c_ptr) :: member
end type c_devptr
!End c_devptr
!Start cudaMemcpyKind
type, bind(c) :: cudaMemcpyKind
integer(c_int) :: member
end type cudaMemcpyKind
type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), &
cudaMemcpyHostToDevice = cudaMemcpyKind(1), &
cudaMemcpyDeviceToHost = cudaMemcpyKind(2), &
cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
cudaMemcpyDefault = cudaMemcpyKind(4)
!End cudaMemcpyKind
!Start cuda_stream_kind
integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
!End cuda_stream_kind
!End types
!Start interfaces
!Start cudaGetDeviceCount
interface cudaGetDeviceCount
integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
import :: c_int
implicit none
integer(c_int) :: count
end function cudaGetDeviceCount
end interface cudaGetDeviceCount
!End cudaGetDeviceCount
!Start cudaSetDevice
interface cudaSetDevice
integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
import :: c_int
implicit none
integer(c_int), value :: device
end function cudaSetDevice
end interface cudaSetDevice
!End cudaSetDevice
!Start cudaMalloc
interface cudaMalloc
integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
import :: c_int, c_size_t
import :: c_devptr
implicit none
type(c_devptr) :: devPtr
integer(c_size_t), value :: size
end function cudaMalloc
end interface cudaMalloc
!End cudaMalloc
!Start cudaMemcpy
interface cudaMemcpy
!Start cudaMemcpyH2D
integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
import :: c_ptr, c_int, c_size_t
import :: c_devptr, cudaMemcpyKind
implicit none
type(c_devptr), value :: dst
type(c_ptr), value :: src
integer(c_size_t), value :: count
type(cudaMemcpyKind), value :: kind
end function cudaMemcpyH2D
!End cudaMemcpyH2D
!Start cudaMemcpyD2H
integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
import :: c_ptr, c_int, c_size_t
import :: c_devptr, cudaMemcpyKind
implicit none
type(c_ptr), value :: dst
type(c_devptr), value :: src
integer(c_size_t), value :: count
type(cudaMemcpyKind), value :: kind
end function cudaMemcpyD2H
!End cudaMemcpyD2H
end interface cudaMemcpy
!End cudaMemcpy
!Start cudaFree
interface cudaFree
integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
import :: c_int
import :: c_devptr
implicit none
type(c_devptr), value :: devPtr
end function cudaFree
end interface cudaFree
!End cudaFree
!Start cudaStreamCreate
interface cudaStreamCreate
integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind) :: pStream
end function cudaStreamCreate
end interface cudaStreamCreate
!End cudaStreamCreate
!Start cudaStreamSynchronize
interface cudaStreamSynchronize
integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind), value :: stream
end function cudaStreamSynchronize
end interface cudaStreamSynchronize
!End cudaStreamSynchronize
!Start cudaStreamDestroy
interface cudaStreamDestroy
integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind), value :: stream
end function cudaStreamDestroy
end interface cudaStreamDestroy
!End cudaStreamDestroy
!End interfaces
end module cudaFor
!End cudaFor module
#endif


@@ -1,312 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
!Start defines
#define NCCL_UNIQUE_ID_BYTES 128
!End defines
!Start ncclFor module
module ncclFor
use iso_c_binding
use cudaFor
implicit none
private
public :: ncclUniqueId
public :: ncclComm
public :: ncclResult, &
ncclSuccess, &
ncclUnhandledCudaError, &
ncclSystemError, &
ncclInternalError, &
ncclInvalidDevicePointer, &
ncclInvalidRank, &
ncclUnsupportedDeviceCount, &
ncclDeviceNotFound, &
ncclInvalidDeviceIndex, &
ncclLibWrapperNotSet, &
ncclCudaMallocFailed, &
ncclRankMismatch, &
ncclInvalidArgument, &
ncclInvalidType, &
ncclInvalidOperation, &
nccl_NUM_RESULTS
public :: ncclDataType, &
ncclChar, &
ncclInt, &
#ifdef CUDA_HAS_HALF
ncclHalf, &
#endif
ncclFloat, &
ncclDouble, &
ncclInt64, &
ncclUInt64, &
nccl_NUM_TYPES
public :: ncclRedOp, &
ncclSum, &
ncclProd, &
ncclMax, &
ncclMin, &
nccl_NUM_OPS
public :: ncclGetUniqueId
public :: ncclCommInitRank
public :: ncclCommInitAll
public :: ncclCommCuDevice
public :: ncclCommUserRank
public :: ncclCommCount
public :: ncclCommDestroy
public :: ncclReduce
public :: ncclAllReduce
public :: ncclReduceScatter
public :: ncclBcast
public :: ncclAllGather
!Start types
!Start ncclUniqueId
type, bind(c) :: ncclUniqueId
character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
end type ncclUniqueId
!End ncclUniqueId
!Start ncclComm
type, bind(c) :: ncclComm
type(c_ptr) :: member
end type ncclComm
!End ncclComm
!Start ncclResult
type, bind(c) :: ncclResult
integer(c_int) :: member
end type ncclResult
type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), &
ncclUnhandledCudaError = ncclResult( 1), &
ncclSystemError = ncclResult( 2), &
ncclInternalError = ncclResult( 3), &
ncclInvalidDevicePointer = ncclResult( 4), &
ncclInvalidRank = ncclResult( 5), &
ncclUnsupportedDeviceCount = ncclResult( 6), &
ncclDeviceNotFound = ncclResult( 7), &
ncclInvalidDeviceIndex = ncclResult( 8), &
ncclLibWrapperNotSet = ncclResult( 9), &
ncclCudaMallocFailed = ncclResult(10), &
ncclRankMismatch = ncclResult(11), &
ncclInvalidArgument = ncclResult(12), &
ncclInvalidType = ncclResult(13), &
ncclInvalidOperation = ncclResult(14), &
nccl_NUM_RESULTS = ncclResult(15)
!End ncclResult
!Start ncclDataType
type, bind(c) :: ncclDataType
integer(c_int) :: member
end type ncclDataType
type(ncclDataType), parameter :: ncclChar = ncclDataType(0), &
ncclInt = ncclDataType(1), &
#ifdef CUDA_HAS_HALF
ncclHalf = ncclDataType(2), &
#endif
ncclFloat = ncclDataType(3), &
ncclDouble = ncclDataType(4), &
ncclInt64 = ncclDataType(5), &
ncclUInt64 = ncclDataType(6), &
nccl_NUM_TYPES = ncclDataType(7)
!End ncclDataType
!Start ncclRedOp
type, bind(c) :: ncclRedOp
integer(c_int) :: member
end type ncclRedOp
type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), &
ncclProd = ncclRedOp(1), &
ncclMax = ncclRedOp(2), &
ncclMin = ncclRedOp(3), &
nccl_NUM_OPS = ncclRedOp(4)
!End ncclRedOp
!End types
!Start interfaces
!Start ncclGetUniqueId
interface ncclGetUniqueId
type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
import :: ncclResult, ncclUniqueId
implicit none
type(ncclUniqueId) :: uniqueId
end function ncclGetUniqueId
end interface ncclGetUniqueId
!End ncclGetUniqueId
!Start ncclCommInitRank
interface ncclCommInitRank
type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
import :: c_int
import :: ncclResult, ncclUniqueId, ncclComm
implicit none
type(ncclComm) :: comm(*)
integer(c_int), value :: ndev
type(ncclUniqueId), value :: commId
integer(c_int), value :: rank
end function ncclCommInitRank
end interface ncclCommInitRank
!End ncclCommInitRank
!Start ncclCommInitAll
interface ncclCommInitAll
type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm) :: comm(*)
integer(c_int), value :: ndev
integer(c_int) :: devlist(*)
end function ncclCommInitAll
end interface ncclCommInitAll
!End ncclCommInitAll
!Start ncclCommCuDevice
interface ncclCommCuDevice
type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: devid
end function ncclCommCuDevice
end interface ncclCommCuDevice
!End ncclCommCuDevice
!Start ncclCommUserRank
interface ncclCommUserRank
type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: rank
end function ncclCommUserRank
end interface ncclCommUserRank
!End ncclCommUserRank
!Start ncclCommCount
interface ncclCommCount
type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: count
end function ncclCommCount
end interface ncclCommCount
!End ncclCommCount
!Start ncclCommDestroy
interface ncclCommDestroy
subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
import :: ncclComm
implicit none
type(ncclComm), value :: comm
end subroutine ncclCommDestroy
end interface ncclCommDestroy
!End ncclCommDestroy
!Start ncclReduce
interface ncclReduce
type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
integer(c_int), value :: root
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclReduce
end interface ncclReduce
!End ncclReduce
!Start ncclAllReduce
interface ncclAllReduce
type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclAllReduce
end interface ncclAllReduce
!End ncclAllReduce
!Start ncclReduceScatter
interface ncclReduceScatter
type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: recvcount
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclReduceScatter
end interface ncclReduceScatter
!End ncclReduceScatter
!Start ncclBcast
interface ncclBcast
type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType
implicit none
type(c_devptr), value :: buff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
integer(c_int), value :: root
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclBcast
end interface ncclBcast
!End ncclBcast
!Start ncclAllGather
interface ncclAllGather
type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType
implicit none
type(c_devptr), value :: sendbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(c_devptr), value :: recvbuff
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclAllGather
end interface ncclAllGather
!End ncclAllGather
!End interfaces
end module ncclFor
!End nccl module


@@ -1,162 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 1))
call random_number(hostBuff)
print "(a)", "before allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl * nDev))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
hostBuff(:, i) = recvBuff
end do
print "(a)", ""
print "(a)", "after allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
end do
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a)", ""
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,171 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 1))
call random_number(hostBuff)
print "(a)", "before allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
allocate(hostBuffPtr(nDev))
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
end do
print "(a)", ""
print "(a)", "after allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
end do
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a)", ""
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,165 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before allreduce:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after allreduce:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,166 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before allreduce:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after allreduce:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,137 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: devBuff(:)
type(c_devptr), allocatable :: devBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 1))
call random_number(hostBuff(:, 1:nDev))
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
print "(a)", "before broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
allocate(devBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(devBuff(nEl))
devBuffPtr(i) = c_devloc(devBuff)
devBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
hostBuff(:, i) = devBuff
end do
print "(a)", ""
print "(a)", "after broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
deallocate(devBuff)
end do
deallocate(devBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@@ -1,142 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: devBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 1))
call random_number(hostBuff(:, 1:nDev))
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
print "(a)", "before broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
allocate(hostBuffPtr(nDev))
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(devBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
end do
print "(a)", ""
print "(a)", "after broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(devBuffPtr(i))
end do
deallocate(devBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,164 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
stat = cudaSetDevice(devList(root + 1))
call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
print "(a)", ""
print "(a)", "after reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,165 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
stat = cudaSetDevice(devList(root + 1))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
print "(a)", ""
print "(a)", "after reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,165 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reducescatter:"
do i = 1, nDev
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl * nDev))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after reducescatter:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test


@ -1,174 +0,0 @@
!*************************************************************************
!* Copyright (c) 2016 Research Computing Services (RCS), University of
!* Cambridge. All rights reserved.
!*
!* See LICENSE.txt for license information
!*************************************************************************
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reducescatter:"
do i = 1, nDev
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after reduceScatter:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

87
makefiles/common.mk Normal file

@ -0,0 +1,87 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
CUDA_HOME ?= /usr/local/cuda
PREFIX ?= /usr/local
VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
# Better define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
# Include Volta support if we're using CUDA9 or above
ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
########## GCOV ##########
GCOV ?= 0 # disabled by default
GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # coverage flags only when GCOV=1 and DEBUG=1
CXXFLAGS += ${GCOV_FLAGS}
NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
LDFLAGS += ${GCOV_FLAGS}
NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
# $(warning GCOV_FLAGS=${GCOV_FLAGS})
########## GCOV ##########
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS += -O3 -g
else
NVCUFLAGS += -O0 -G -g
CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
CXXFLAGS += -Wall -Wextra
else
.SILENT:
endif
ifneq ($(TRACE), 0)
CXXFLAGS += -DENABLE_TRACE
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif

33
makefiles/formatting.mk Normal file

@ -0,0 +1,33 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
# As this file defines a new target (format), it should only be included after the definition of the
# default target.
ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
ASTYLEDIR := $(BUILDDIR)/contrib
ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
ASTYLEVER := 3.1
ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
$(ASTYLEDIR) :
@mkdir -p $(ASTYLEDIR)
$(ASTYLETAR) : $(ASTYLEDIR)
@wget -q -O $(ASTYLETAR) $(ASTYLEURL)
$(ASTYLEBLD) : $(ASTYLETAR)
@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
$(ASTYLEBIN) : $(ASTYLEBLD)
${MAKE} -C $(ASTYLEBLD)
.PHONY : format
format : $(ASTYLEBIN)
@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)

6
makefiles/version.mk Normal file

@ -0,0 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 3
NCCL_PATCH := 5
NCCL_SUFFIX :=
PKG_REVISION := 5

26
pkg/Makefile Normal file

@ -0,0 +1,26 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
.PHONY : all clean
default : build
build : debian.build txz.build
BUILDDIR ?= $(abspath ../build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := debian txz
all: ${TARGETS:%=%.build}
prep: ${TARGETS:%=%.prep}
build: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.prep:
${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
%.build:
${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
%.clean:
${MAKE} -C $* clean

58
pkg/debian/Makefile Normal file

@ -0,0 +1,58 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
DEBPREPDIR := $(BUILDDIR)/debian
PKGDIR := $(BUILDDIR)/pkg/deb/
DEBGEN_IN := $(wildcard *.in)
DEBGEN := $(DEBGEN_IN:.in=)
DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
ifeq ($(PKG_MULTIARCH),)
# Hardwire the PKG_MULTIARCH directory, as the distribution-agnostic compiler shipped for RHEL6 (gcc 4.8.3) doesn't set it
PKG_MULTIARCH := $(ARCH)-linux-gnu
endif
prep : $(DEBTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building Debian package\n"
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
clean:
rm -Rf $(DEBPREPDIR) $(PKGDIR)
$(DEBPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(DEBPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
-e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
$< > $@
$(DEBPREPDIR)/% : %
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBPREPDIR)
cp -f $< $@

5
pkg/debian/changelog.in Normal file

@ -0,0 +1,5 @@
nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
* Automatic Debian package from build
-- cudatools <cudatools@nvidia.com> ${pkg:Timestamp}


@ -7,22 +7,24 @@ Standards-Version: 3.9.5
Package: libnccl${nccl:Major}
Section: libs
Architecture: ${deb:Arch}
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
applications.
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
Package: libnccl-dev
Section: libdevel
Architecture: ${deb:Arch}
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
applications.
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
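
The updated package description above lists the collectives the runtime library implements (all-reduce, all-gather, reduce, broadcast, reduce-scatter). As a rough usage sketch of what the libnccl/libnccl-dev packages install (illustrative only, not part of this commit; error checking is omitted and the fixed 8-GPU arrays are an assumption of the example), a single-process all-reduce across the visible GPUs could look like this:

// Minimal single-process, multi-GPU all-reduce sketch (illustrative only).
#include <cuda_runtime.h>
#include <nccl.h>

int main() {
  int nDev = 0;
  cudaGetDeviceCount(&nDev);
  if (nDev > 8) nDev = 8;                    // keep the fixed-size arrays of this example safe

  ncclComm_t comms[8];
  cudaStream_t streams[8];
  float* buf[8];
  int devs[8];
  const size_t count = 1 << 20;              // element count, not bytes

  for (int i = 0; i < nDev; i++) {
    devs[i] = i;
    cudaSetDevice(i);
    cudaMalloc(&buf[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }
  ncclCommInitAll(comms, nDev, devs);        // one communicator (rank) per GPU

  ncclGroupStart();                          // group the per-GPU calls issued from one thread
  for (int i = 0; i < nDev; i++)
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);       // collectives are asynchronous on their stream
  }
  for (int i = 0; i < nDev; i++) {
    ncclCommDestroy(comms[i]);
    cudaFree(buf[i]);
    cudaStreamDestroy(streams[i]);
  }
  return 0;
}

The ncclGroupStart/ncclGroupEnd pair is used because a single thread drives several communicators here; each collective call only enqueues work on its stream.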

1
pkg/debian/copyright Symbolic link

@ -0,0 +1 @@
../../LICENSE.txt

9
pkg/debian/gbp.conf Normal file

@ -0,0 +1,9 @@
[DEFAULT]
debian-branch = master
upstream-branch = master
ignore-new = True
[git-buildpackage]
no-purge = True


@ -0,0 +1,3 @@
include/nccl.h /usr/include
lib/libnccl.so /usr/lib/${pkg:MultiArch}
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}


@ -0,0 +1,2 @@
lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}

62
pkg/redhat/Makefile Normal file

@ -0,0 +1,62 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
RPMPREPDIR := $(BUILDDIR)/redhat
PKGDIR := $(BUILDDIR)/pkg/rpm/
RPMGEN_IN := $(wildcard *.in)
RPMGEN := $(RPMGEN_IN:.in=)
RPMFILES := $(RPMGEN)
RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
PKG_ARCH ?= $(shell uname -m)
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
ifeq ($(PKG_MULTIARCH),)
# Hardwire the PKG_MULTIARCH directory, as the distribution-agnostic compiler shipped for RHEL6 (gcc 4.8.3) doesn't set it
PKG_MULTIARCH := $(ARCH)-linux-gnu
endif
prep : $(RPMTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
@printf "Building Redhat package\n"
mkdir -p $(PKGDIR)
rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
--define "_rpmdir $(PKGDIR)" \
--define "_builddir $(PKGDIR)/build/" \
--define "_buildrootdir $(PKGDIR)/buildroot/" \
-bb $(BUILDDIR)/redhat/nccl.spec
clean:
rm -Rf $(RPMPREPDIR) $(PKGDIR)
$(RPMPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(RPMPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
-e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
$< > $@
$(RPMPREPDIR)/% : %
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(RPMPREPDIR)
cp -f $< $@

73
pkg/redhat/nccl.spec.in Normal file

@ -0,0 +1,73 @@
Name: libnccl
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
Release: ${pkg:Revision}
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Group: Development/Libraries
License: BSD
URL: http://developer.nvidia.com/nccl
Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
%description
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
%package devel
Summary: NVIDIA Collectives Communication Library (NCCL) Development Files
Group: Development/Libraries
%description devel
NCCL development files
%package static
Summary: NVIDIA Collectives Communication Library (NCCL) Static Library
Group: Development/Libraries
%description static
NCCL static library
%define debug_package %{nil}
%prep
%setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
%build
%install
rm -rf $RPM_BUILD_ROOT
install -m 755 -d $RPM_BUILD_ROOT
install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
# devel
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
%clean
rm -rf $RPM_BUILD_ROOT
%files devel
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_includedir}/nccl.h
%{_libdir}/libnccl.so
%files static
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_libdir}/libnccl_static.a
%files
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_libdir}/libnccl.so.${nccl:Major}
%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
%changelog

39
pkg/srctxz/Makefile Normal file

@ -0,0 +1,39 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
TXZPREPDIR := $(BUILDDIR)/srctxz
PKGDIR := $(BUILDDIR)/pkg/srctxz/
TXZGEN_IN := $(wildcard *.in)
TXZGEN := $(TXZGEN_IN:.in=)
TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
PKG_REVISION ?= 3
PKG_ARCH := $(shell uname -m)
prep: $(TXZTARGETS)
build: prep
$(MAKE) -C ../../src clean
@printf "Building source tar.xz package\n"
(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
clean:
rm -Rf $(TXZPREPDIR) $(PKGDIR)
$(TXZPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(TXZPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
$< > $@


@ -0,0 +1,34 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# To run from $BUILDDIR/
cd ..
NCCLDIR=`basename $PWD`
echo "Checking for unclean directory ..."
git clean -x -i
echo "Clean done."
echo "Checking for uncommited files ..."
if [ "`git status -s | wc -l`" != "0" ]; then
git status -s
echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
read
fi
cd ..
NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
tar --exclude build \
--exclude ".git*" \
--exclude pkg/srctxz \
--transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR

43
pkg/txz/Makefile Normal file

@ -0,0 +1,43 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../build)
TXZPREPDIR := $(BUILDDIR)/txz
PKGDIR := $(BUILDDIR)/pkg/txz/
TXZGEN_IN := $(wildcard *.in)
TXZGEN := $(TXZGEN_IN:.in=)
TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
PKG_ARCH := $(shell uname -m)
prep: $(TXZTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
build: prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building tar.xz package\n"
(cd $(BUILDDIR); bash txz/create_txz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
clean:
rm -Rf $(TXZPREPDIR) $(PKGDIR)
$(TXZPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(TXZPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
$< > $@

24
pkg/txz/create_txz.sh.in Normal file

@ -0,0 +1,24 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# To run from $BUILDDIR/
BUILDDIR=`basename $PWD`
cd ..
NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
CUDA_MAJOR=${cuda:Major}
CUDA_MINOR=${cuda:Minor}
PKG_REVISION=${pkg:Revision}
PKG_ARCH=${pkg:Arch}
NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt

106
src/Makefile Normal file

@ -0,0 +1,106 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../makefiles/common.mk
include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
##### lib files
LIBNAME := libnccl.so
STATICLIBNAME := libnccl_static.a
##### dirs
BUILDDIR ?= $(abspath ../build)
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
##### target files
INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
##### rules
build : lib staticlib
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
devicelib: nccl.h
$(MAKE) -C collectives/device
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
nccl.h : nccl.h.in
# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
$< > $@
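
For reference, the version code stamped into the generated header follows the NCCL_VERSION(X,Y,Z) = X*1000 + Y*100 + Z scheme noted in the comment above, so 2.3.5 encodes to 2305, which is exactly what printf "%d%d%02d" 2 3 5 prints. A consumer could gate on that code at compile time roughly as in the sketch below, which assumes the generated nccl.h defines NCCL_VERSION_CODE (the substituted value) and the NCCL_VERSION macro:

/* Hypothetical consumer-side version gate against the generated nccl.h.
 * For 2.3.5: NCCL_VERSION(2,3,5) = 2*1000 + 3*100 + 5 = 2305. */
#include <nccl.h>

#if defined(NCCL_VERSION_CODE) && NCCL_VERSION_CODE >= NCCL_VERSION(2, 3, 0)
#define HAVE_NCCL_2_3 1   /* safe to rely on behavior introduced with this release */
#else
#define HAVE_NCCL_2_3 0
#endif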
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
$(eval TMP := $(shell mktemp -d))
cp $(LIBOBJ) $(TMP)
cd $(TMP) && ar x $(DEVICELIB) && cd -
ar cr $@ $(LIBOBJ) $(TMP)/*.o
rm -Rf $(TMP)
$(INCDIR)/%.h : %.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cu
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -I. -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -I. -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@rm -f $(@:%.o=%.d.tmp)
clean :
rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR} nccl.h
$(MAKE) -C collectives/device clean
install : lib
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.
include ../makefiles/formatting.mk


@ -1,202 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "primitives.h"
#define NUM_SUBSTEPS 2
#define NUM_BUFCHUNKS 2
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
template<int THREADS, int UNROLL, class FUNC, typename T>
__launch_bounds__(THREADS+WARP_SIZE, 1)
__global__ void AllGatherKernel(const KernelArgs<T> args) {
const int tid = threadIdx.x;
__shared__ T* sharedNextOutput;
__shared__ DevRing<T> ring;
bool pushrecv = args.pushrecv;
LoadRing<THREADS>(args.ring, &ring);
__syncthreads();
if (tid == 0) {
WaitFlag prevCommOp(ring.prevOpCounter, 0);
WaitFlag nextCommOp(ring.nextOpCounter, 0);
prevCommOp.wait(args.opIndex);
nextCommOp.wait(args.opIndex);
if (pushrecv) {
*ring.sendPtrToPrev = (T*)args.ThisOutput;
Wait([=] {
return *ring.recvPtrFromNext != nullptr;
});
sharedNextOutput = *ring.recvPtrFromNext;
*ring.recvPtrFromNext = nullptr;
}
}
__syncthreads();
WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
PostFlag postReadyToNext(ring.sendFlagToNext, 0);
typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
const int size = args.N;
const int nranks = args.nRanks;
const int buffSize = args.buffSize / sizeof(T);
const int sliceSize = buffSize / NUM_BUFCHUNKS;
int step = 0;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = args.ThisInput;
T * __restrict__ thisOutput = args.ThisOutput;
T * __restrict__ prevInput = ring.recvBuffer;
T * __restrict__ nextOutput = ring.sendBuffer;
for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
/////////////// begin AllGather steps ///////////////
int offset;
int maxOffset = size-chunkOffset;
int rankDest;
// step 0: push data to next GPU
rankDest = ring.userRank[0];
offset = chunkOffset + rankDest * size;
if (thisInput == thisOutput) {
Prims::Copy(
thisInput + offset,
pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(
thisInput + chunkOffset,
thisOutput + offset,
pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: copy to next GPU
if (pushrecv) {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring.userRank[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Copy(
thisOutput + offset,
sharedNextOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
} else {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring.userRank[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::DoubleCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
rankDest = ring.userRank[1];
offset = chunkOffset + rankDest * size;
// Here we need to copy from buffer to this output.
Prims::Copy(
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
}
// wait for the last data to be pushed to us
if (tid == 0) {
// Wait for last update from next then reset the flag
waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
*ring.recvFlagFromNext = 0;
// Wait for last update from prev then reset the flag
waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
*ring.recvFlagFromPrev = 0;
incrementOpCounter(&args);
}
}
#define THREADS 512
#define UNROLL 8
template<class FUNC, typename T>
ncclResult_t RingAllGather(const void* sendbuff, void* recvbuff,
const int count, ncclComm* comm, cudaStream_t stream) {
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
} else {
KernelArgs<T> args;
ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
LAUNCH_KERNEL(AllGatherKernel, THREADS, UNROLL, FUNC, T, args, stream);
}
return ncclSuccess;
}
template<typename T, template<typename> class RedOp>
class AllGather {
public:
static ncclResult_t entry(const void* sendbuff, void* recvbuff,
int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
return RingAllGather<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
}
};
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, int count, ncclDataType_t datatype,
void* recvbuff, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
void* recvbuff, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, "AllGather"));
return enqueue<AllGather, FuncNull>(sendbuff, recvbuff, count, datatype, 0, comm, stream);
}


@ -1,234 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "primitives.h"
#define NUM_SUBSTEPS 2
#define NUM_BUFCHUNKS 2
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
template<int THREADS, int UNROLL, class FUNC, typename T>
__launch_bounds__(THREADS+WARP_SIZE, 1)
__global__ void AllReduceKernel(const KernelArgs<T> args) {
const int tid = threadIdx.x;
__shared__ T* sharedNextOutput;
__shared__ DevRing<T> ring;
bool pushrecv = args.pushrecv;
LoadRing<THREADS>(args.ring, &ring);
__syncthreads();
if (tid == 0) {
WaitFlag prevCommOp(ring.prevOpCounter, 0);
WaitFlag nextCommOp(ring.nextOpCounter, 0);
prevCommOp.wait(args.opIndex);
nextCommOp.wait(args.opIndex);
if (pushrecv) {
*ring.sendPtrToPrev = (T*)args.ThisOutput;
Wait([=] {
return *ring.recvPtrFromNext != nullptr;
});
sharedNextOutput = *ring.recvPtrFromNext;
*ring.recvPtrFromNext = nullptr;
}
}
__syncthreads();
WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
PostFlag postReadyToNext(ring.sendFlagToNext, 0);
typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
const int size = args.N;
const int nranks = args.nRanks;
const int buffSize = args.buffSize / sizeof(T);
const int sliceSize = buffSize / NUM_BUFCHUNKS;
int step = 0;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = args.ThisInput;
T * __restrict__ thisOutput = args.ThisOutput;
T * __restrict__ prevInput = ring.recvBuffer;
T * __restrict__ nextOutput = ring.sendBuffer;
for (int chunkOffset = 0; chunkOffset < size; chunkOffset += nranks*sliceSize) {
/////////////// begin AllReduce steps ///////////////
int offset;
int maxOffset;
int slice;
int chunkSize = min(sliceSize, DIVUP(size-chunkOffset,nranks));
ALIGN_SIZE(chunkSize, THREADS*UNROLL);
// step 0: push data to next GPU
slice = ring.userRank[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring.userRank[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Reduce(
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring.userRank[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::ReduceCopy(
prevInput + poffset,
thisInput + offset,
pushrecv ? (sharedNextOutput + offset) : (nextOutput + noffset),
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
if (pushrecv) {
// k-2 steps: copy result to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring.userRank[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(
thisOutput + offset,
sharedNextOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
} else {
// k-2 steps: copy result to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring.userRank[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::DoubleCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
slice = ring.userRank[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
Prims::Copy(
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
}
// wait for the last data to be pushed to us
if (tid == 0) {
// Wait for last update from next then reset the flag
waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
*ring.recvFlagFromNext = 0;
// Wait for last update from prev then reset the flag
waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
*ring.recvFlagFromPrev = 0;
incrementOpCounter(&args);
}
}
#define THREADS 512
#define UNROLL 8
template<class FUNC, typename T>
ncclResult_t RingAllReduce(const void* sendbuff, void* recvbuff,
const int count, ncclComm* comm, cudaStream_t stream) {
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
} else {
KernelArgs<T> args;
ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
LAUNCH_KERNEL(AllReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
}
return ncclSuccess;
}
template<typename T, template <typename> class RedOp>
class AllReduce {
public:
static ncclResult_t entry(const void* sendbuff, void* recvbuff,
int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
return RingAllReduce<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
}
};
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, int count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, 0, comm, "AllReduce"));
return enqueue<AllReduce>(sendbuff, recvbuff, count, datatype, op, 0, comm, stream);
}

292
src/bootstrap.cu Normal file

@ -0,0 +1,292 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
#include <unistd.h>
#include <sys/types.h>
// Always use sockets for bootstrap
ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
// Additional sync functions based on async + test for bootstrap, using host ptrs.
static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
void* request;
NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
int done = 0;
while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
return ncclSuccess;
}
static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
void* request;
NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
int done = 0;
while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
return ncclSuccess;
}
struct extId {
ncclNetHandle_t extHandle;
void* extListenComm;
uint64_t hostHash;
pid_t pid;
int fd;
pthread_t bootstrapThread;
};
struct bootstrapOp {
int op;
int size;
};
struct extInfo {
int rank;
int nranks;
ncclNetHandle_t extHandle;
};
enum {
BOOTSTRAP_ALLGATHER = 1,
BOOTSTRAP_RINGEXCHANGE,
};
#include <sys/resource.h>
static ncclResult_t setFilesLimit() {
struct rlimit filesLimit;
SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
filesLimit.rlim_cur = filesLimit.rlim_max;
SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
return ncclSuccess;
}
static void *bootstrapRoot(void* commId) {
struct extInfo info;
struct extId* id = (struct extId*)commId;
struct bootstrapOp bop;
void **extSendComm = NULL;
void **extRecvComm = NULL;
int size, alloc_size = 0;
char* data = NULL;
ncclResult_t res;
setFilesLimit();
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
void* tmpRecvComm;
NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpRecvComm), res, out);
NCCLCHECKGOTO(bootstrapRecv(tmpRecvComm, &info, sizeof(info)), res, out);
if (!c) {
extSendComm = (void**)calloc(info.nranks, sizeof(void*));
extRecvComm = (void**)calloc(info.nranks, sizeof(void*));
if (extSendComm == NULL || extRecvComm == NULL) {
WARN("Bootstrap thread : failed to allocate memory");
goto out;
}
nranks = info.nranks;
}
if (nranks != info.nranks) {
WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
goto out;
}
extRecvComm[info.rank] = tmpRecvComm;
NCCLCHECKGOTO(bootstrapConnect(0, info.extHandle, extSendComm+info.rank), res, out);
c++;
} while (c < nranks);
do {
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[0], &bop, sizeof(struct bootstrapOp)), res, out);
if (bop.size == -1) {
break;
} else {
size = bop.size;
if (size*nranks*2 > alloc_size) {
if (data) free(data); data = NULL;
NCCLCHECKGOTO(ncclCalloc(&data, size*nranks*2), res, out);
alloc_size = size*nranks*2;
}
}
if (bop.op == BOOTSTRAP_ALLGATHER) {
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], data+size*r, size), res, out);
}
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data, size*nranks), res, out);
}
} else if (bop.op == BOOTSTRAP_RINGEXCHANGE) {
// Receive from all and build total table
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], data+r*2*size, 2*size), res, out);
}
// Get prev/next request from everyone and answer.
for (int r=0; r<nranks; r++) {
int offset;
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], &offset, sizeof(int)), res, out);
NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data+offset, size), res, out);
NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], &offset, sizeof(int)), res, out);
NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data+offset, size), res, out);
}
} else {
WARN("Bootstrap Root : invalid op type received %d", bop.op);
break;
}
} while (1);
out:
bootstrapCloseListen(id->extListenComm);
for (int r=0; r<nranks; r++) {
if (extSendComm[r]) bootstrapCloseSend(extSendComm[r]);
if (extRecvComm[r]) bootstrapCloseRecv(extRecvComm[r]);
}
free(commId);
if (data) free(data);
if (extSendComm) free(extSendComm);
if (extRecvComm) free(extRecvComm);
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
struct extId* id = (struct extId*)commId;
id->hostHash = getHostHash();
NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandle, &id->extListenComm));
ncclUniqueId* threadIdCopy;
NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
pthread_create(&id->bootstrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
static_assert(sizeof(extId) < sizeof(ncclUniqueId), "extId does not fit inside ncclUniqueId");
extId* id = (extId*)out;
char* env = getenv("NCCL_COMM_ID");
if (env) {
if (ncclSocketCreateHandle(&id->extHandle, env) != 0) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
id->pid = -1;
} else {
id->pid = getpid();
NCCLCHECK(bootstrapCreateRoot(out, false));
}
return ncclSuccess;
}
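// Illustrative only (not part of this commit's call sites): a launcher can force
// the bootstrap root address through NCCL_COMM_ID instead of letting rank 0
// create one; the accepted formats are the ones listed in the WARN above.
// The addresses and port below are placeholders.
//   setenv("NCCL_COMM_ID", "192.168.1.1:12345", 1);  // <ipv4>:<port>
//   setenv("NCCL_COMM_ID", "[::1]:12345", 1);        // [<ipv6>]:<port>
//   setenv("NCCL_COMM_ID", "node0:12345", 1);        // <hostname>:<port>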
struct extState {
void* extRecvComm;
void* extSendComm;
int rank;
int nranks;
};
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
struct extId* id = (struct extId*)commId;
bool idFromEnv = id->pid < 0;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
*commState = state;
struct extInfo info;
info.rank = rank;
info.nranks = nranks;
void* tmpListenComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandle, &id->extHandle, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info ('findSubnetIf' indicates that the net device is unknown)
int dev = idFromEnv ? findSubnetIf : 0;
NCCLCHECK(bootstrapListen(dev, &info.extHandle, &tmpListenComm));
NCCLCHECK(bootstrapConnect(dev, id->extHandle, &state->extSendComm));
NCCLCHECK(bootstrapSend(state->extSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapAccept(tmpListenComm, &state->extRecvComm));
NCCLCHECK(bootstrapCloseListen(tmpListenComm));
return ncclSuccess;
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
struct extState* state = (struct extState*)commState;
char* data = (char*)allData;
struct bootstrapOp bop;
bop.op = BOOTSTRAP_ALLGATHER;
bop.size = size;
if (!state->rank) {
NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
}
NCCLCHECK(bootstrapSend(state->extSendComm, data+state->rank*size, size));
NCCLCHECK(bootstrapRecv(state->extRecvComm, data, size*state->nranks));
return ncclSuccess;
}
ncclResult_t bootstrapRingExchange(void* commState, void* prevNextData, int prev, int next, int size) {
struct extState* state = (struct extState*)commState;
char* mydata = (char*)prevNextData;
int prev_offset = prev*2*size+size, next_offset = next*2*size;
struct bootstrapOp bop;
bop.op = BOOTSTRAP_RINGEXCHANGE;
bop.size = size;
if (!state->rank) {
NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
}
// Send data to root
NCCLCHECK(bootstrapSend(state->extSendComm, mydata, 2*size));
// Receive prev and next data
NCCLCHECK(bootstrapSend(state->extSendComm, &prev_offset, sizeof(int)));
NCCLCHECK(bootstrapRecv(state->extRecvComm, mydata, size));
NCCLCHECK(bootstrapSend(state->extSendComm, &next_offset, sizeof(int)));
NCCLCHECK(bootstrapRecv(state->extRecvComm, mydata+size, size));
return ncclSuccess;
}
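// Worked example of the offset arithmetic above (illustrative values only):
// with size = 64 and this rank's prev = 3, next = 5, the root stores rank r's
// 2*size bytes at data + r*2*size, so
//   prev_offset = 3*2*64 + 64 = 448  -> the "next" half of rank 3's entry
//   next_offset = 5*2*64      = 640  -> the "prev" half of rank 5's entry
// i.e. each rank reads back the halves its ring neighbors published facing it.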
ncclResult_t bootstrapClose(void* commState) {
struct extState* state = (struct extState*)commState;
struct bootstrapOp bop;
bop.size = -1;
if (!state->rank) {
NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
}
NCCLCHECK(bootstrapCloseSend(state->extSendComm));
NCCLCHECK(bootstrapCloseRecv(state->extRecvComm));
free(state);
return ncclSuccess;
}
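// Minimal end-to-end sketch of this bootstrap API (hypothetical caller, not part
// of this commit; error unwinding elided). Each rank contributes one fixed-size
// record and gets every rank's record back through the root.
struct exampleRecord { int rank; int listenPort; };  // assumed payload
static ncclResult_t exampleBootstrapExchange(ncclUniqueId* id, int rank, int nranks, int myPort) {
  void* commState;
  NCCLCHECK(bootstrapInit(id, rank, nranks, &commState));
  struct exampleRecord* all;
  NCCLCHECK(ncclCalloc(&all, nranks));
  all[rank].rank = rank;
  all[rank].listenPort = myPort;
  // 'size' is the per-rank contribution; the buffer holds nranks entries.
  NCCLCHECK(bootstrapAllGather(commState, all, sizeof(struct exampleRecord)));
  // all[0..nranks-1] now holds every rank's record, in rank order.
  NCCLCHECK(bootstrapClose(commState));
  free(all);
  return ncclSuccess;
}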
@@ -1,164 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "primitives.h"
#define NUM_SUBSTEPS 4
#define NUM_BUFCHUNKS 2
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
template<int THREADS, int UNROLL, class FUNC, typename T>
__launch_bounds__(THREADS+WARP_SIZE, 1)
__global__ void BroadcastKernel(const KernelArgs<T> args) {
const int tid = threadIdx.x;
__shared__ T* sharedNextOutput;
__shared__ DevRing<T> ring;
bool pushrecv = args.pushrecv;
LoadRing<THREADS>(args.ring, &ring);
__syncthreads();
if (tid == 0) {
WaitFlag prevCommOp(ring.prevOpCounter, 0);
WaitFlag nextCommOp(ring.nextOpCounter, 0);
prevCommOp.wait(args.opIndex);
nextCommOp.wait(args.opIndex);
if (pushrecv) {
*ring.sendPtrToPrev = (T*)args.ThisOutput;
Wait([=] {
return *ring.recvPtrFromNext != nullptr;
});
sharedNextOutput = *ring.recvPtrFromNext;
*ring.recvPtrFromNext = nullptr;
}
}
__syncthreads();
WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
PostFlag postReadyToNext(ring.sendFlagToNext, 0);
typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
const int size = args.N;
const int rank = ring.userRank[0];
const int nextRank = ring.userRank[1];
const int root = args.root;
const int buffSize = args.buffSize / sizeof(T);
const int sliceSize = buffSize / NUM_BUFCHUNKS;
int step = 0;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = args.ThisInput;
T * __restrict__ thisOutput = args.ThisOutput;
T * __restrict__ prevInput = ring.recvBuffer;
T * __restrict__ nextOutput = ring.sendBuffer;
for (int offset = 0; offset < size; offset += sliceSize) {
int maxOffset = size-offset;
if (rank == root) {
Prims::Copy(
thisInput + offset,
pushrecv ? sharedNextOutput + offset : nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else if (nextRank == root) {
if (pushrecv) maxOffset = 0; // Only wait for signals
Prims::Copy(
prevInput + boffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
if (pushrecv) {
Prims::Copy(
thisOutput + offset,
sharedNextOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(
prevInput + boffset,
thisOutput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
}
NEXT_STEP; // Increases step, boffset
}
// wait for the last data to be pushed to us
if (tid == 0) {
if (nextRank != root) {
// Wait for last update from next then reset the flag
waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
*ring.recvFlagFromNext = 0;
}
if (rank != root) {
// reset the flag
*ring.recvFlagFromPrev = 0;
}
incrementOpCounter(&args);
}
}
#define THREADS 256
#define UNROLL 8
template<class FUNC, typename T>
ncclResult_t RingBroadcast(void* buff, const int count, const int root,
ncclComm* comm, cudaStream_t stream) {
if (comm->nRanks != 1) {
KernelArgs<T> args;
ArgsSetup(&args, buff, buff, root, count, comm);
LAUNCH_KERNEL(BroadcastKernel, THREADS, UNROLL, FUNC, T, args, stream);
}
return ncclSuccess;
}
template<typename T, template<typename> class RedOp>
class Broadcast {
public:
static ncclResult_t entry(const void* sendbuff, void* recvbuff,
int count, int root, ncclComm* comm, cudaStream_t stream) {
return RingBroadcast<RedOp<T>, T>(recvbuff, count, root, comm, stream);
}
};
NCCL_API(ncclResult_t, ncclBcast, void* buff, int count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ArgsCheck(buff, buff, count, datatype, ncclSum, root, comm, "Bcast"));
return enqueue<Broadcast, FuncNull>(nullptr, buff, count, datatype, root, comm, stream);
}
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
ncclSum, 0, comm, stream);
}
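// Usage sketch from the application side (illustrative; assumes 'comm' and
// 'stream' were created elsewhere, e.g. with ncclCommInitRank, and that every
// rank passes the same sendcount).
static void exampleAllGather(ncclComm_t comm, cudaStream_t stream, int nranks) {
  const size_t sendcount = 1024;  // per-rank element count (assumed)
  float *sendbuff, *recvbuff;
  cudaMalloc((void**)&sendbuff, sendcount*sizeof(float));
  cudaMalloc((void**)&recvbuff, (size_t)nranks*sendcount*sizeof(float));
  // Each rank contributes sendcount floats; recvbuff ends up holding
  // nranks*sendcount floats ordered by rank.
  ncclAllGather(sendbuff, recvbuff, sendcount, ncclFloat, comm, stream);
  cudaStreamSynchronize(stream);
  cudaFree(sendbuff); cudaFree(recvbuff);
}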
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
op, 0, comm, stream);
}
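// Usage sketch (illustrative): an in-place allreduce, i.e. sendbuff == recvbuff,
// which is supported; the single-rank shortcut above then skips the copy as well.
static void exampleAllReduceInPlace(ncclComm_t comm, cudaStream_t stream) {
  const size_t count = 1 << 20;  // assumed element count
  float* buff;
  cudaMalloc((void**)&buff, count*sizeof(float));
  ncclAllReduce(buff, buff, count, ncclFloat, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);
  cudaFree(buff);
}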
@@ -0,0 +1,42 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
}
return ncclSuccess;
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
ncclSum, root, comm, stream);
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
ncclSum, root, comm, stream);
}
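// Usage sketch (illustrative): the two entry points differ only in buffer
// handling. ncclBroadcast takes distinct buffers (sendbuff only matters on the
// root); the deprecated ncclBcast broadcasts in place.
static void exampleBroadcast(ncclComm_t comm, cudaStream_t stream, int root) {
  const size_t count = 4096;  // assumed element count
  float *sendbuff, *recvbuff;
  cudaMalloc((void**)&sendbuff, count*sizeof(float));
  cudaMalloc((void**)&recvbuff, count*sizeof(float));
  ncclBroadcast(sendbuff, recvbuff, count, ncclFloat, root, comm, stream);
  ncclBcast(recvbuff, count, ncclFloat, root, comm, stream);  // legacy in-place form
  cudaStreamSynchronize(stream);
  cudaFree(sendbuff); cudaFree(recvbuff);
}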
@@ -0,0 +1,66 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
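// Worked example (enum values assumed from nccl.h: ncclNumOps = 4,
// ncclNumTypes = 9, ncclSum = 0, ncclFloat32 = 7; ncclCollAllReduce = 4 above):
//   FUNC_INDEX(ncclCollAllReduce, ncclSum, ncclFloat32, /*ll=*/1)
//     = (((4*4 + 0)*9) + 7)*2 + 1 = 303
// which selects the LL variant of AllReduce/sum/f32 in the ncclFuncs table.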
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
#define NCCL_KERN_NAME(coll, op, dtype) \
coll##Kernel_##op##_##dtype
/* Declare all collective operations */
#define DECL_COLL4(coll, op, dtype) \
extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll);
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##LL, op, dtype) \
DECL_COLL4(coll, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
DECL_COLL3(coll, op, u8) \
DECL_COLL3(coll, op, i32) \
DECL_COLL3(coll, op, u32) \
DECL_COLL3(coll, op, i64) \
DECL_COLL3(coll, op, u64) \
DECL_COLL3(coll, op, f16) \
DECL_COLL3(coll, op, f32) \
DECL_COLL3(coll, op, f64)
#define DECL_COLL(coll) \
DECL_COLL2(coll, sum) \
DECL_COLL2(coll, prod) \
DECL_COLL2(coll, min) \
DECL_COLL2(coll, max)
#define DECL_ALL_COLLS \
DECL_COLL2(ncclBroadcast, copy) \
DECL_COLL(ncclReduce) \
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce)
DECL_ALL_COLLS
#define ALLREDUCE_SUBSTEPS 2
#define ALLREDUCE_BUFCHUNKS 2
#define ALLGATHER_SUBSTEPS 2
#define ALLGATHER_BUFCHUNKS 2
#define REDUCESCATTER_SUBSTEPS 2
#define REDUCESCATTER_BUFCHUNKS 2
#define BROADCAST_SUBSTEPS 8
#define BROADCAST_BUFCHUNKS 2
#define REDUCE_SUBSTEPS 8
#define REDUCE_BUFCHUNKS 2
#endif
@@ -0,0 +1,86 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../../makefiles/common.mk
include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
$(OBJDIR)/functions.o
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
DEPENDFILES := $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
NVCUFLAGS += -I. -I.. -I../.. -I../../include --compiler-options "-fPIC -fvisibility=hidden"
all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@printf "Archiving %-35s > %s\n" objects $@
ar cr $@ $^
# We do not want make to rebuild the *.d files when running make clean.
# So we only provide targets for the .dep files; each .dep rule produces both
# the .dep and the .d file. Only the .d files are included, while the .dep
# files keep track of what needs to be regenerated.
$(OBJDIR)/%.dep : %.cu
@mkdir -p $(OBJDIR)
@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $@
@rm -f $@.tmp
@cp $@ $(@:.dep=.d)
# Compiled kernels and collectives with relocatable device code ...
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
clean:
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "all_gather.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
#endif
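// Note: AllGather performs no reduction, so only the NCCL_OP == 0 pass of the
// four per-op compilations in the device Makefile instantiates it, and only for
// int8_t; the host side always passes byte counts (ncclInt8) for this collective.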
@@ -0,0 +1,269 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
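// With ALLGATHER_BUFCHUNKS == 2, the transfer buffer holds two slices of
// sliceSize elements; noffset ping-pongs between 0 and sliceSize while poffset
// trails it by one step, so one slice can be filled while the previous is drained.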
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = ring->recv.conn.direct;
int nextdirect = ring->send.conn.direct;
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
if (prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (*ptr == nullptr);
sharedNextOutput = (T*)*ptr;
*ptr = nullptr;
}
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
Prims::Copy(tid, nthreads,
thisInput + chunkOffset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else {
Prims::DoubleCopy(tid, nthreads,
thisInput + chunkOffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
}
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: copy to next GPU
if (prevdirect) {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
Prims::Copy(tid, nthreads,
NULL,
NULL,
0, 0,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::DoubleCopy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Here we need to copy from buffer to this output.
Prims::Copy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
}
if (tid == 0) {
waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
*ring->send.conn.head = 0ULL;
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LL::ReduceCopy(
thisInput + chunkOffset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
} else {
LL::ReduceCopy(
thisInput + chunkOffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
}
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
maxOffset, pflag, llNthreads);
ACK_PREV;
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
@@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "all_reduce.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
#endif
@@ -0,0 +1,332 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = ring->recv.conn.direct;
int nextdirect = ring->send.conn.direct;
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
if (prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (*ptr == nullptr);
sharedNextOutput = (T*)*ptr;
*ptr = nullptr;
}
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int maxOffset;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::ReduceCopy(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
// k-2 steps: copy to next GPU
if (prevdirect) {
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
Prims::Copy(tid, nthreads,
NULL,
NULL,
0, 0,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::DoubleCopy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
Prims::Copy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
}
if (tid == 0) {
// Wait for next to have consumed all data before we reset the flag
waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
*ring->send.conn.head = 0ULL;
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*nranks*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int maxOffset;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
maxOffset, pflag, llNthreads);
ACK_PREV;
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "broadcast.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
#endif
@@ -0,0 +1,228 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = ring->recv.conn.direct;
int nextdirect = ring->send.conn.direct;
WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
const ssize_t size = args->N;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
if (nextRank != root) {
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
if (rank != root && prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextRank != root && nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (*ptr == nullptr);
sharedNextOutput = (T*)*ptr;
*ptr = nullptr;
}
}
__syncthreads();
uint64_t step = 0ULL;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
Prims::Copy(tid, nthreads,
thisInput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else {
Prims::DoubleCopy(tid, nthreads,
thisInput + offset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
}
} else if (nextRank == root) {
if (prevdirect) maxOffset = 0; // Only wait for signals
Prims::Copy(tid, nthreads,
prevInput + boffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
if (prevdirect) {
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(tid, nthreads,
prevInput + boffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
}
NEXT_STEP; // Increases step, boffset
}
if (tid == 0) {
if (nextRank != root) {
// Wait for next to have consumed data before resetting the flag
waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
*ring->send.conn.head = 0ULL;
}
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
boffset += NCCL_LL_SLICE_LINES; \
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
flag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int rank = comm->rank;
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t flag = step + 1;
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (rank == root) {
WAIT_NEXT;
if (thisInput == thisOutput) {
LL::ReduceCopy(
thisInput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
} else {
LL::ReduceCopy(
thisInput + offset,
thisOutput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
}
POST_SIZE;
NEXT_STEP_LL;
} else if (nextRank == root) {
LL::ReduceCopy(
prevInput + boffset,
thisOutput + offset,
maxOffset, flag, llNthreads);
NEXT_STEP_LL;
ACK_PREV;
} else {
WAIT_NEXT;
LL::ReduceCopy(
prevInput + boffset,
thisOutput + offset,
nextOutput + boffset,
maxOffset, flag, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
ACK_PREV;
}
}
// We need everyone to acknowledge data even if they didn't receive anything
// so that the next collective can start right away.
ACK_PREV;
FIFO_CLEANING_AND_SAVE_STEP(flag);
}
@@ -0,0 +1,90 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
#include "core.h"
#include "nccl.h"
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
extern __device__ ncclKern_t ncclFuncs[];
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
__syncthreads();
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
__syncthreads();
}
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
if (tid == 0) hostColl->active = 0;
}
/* Functions for aggregation case */
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
}
/* Kernels with the first operation inlined */
#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
\
struct ncclComm* comm = firstColl.args.comm; \
struct ncclRing* ring = comm->rings+bid; \
struct ncclColl* c; \
if (bid == 0) { \
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
c = &firstColl; \
} else { \
c = &localColl; \
load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
} \
while (1) { \
if (tid < c->nThreads) { \
if (c->funcIndex == fIndex) { \
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
ncclFuncs[c->funcIndex](&c->args); \
} \
} \
int nextIndex = c->nextIndex; \
if (tid == 0) ring->collFifoHead = nextIndex; \
\
if (c->active == 2) { \
return; \
} \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
load_coll(c, ring->devCollectives+nextIndex, tid); \
} \
}
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0))
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
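// For reference, one instantiation such as
//   IMPL_COLL3(ncclAllReduce, sum, FuncSum, f32, float, ncclCollAllReduce, ncclSum, ncclFloat32)
// expands into four definitions: the __device__ functions ncclAllReduce_sum_f32
// and ncclAllReduceLL_sum_f32, plus the __global__ kernels
// ncclAllReduceKernel_sum_f32 and ncclAllReduceLLKernel_sum_f32, the latter two
// with their first operation inlined through the funcIndex comparison above.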
#endif
@@ -0,0 +1,372 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "core.h"
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>
// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
typedef uint64_t PackType;
// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const;
};
template<class FUNC>
struct MULTI<FUNC, int8_t> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of uint32_t.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
// for char, we do these as vector ops
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, uint8_t> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of uint32_t.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
// for char, we do these as vector ops
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, int32_t> {
static_assert(sizeof(PackType) == 2 * sizeof(int32_t),
"PackType must be twice the size of int.");
union converter {
PackType storage;
struct {
int32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, uint32_t> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of int.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, half> {
static_assert(sizeof(PackType) == 4 * sizeof(half),
"PackType must be four times the size of half.");
struct PackHalf2 {
half2 a, b;
};
__device__ PackType operator()(const PackType x, const PackType y) const {
struct PackHalf2 cx, cy, cr;
cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return *(reinterpret_cast<PackType*>(&cr));
}
};
template<class FUNC>
struct MULTI<FUNC, float> {
static_assert(sizeof(PackType) == 2 * sizeof(float),
"PackType must be twice the size of float.");
union converter {
PackType storage;
struct {
float a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, double> {
static_assert(sizeof(PackType) == sizeof(double),
"PackType must be the same size as double.");
__device__ PackType operator()(const PackType x, const PackType y) const {
double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
return __double_as_longlong(rv);
}
};
template<class FUNC>
struct MULTI<FUNC, uint64_t> {
static_assert(sizeof(PackType) == sizeof(uint64_t),
"PackType must be the same size as uint64_t.");
__device__ PackType operator()(const PackType x, const PackType y) const {
uint64_t rv = FUNC()(x, y);
return rv;
}
};
template<class FUNC>
struct MULTI<FUNC, int64_t> {
static_assert(sizeof(PackType) == sizeof(int64_t),
"PackType must be the same size as int64_t.");
__device__ PackType operator()(const PackType x, const PackType y) const {
int64_t rv = FUNC()((int64_t)x, (int64_t)y);
return rv;
}
};
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
size_t ptrval = reinterpret_cast<size_t>(ptr);
return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}
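// Example: ALIGNUP(13, 8) == 16 and ALIGNUP(16, 8) == 16; AlignUp() applies the
// same rounding to a pointer value, here used with alignof(Pack128) == 16.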
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
}
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#if CUDART_VERSION < 9000
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r.x = ptr->x;
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
ptr->x = val.x;
}
#else
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r = ((half*)ptr)[0];
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
((half*)ptr)[0] = val;
}
#endif
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__device__ inline void ReduceCopy(
const int tid, const int nthreads,
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int N) {
for (int idx = tid; idx < N; idx += nthreads) {
T val = vFetch(src0+idx);
if (TWO_INPUTS) {
val = FUNC()(val, vFetch(src1+idx));
}
vStore(dest0+idx, val);
if (TWO_OUTPUTS) {
vStore(dest1+idx, val);
}
}
}
typedef ulong2 Pack128;
template<class FUNC, typename T>
struct MULTI128 {
__device__ void operator()(Pack128& x, Pack128& y) {
x.x = MULTI<FUNC, T>()(x.x, y.x);
x.y = MULTI<FUNC, T>()(x.y, y.y);
}
};
inline __device__ void Fetch128(Pack128& v, Pack128* p) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
}
inline __device__ void Store128(Pack128* p, Pack128& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
}
#define WARP_SIZE 32
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
const int N) {
Pack128 t0[UNROLL];
Pack128 t1[UNROLL];
const Pack128* src0_end = src0 + N;
const int inc = nw * UNROLL * WARP_SIZE;
const int offset = w * UNROLL * WARP_SIZE + t;
src0 += offset; if (TWO_INPUTS) src1 += offset;
dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
while (src0 < src0_end) {
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
Fetch128(t0[u], src0+u*WARP_SIZE);
if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
}
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
Store128(dest0+u*WARP_SIZE, t0[u]);
if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
}
src0 += inc; if (TWO_INPUTS) src1 += inc;
dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
}
}
template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
int N) {
int Nrem = N;
if (Nrem <= 0) return;
int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
// stage 0: check if we'll be able to use the fast, 128-bit aligned path.
// If not, we'll just use the slow preamble path for the whole operation
bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
(!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
(!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
if (!alignable) {
Npreamble = Nrem;
}
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
int w = tid / WARP_SIZE; // Warp number
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
const int PackFactor = sizeof(Pack128) / sizeof(T);
// stage 2a: main loop
int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
* (UNROLL * nthreads); // round down
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
int Ndone2a = Nalign2a * PackFactor;
Nrem -= Ndone2a;
if (Nrem == 0) return;
dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
// stage 2b: slightly less optimized for section when we don't have full
// UNROLLs
int Nalign2b = Nrem / PackFactor;
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
int Ndone2b = Nalign2b * PackFactor;
Nrem -= Ndone2b;
if (Nrem == 0) return;
dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
// stage 2c: tail
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
}
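// Worked example (assumed: T = float, UNROLL = 4, nthreads = 256, N = 10003,
// all pointers already 16-byte aligned): PackFactor = 16/4 = 4, so the preamble
// handles 0 elements, stage 2a handles 2048 Pack128 = 8192 elements, stage 2b
// handles 452 Pack128 = 1808 elements, and the tail (2c) copies the remaining 3.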
#endif // NCCL_COMMON_KERNEL_H_
@@ -0,0 +1,64 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "collectives.h"
#include "common.h"
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype) \
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Must be consistent with ncclColl_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
};
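The table is laid out exactly as the macro nesting implies: collectives outermost, then reduction ops, then data types, then the regular/LL pair emitted by NCCL_FUNC4. A flattened lookup therefore follows ((coll*numOps + op)*numTypes + type)*2 + ll. The small host-side helper below illustrates that ordering; the helper name and the numOps/numTypes defaults are illustrative, not taken from the NCCL sources.

// Illustrative sketch, not part of the NCCL sources: the index order implied
// by the macro nesting above (collective, then op, then type, then LL bit).
#include <cstdio>

static int funcIndex(int coll, int op, int type, int ll,
                     int numOps = 4, int numTypes = 9) {
  return ((coll * numOps + op) * numTypes + type) * 2 + ll;
}

int main() {
  // e.g. ncclAllReduce (coll 4), sum (op 0), f32 (type 7), LL variant
  printf("index = %d\n", funcIndex(4, 0, 7, 1));
  return 0;
}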

View File

@ -0,0 +1,154 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_LL_KERNEL_H_
#define NCCL_LL_KERNEL_H_
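// Each ncclLLFifoLine packs 16 bytes as {data1, flag1, data2, flag2}: two
// 32-bit payload words, each paired with a 32-bit flag. readLL() spins until
// both flags equal the expected step value, so a single 128-bit volatile load
// serves as both the data transfer and the arrival check, which is what makes
// the LL ("low latency") path cheap for small messages.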
static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
uint32_t data1, flag1, data2, flag2;
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
return val64;
}
static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
}
// Using memcpy handles misaligned pointers.
static __device__ uint64_t readAL(uint64_t* src) {
uint64_t val;
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
return val;
}
static __device__ void storeAL(uint64_t* dst, uint64_t val) {
memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
}
template <typename T, class FUNC>
class LLPrimitives {
private:
template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
if (size <= 0) return;
size_t size64 = size * sizeof(T) / sizeof(uint64_t);
uint64_t* src1A = (uint64_t*)src1;
uint64_t* dst1A = (uint64_t*)dst1;
int offset = threadIdx.x;
// Do multiples of 64 bits
#pragma unroll 1
for (; offset < size64; offset += nthreads) {
uint64_t val;
if (HAS_SRC1) {
val = readAL(src1A+offset);
if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
} else if (HAS_SRC2) {
val = readLL(src2+offset, iflag);
}
if (HAS_DST1) storeAL(dst1A+offset, val);
if (HAS_DST2) storeLL(dst2+offset, val, oflag);
}
// Finish last word
int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
int sizeRem = size - sizeDone;
if (threadIdx.x == 0 && sizeRem) {
const T* src1B = src1 + sizeDone;
T* dst1B = dst1 + sizeDone;
uint64_t lastVal;
T* vals = (T*)&lastVal;
if (HAS_SRC2) {
uint64_t lastVal2 = readLL(src2+size64, iflag);
T* src2B = (T*)&lastVal2;
for (int offset = 0; offset < sizeRem; offset++) {
vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
}
} else if (HAS_SRC1) {
for (int offset = 0; offset < sizeRem; offset++) {
vals[offset] = src1B[offset];
}
}
if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
if (HAS_DST1) {
for (int offset = 0; offset < sizeRem; offset++) {
dst1B[offset] = vals[offset];
}
}
}
}
public:
static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
}
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
}
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
}
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
}
static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
}
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
}
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
}
};
// Common macros
#define STEP_TO_SLOT(step) \
(step % NCCL_LL_CHUNKS)
#define WAIT_NEXT \
if (tid == 0) { \
while (sendHead + NCCL_LL_CHUNKS <= step) { \
sendHead = sendHeadPtr[0]; \
} \
} \
asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
#define POST_SIZE \
if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
#define ACK_PREV \
asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
if (tid == 0) recvHeadPtr[0] = step;
#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
/* Reset all flags */ \
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
} \
__threadfence_system(); \
/* Restart from the same slot, only make sure sender waits for data to be reset */ \
step += NCCL_LL_CHUNKS; \
ACK_PREV; \
while (sendHeadPtr[0] < step); \
if (tid == 0) ring->send.conn.llLastCleaning = step; \
} \
ring->send.conn.llStep = step; \
} while (0);
#endif
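The macros above manage a small ring of NCCL_LL_CHUNKS slots inside the LL buffer: each step maps to one slot, writes its lines starting at NCCL_LL_SLICE_LINES * slot, and tags them with flag = step + 1 so stale data from an earlier pass around the ring is never mistaken for fresh data. The host-side sketch below walks through that mapping using placeholder constants (plain C++, not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: step -> slot/offset/flag.
// kChunks and kSliceLines are placeholders for NCCL_LL_CHUNKS / NCCL_LL_SLICE_LINES.
#include <cstdio>
#include <cstdint>

int main() {
  const int kChunks = 8, kSliceLines = 128;
  for (uint64_t step = 0; step < 10; ++step) {
    int slot = (int)(step % kChunks);     // STEP_TO_SLOT(step)
    int boffset = kSliceLines * slot;     // first fifo line used by this step
    uint32_t flag = (uint32_t)step + 1;   // value written next to each data word
    printf("step %llu -> slot %d, offset %d, flag %u\n",
           (unsigned long long)step, slot, boffset, flag);
  }
  return 0;
}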

View File

@ -0,0 +1,226 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PRIMITIVES_H_
#define NCCL_PRIMITIVES_H_
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
*
* To reduce the repetition of template arguments, the operations
* are bundled as static methods of the Primitives class.
*
* Each primitive operation copies/reduces a contiguous buffer and syncs
* an optional set of flags against a sub-step counter. The sync value is
* based on the step parameter. Sync flags must be of type WaitFlag or
* PostFlag. The primitive routines wait for all WaitFlag args to attain
* at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
* corresponding substep by previous step) before executing the transfer.
* After each substep is transferred, all PostFlag arguments get updated to
* the value SUBSTEPS*step+substep+1.
*/
class WaitFlag {
volatile uint64_t * const flag;
const int shift;
public:
__device__ __forceinline__
WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
__device__ __forceinline__
void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; }
};
class PostFlag {
volatile uint64_t * const flag;
const int shift;
volatile int * const fifo;
const int fifo_size;
public:
__device__ __forceinline__
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
__device__ __forceinline__
void post(uint64_t val) { *flag = (val - shift); }
__device__ __forceinline__
void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; };
};
// Helper to check if any argument is of type T.
// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
template<typename T> __device__ __forceinline__
bool AnyAre() { return false; }
template<typename T, typename FIRST_T, typename... TAIL_Ts>
__device__ __forceinline__
bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
}
// Wait on all WaitFlags, ignore PostFlags
__device__ __forceinline__
void WaitOnFlags(uint64_t val) { }
template <typename... TAIL_Ts> __device__ __forceinline__
void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
flag.wait(val);
WaitOnFlags(val, tail...);
}
template <typename... TAIL_Ts> __device__ __forceinline__
void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
WaitOnFlags(val, tail...);
}
// Post all PostFlags, ignore WaitFlags
__device__ __forceinline__
void PostToFlags(uint64_t val) { }
template <typename... TAIL_Ts> __device__ __forceinline__
void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
PostToFlags(val, tail...);
}
template <typename... TAIL_Ts> __device__ __forceinline__
void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
flag.post(val);
PostToFlags(val, tail...);
}
// Post sizes for PostFlags, ignore WaitFlags
__device__ __forceinline__
void PostSizeToFlags(uint64_t step, int size) { }
template <typename... TAIL_Ts> __device__ __forceinline__
void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
PostSizeToFlags(step, size, tail...);
}
template <typename... TAIL_Ts> __device__ __forceinline__
void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
flag.postSize(step, size);
PostSizeToFlags(step, size, tail...);
}
// Create pointer arithmetic syntax that doesn't break for nullptr_t
template <typename Tptr> __device__ __forceinline__
Tptr ptradd(Tptr ptr, int i) {
return ptr + i;
}
__device__ __forceinline__
nullptr_t ptradd(nullptr_t ptr, int i) {
return nullptr;
}
// Implementation of primitive types
template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
class Primitives {
private:
template <typename SRC2_T, // either T* or nullptr_t
typename DST2_T, // either T* or nullptr_t
typename... SYNC_Ts> // either WaitFlag or PostFlag
static __device__ __forceinline__ void
GenericOp(const int tid, const int nthreads,
const T* src1,
const SRC2_T src2,
T* dst1,
DST2_T dst2,
int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
enum { noSrc2 = std::is_same<SRC2_T, nullptr_t>::value };
enum { noDst2 = std::is_same<DST2_T, nullptr_t>::value };
static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
"src2 must be of type T* or nullptr_t");
static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
"dst2 must be of type T* or nullptr_t");
using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
int sliceSize = len / SUBSTEPS;
int sliceOffset = 0;
#pragma unroll 1
for (int sub=0; sub<SUBSTEPS; ++sub) {
int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
if (tid < nthreads) {
if (AnyAre<WaitFlag>(flags...)) {
if (tid == 0) {
WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
}
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
ReduceOrCopy
<
UNROLL,
OpType,
T,
!std::is_same<DST2_T, nullptr_t>::value, // HAS_DEST1
!std::is_same<SRC2_T, nullptr_t>::value // HAS_SRC1
>
(
tid, nthreads,
ptradd(dst1, sliceOffset),
ptradd(dst2, sliceOffset),
ptradd(src1, sliceOffset),
ptradd(src2, sliceOffset),
realSize
);
if (AnyAre<PostFlag>(flags...)) {
__syncthreads();
}
} else {
if (AnyAre<PostFlag>(flags...)) {
__syncthreads();
PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
__threadfence_system();
PostToFlags(SUBSTEPS*step + sub + 1, flags...);
}
}
sliceOffset += sliceSize;
}
}
public:
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
Copy(const int tid, const int nthreads, const T* src, T* dst,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
}
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
}
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
}
template <typename... SYNC_Ts>
static __device__ __forceinline__ void
ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
}
};
#endif // end include guard
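The flags implement a simple credit scheme: before substep sub of step step, a producer waits until the consumer's counter plus the constant shift passed at construction reaches SUBSTEPS*step + sub + 1, and once the transfer is done the consumer posts that same value (minus its own shift). Choosing the shift as (BUFCHUNKS-1)*SUBSTEPS lets the producer run ahead by BUFCHUNKS-1 buffer chunks before it ever blocks. A simplified host-side sketch of that arithmetic, with placeholder constants (not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: the WaitFlag/PostFlag
// credit arithmetic, with the consumer assumed to catch up after every substep.
#include <cstdio>
#include <cstdint>

int main() {
  const int SUBSTEPS = 4, BUFCHUNKS = 2;
  const int shift = (BUFCHUNKS - 1) * SUBSTEPS;    // credit handed to the producer
  uint64_t consumerFlag = 0;                       // what the consumer has posted
  for (uint64_t step = 0; step < 2; ++step) {
    for (int sub = 0; sub < SUBSTEPS; ++sub) {
      uint64_t need = SUBSTEPS * step + sub + 1;   // value WaitOnFlags() requires
      bool block = (consumerFlag + shift) < need;  // WaitFlag::wait() condition
      printf("step %llu sub %d: need %llu, have %llu+%d -> %s\n",
             (unsigned long long)step, sub, (unsigned long long)need,
             (unsigned long long)consumerFlag, shift, block ? "wait" : "go");
      consumerFlag = need;                         // pretend the consumer posts completion
    }
  }
  return 0;
}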

View File

@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "reduce.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
#endif

View File

@ -0,0 +1,190 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
if (rank != root) {
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
}
__syncthreads();
uint64_t step = 0ULL;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (prevRank == root) {
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else if (rank == root) {
Prims::Reduce(tid, nthreads,
prevInput + boffset,
thisInput + offset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
Prims::Reduce(tid, nthreads,
prevInput + boffset,
thisInput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
NEXT_STEP; // Increases step, boffset
}
if (tid == 0) {
if (rank != root) {
// Wait for next to have consumed data before resetting the flag
waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
*ring->send.conn.head = 0ULL;
}
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
boffset += NCCL_LL_SLICE_LINES; \
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
flag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int nranks = comm->nRanks;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t flag = step + 1;
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (prevRank == root) {
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
} else if (rank == root) {
LL::ReduceCopy(
thisInput + offset,
prevInput + boffset,
thisOutput + offset,
maxOffset, flag, llNthreads);
NEXT_STEP_LL;
ACK_PREV;
} else {
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + boffset,
nextOutput + boffset,
maxOffset, flag, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
ACK_PREV;
}
}
// We need everyone to acknowledge data even if they didn't receive anything
// so that the next collective can start right away.
ACK_PREV;
FIFO_CLEANING_AND_SAVE_STEP(flag);
}
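In the ring reduce above each rank plays one of three roles: the rank sitting right after the root (prevRank == root) only pushes its own input into the next rank's buffer, intermediate ranks add their input to the partial sum arriving from the previous rank and forward it, and the root adds its input last and writes the final result to its output. A host-side simulation of one chunk travelling around a 4-rank ring; the values and the identity ring order are illustrative, not from the NCCL sources:

// Illustrative sketch, not part of the NCCL sources: per-rank roles in the
// ring reduce, simulated on the host for a single chunk.
#include <cstdio>

int main() {
  const int nranks = 4, root = 2;
  int input[nranks] = {1, 2, 3, 4};
  int carry = 0;
  // walk the ring starting at the rank after root and ending at root
  for (int i = 0; i < nranks; ++i) {
    int rank = (root + 1 + i) % nranks;
    int prevRank = (rank + nranks - 1) % nranks;
    if (prevRank == root) {
      carry = input[rank];          // Prims::Copy: push own data only
    } else if (rank == root) {
      carry += input[rank];         // Prims::Reduce into thisOutput
      printf("root %d holds the full sum %d\n", rank, carry);
    } else {
      carry += input[rank];         // Prims::Reduce into nextOutput
    }
  }
  return 0;
}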

View File

@ -0,0 +1,364 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_REDUCE_KERNEL_H_
#define NCCL_REDUCE_KERNEL_H_
#include "common_kernel.h"
#include <limits>
template<typename T>
struct FuncNull {
__device__ T operator()(const T x, const T y) const {
return 0;
}
};
template<typename T>
struct FuncSum {
__device__ T operator()(const T x, const T y) const {
return x + y;
}
};
template<typename T>
struct FuncProd {
__device__ T operator()(const T x, const T y) const {
return x * y;
}
};
template<typename T>
struct FuncMax {
__device__ T operator()(const T x, const T y) const {
return (x < y) ? y : x;
}
};
template<typename T>
struct FuncMin {
__device__ T operator()(const T x, const T y) const {
return (x < y) ? x : y;
}
};
template<>
struct FuncSum<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x + cy.a.x;
cr.a.y = cx.a.y + cy.a.y;
cr.a.z = cx.a.z + cy.a.z;
cr.a.w = cx.a.w + cy.a.w;
return cr.storage;
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return x+y;
}
};
template<>
struct FuncSum<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x + cy.a.x;
cr.a.y = cx.a.y + cy.a.y;
cr.a.z = cx.a.z + cy.a.z;
cr.a.w = cx.a.w + cy.a.w;
return cr.storage;
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return x+y;
}
};
static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
/* This can be used both for signed and unsigned 8-bit multiplication */
#if (__CUDA_ARCH__ >= 300)
uint32_t rv;
asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
" vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
" vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
" shl.b32 t3, t3, 16;\n\t"
" shl.b32 t2, t2, 16;\n\t"
" vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
" shl.b32 t1, t1, 8;\n\t"
" vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
" and.b32 t1, t1, 0xff00ff00;\n\t"
" and.b32 t0, t0, 0x00ff00ff;\n\t"
" or.b32 %0, t0, t1;\n\t"
"}" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
union converter { uint32_t storage; char4 a; };
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x * cy.a.x;
cr.a.y = cx.a.y * cy.a.y;
cr.a.z = cx.a.z * cy.a.z;
cr.a.w = cx.a.w * cy.a.w;
return cr.storage;
#endif
}
template<>
struct FuncProd<int8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
return mulChar4(x, y);
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return x*y;
}
};
template<>
struct FuncProd<uint8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
return mulChar4(x, y);
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return x*y;
}
};
template<>
struct FuncMax<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = max(cx.a.x, cy.a.x);
cr.a.y = max(cx.a.y, cy.a.y);
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return (x>y) ? x : y;
}
};
template<>
struct FuncMax<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = max(cx.a.x, cy.a.x);
cr.a.y = max(cx.a.y, cy.a.y);
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return (x>y) ? x : y;
}
};
template<>
struct FuncMin<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = min(cx.a.x, cy.a.x);
cr.a.y = min(cx.a.y, cy.a.y);
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
return (x<y) ? x : y;
}
};
template<>
struct FuncMin<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = min(cx.a.x, cy.a.x);
cr.a.y = min(cx.a.y, cy.a.y);
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
return (x<y) ? x : y;
}
};
template<>
struct FuncSum<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hadd2(x, y);
#else
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fx.x + fy.x;
fr.y = fx.y + fy.y;
return __float22half2_rn(fr);
#endif
}
__device__ half operator()(const half x, const half y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hadd(x, y);
#else
return __float2half( __half2float(x) + __half2float(y) );
#endif
}
};
template<>
struct FuncProd<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hmul2(x, y);
#else
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fx.x * fy.x;
fr.y = fx.y * fy.y;
return __float22half2_rn(fr);
#endif
}
__device__ half operator()(const half x, const half y) const {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return __hmul(x, y);
#else
return __float2half( __half2float(x) * __half2float(y) );
#endif
}
};
template<>
struct FuncMax<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fmaxf(fx.x, fy.x);
fr.y = fmaxf(fx.y, fy.y);
return __float22half2_rn(fr);
}
__device__ half operator()(const half x, const half y) const {
float fx, fy, fm;
fx = __half2float(x);
fy = __half2float(y);
fm = fmaxf(fx, fy);
return __float2half(fm);
}
};
template<>
struct FuncMin<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
float2 fx, fy, fr;
fx = __half22float2(x);
fy = __half22float2(y);
fr.x = fminf(fx.x, fy.x);
fr.y = fminf(fx.y, fy.y);
return __float22half2_rn(fr);
}
__device__ half operator()(const half x, const half y) const {
float fx, fy, fm;
fx = __half2float(x);
fy = __half2float(y);
fm = fminf(fx, fy);
return __float2half(fm);
}
};
#endif // REDUCE_KERNEL_H_
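For 8-bit integer types the specializations above process four lanes per 32-bit register: architectures from sm_30 up to sm_70 use the PTX video instructions (packed vadd4/vmax4/vmin4 below sm_50, their per-byte variants from sm_50 to sm_70), and everything else falls back to unpacking through a small union. The host-side sketch below reproduces that fallback for packed unsigned 8-bit addition (plain C++, not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: the union-based fallback,
// reproduced on the host for packed unsigned 8-bit addition.
#include <cstdio>
#include <cstdint>

static uint32_t add4u8(uint32_t x, uint32_t y) {
  union { uint32_t storage; uint8_t a[4]; } cx, cy, cr;
  cx.storage = x;
  cy.storage = y;
  for (int i = 0; i < 4; ++i)
    cr.a[i] = (uint8_t)(cx.a[i] + cy.a[i]);  // per-lane add, wraps modulo 256
  return cr.storage;
}

int main() {
  printf("0x%08x\n", add4u8(0x01020304u, 0x10203040u));  // expected 0x11223344
  return 0;
}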

View File

@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "reduce_scatter.h"
#include "collectives.h"
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
#endif

View File

@ -0,0 +1,217 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
*ring->recv.conn.opCount = args->opCount;
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this rank's output
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
thisOutput + chunkOffset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
if (tid == 0) {
waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
*ring->send.conn.head = 0ULL;
*ring->recv.conn.tail = 0ULL;
__threadfence_system();
*ring->recv.conn.opCount = args->opCount+1;
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
thisOutput + chunkOffset,
maxOffset, pflag, llNthreads);
ACK_PREV;
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
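The loop above sends each contribution once around the ring: step 0 pushes the chunk owned by devUserRanks[nranks-1], the k-2 middle steps reduce-and-forward the chunks owned by devUserRanks[nranks-j], and the final step reduces into this rank's own output chunk, devUserRanks[0]. The host-side sketch below prints that visiting order for one rank; an identity ring order is assumed for illustration and is not from the NCCL sources:

// Illustrative sketch, not part of the NCCL sources: the order in which one
// rank visits chunk owners (rankDest) in the reduce-scatter loop above.
#include <cstdio>

int main() {
  const int nranks = 4, rank = 1;
  int devUserRanks[nranks];
  for (int i = 0; i < nranks; ++i) devUserRanks[i] = (rank + i) % nranks;
  printf("rank %d: push chunk of rank %d", rank, devUserRanks[nranks - 1]);
  for (int j = 2; j < nranks; ++j)
    printf(", reduce+forward chunk of rank %d", devUserRanks[nranks - j]);
  printf(", finish chunk of rank %d (own output)\n", devUserRanks[0]);
  return 0;
}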

src/collectives/reduce.cu (new file, 33 lines)
View File

@ -0,0 +1,33 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
op, root, comm, stream);
}
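For reference, a minimal single-process use of the entry point above might look as follows; it assumes two visible GPUs, uses ncclCommInitAll plus a grouped launch, and omits all error checking (illustrative only, not from the NCCL sources):

// Illustrative sketch, not part of the NCCL sources: one ncclReduce per rank
// in a single process, with all partial sums reduced onto root 0.
#include <nccl.h>
#include <cuda_runtime.h>

int main() {
  const int nDev = 2, root = 0;
  const size_t count = 1 << 20;
  int devs[nDev] = {0, 1};
  ncclComm_t comms[nDev];
  ncclCommInitAll(comms, nDev, devs);

  float* sendbuff[nDev]; float* recvbuff[nDev]; cudaStream_t streams[nDev];
  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc(&sendbuff[i], count * sizeof(float));
    cudaMalloc(&recvbuff[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // One call per communicator, grouped so the launches are treated as one op.
  ncclGroupStart();
  for (int i = 0; i < nDev; ++i)
    ncclReduce(sendbuff[i], recvbuff[i], count, ncclFloat, ncclSum,
               root, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nDev; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
  }
  for (int i = 0; i < nDev; ++i) ncclCommDestroy(comms[i]);
  return 0;
}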

View File

@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
size_t nbytes = count*ncclTypeSize(datatype);
INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
if (sendbuff != recvbuff)
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
}
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
op, 0, comm, stream);
}
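Here recvcount is the per-rank output size, so each rank's sendbuff must hold recvcount * nranks elements laid out in rank order, which is why the proxy and kernel above are sized with nbytes*comm->nRanks. A short fragment continuing the ncclReduce example above (same comms, streams and buffers; illustrative only, not from the NCCL sources):

// Illustrative fragment, not part of the NCCL sources: assumes the comms,
// streams and buffers from the previous sketch; each rank receives recvcount
// elements, so sendbuff holds recvcount * nDev elements per rank.
const size_t recvcount = count / nDev;   // count assumed divisible by nDev
ncclGroupStart();
for (int i = 0; i < nDev; ++i)
  ncclReduceScatter(sendbuff[i], recvbuff[i], recvcount, ncclFloat, ncclSum,
                    comms[i], streams[i]);
ncclGroupEnd();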

View File

@ -1,115 +0,0 @@
/*************************************************************************
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_COLL_H_
#define COMMON_COLL_H_
#include "core.h"
static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
if (err != cudaSuccess || attr.devicePointer == NULL) {
WARN("%s : %s is not a valid pointer\n", opname, ptrname);
return ncclInvalidDevicePointer;
}
if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
WARN("%s : %s allocated on device %d mismatchs with NCCL device %d \n", opname, ptrname, attr.device, comm->cudaDev);
return ncclInvalidDevicePointer;
}
return ncclSuccess;
}
static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
if (ptr == NULL) {
WARN("%s : %s argument is NULL", opname, ptrname);
return ncclInvalidArgument;
}
return ncclSuccess;
}
static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, int count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
NCCLCHECK(PtrCheck(comm, opname, "comm"));
// First, the easy ones
if (root < 0 || root >= comm->nRanks) {
WARN("%s : invalid root %d (root should be in the 0..%d range)\n", opname, root, comm->nRanks);
return ncclInvalidRank;
}
if (type < 0 || type >= nccl_NUM_TYPES) {
WARN("%s : invalid type %d\n", opname, type);
return ncclInvalidType;
}
if (op < 0 || op >= nccl_NUM_OPS) {
WARN("%s : invalid reduction operation %d\n", opname, op);
return ncclInvalidOperation;
}
if (count < 0) {
WARN("%s : invalid count %d\n", opname, count);
return ncclInvalidArgument;
}
// Check pointers
NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname))
if (strcmp(opname, "Reduce") == 0 && comm->rank != root) {
// No need to check recvbuff pointer for non-root reduce
return ncclSuccess;
}
NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname))
return ncclSuccess;
}
// Kernel launch
template<typename T>
struct KernelArgs {
// general parameters
int nRanks;
int root;
int buffSize;
int N;
int opIndex;
volatile int * __restrict__ opCounter;
int * __restrict__ doneCount;
bool pushrecv;
// some pre-computed sizes
int SliceSize;
int SliceOffset;
int ChunkSize;
int NumChunks;
// local and remote input, output, and buffer
const T * __restrict__ ThisInput;
T * __restrict__ ThisOutput;
DevRing<char>* ring;
};
template<typename T>
void ArgsSetup(KernelArgs<T> *args, const void* sendbuff, void* recvbuff,
const int root, const int count, ncclComm *comm) {
args->nRanks = comm->nRanks;
args->root = root;
args->buffSize = comm->buffSize;
args->N = count;
args->opIndex = comm->opSched;
args->opCounter = comm->opCounter;
args->ThisInput = (const T*)sendbuff;
args->ThisOutput = (T*)recvbuff;
args->ring = comm->devRing;
args->pushrecv = comm->globalMemSpace;
}
#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \
args, stream) do { \
dim3 grid(1, 1, 1); \
dim3 block(THREADS+1, 1, 1); \
void* argptrs[] = {&args}; \
CUDACHECK(cudaLaunchKernel( \
(void*)K<THREADS, UNROLL, FUNC, T>, \
grid, block, argptrs, 0, stream), ncclUnhandledCudaError); \
} while (0)
#endif

View File

@ -1,362 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_KERNEL_H_
#define COMMON_KERNEL_H_
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>
// BAR macro and helpers
#define WARP_SIZE 32
#define ROUNDUP(x, y) \
(((((x) + (y) - 1) / (y))) * (y))
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define BAR_EXEC(type, barid, nthreads) \
asm("bar." #type " " #barid ", " #nthreads ";\n\t")
#define BAR_EXPAND(type, barid, nthreads) \
BAR_EXEC(type, barid, (nthreads))
// Named barrier macro.
// Expands to asm("bar.type barid, nthreads") where
// nthreads has been rounded up to WARP_SIZE.
#define BAR(type, barid, nthreads) \
BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
}
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#ifdef CUDA_HAS_HALF
#if CUDART_VERSION < 9000
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r.x = ptr->x;
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
ptr->x = val.x;
}
#else
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
return *((half*)ptr);
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
*((half*)ptr) = val;
}
#endif
#endif
__device__ unsigned int spinct;
// Spin wait until func evaluates to true
template<typename FUNC>
__device__ inline void Wait(const FUNC& func) {
while (!func()) {
// waste time
atomicInc(&spinct, 10);
}
}
typedef uint64_t PackType;
// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const;
};
template<class FUNC>
struct MULTI<FUNC, char> {
static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
"PackType must be twice the size of uint32_t.");
union converter {
PackType storage;
struct {
uint32_t a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
// for char, we do these as vector ops
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, int> {
static_assert(sizeof(PackType) == 2 * sizeof(int),
"PackType must be twice the size of int.");
union converter {
PackType storage;
struct {
int a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
#ifdef CUDA_HAS_HALF
template<class FUNC>
struct MULTI<FUNC, half> {
static_assert(sizeof(PackType) == 4 * sizeof(half),
"PackType must be four times the size of half.");
struct PackHalf2 {
half2 a, b;
};
__device__ PackType operator()(const PackType x, const PackType y) const {
struct PackHalf2 cx, cy, cr;
cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return *(reinterpret_cast<PackType*>(&cr));
}
};
#endif
template<class FUNC>
struct MULTI<FUNC, float> {
static_assert(sizeof(PackType) == 2 * sizeof(float),
"PackType must be twice the size of float.");
union converter {
PackType storage;
struct {
float a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
template<class FUNC>
struct MULTI<FUNC, double> {
static_assert(sizeof(PackType) == sizeof(double),
"PackType must be the same size as double.");
__device__ PackType operator()(const PackType x, const PackType y) const {
double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
return __double_as_longlong(rv);
}
};
template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
static_assert(sizeof(PackType) == sizeof(unsigned long long),
"PackType must be the same size as unsigned long long.");
__device__ PackType operator()(const PackType x, const PackType y) const {
unsigned long long rv = FUNC()(x, y);
return rv;
}
};
template<class FUNC>
struct MULTI<FUNC, long long> {
static_assert(sizeof(PackType) == sizeof(long long),
"PackType must be the same size as long long.");
__device__ PackType operator()(const PackType x, const PackType y) const {
long long rv = FUNC()((long long)x, (long long)y);
return rv;
}
};
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__device__ inline void ReduceCopy(
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int idx) {
T val = vFetch(src0+idx);
if (TWO_INPUTS) {
val = FUNC()(val, vFetch(src1+idx));
}
vStore(dest0+idx, val);
if (TWO_OUTPUTS) {
vStore(dest1+idx, val);
}
}
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL, int THREADS>
__device__ inline void ReduceCopy64b(
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int offset) {
PackType t0[UNROLL];
PackType t1[UNROLL];
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
int idx = offset + u*THREADS;
t0[u] = (reinterpret_cast<const volatile PackType *>(src0))[idx];
if (TWO_INPUTS) {
t1[u] = (reinterpret_cast<const volatile PackType *>(src1))[idx];
}
}
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
int idx = offset + u*THREADS;
PackType val = TWO_INPUTS ? MULTI<FUNC, T>()(t0[u], t1[u]) : t0[u];
(reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
if (TWO_OUTPUTS) {
(reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
}
}
}
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
size_t ptrval = reinterpret_cast<size_t>(ptr);
return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid,
volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
int N) {
if (N<=0) {
return;
}
int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0;
// stage 0: check if we'll be able to use the fast, 64-bit aligned path.
// If not, we'll just use the slow preamble path for the whole operation
bool alignable = (((AlignUp(src0, alignof(PackType)) == src0 + Npreamble)) &&
(!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
(!HAS_SRC1 || (AlignUp(src1, alignof(PackType)) == src1 + Npreamble)));
if (!alignable) {
Npreamble = N;
}
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
for (int idx = tid; idx < Npreamble; idx += THREADS) {
// ought to be no way this is ever more than one iteration, except when
// alignable is false
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
}
// stage 2: fast path: use 64b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 64-bit alignable.
if (alignable) {
const int PackFactor = sizeof(PackType) / sizeof(T);
int Nrem = N - Npreamble;
dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
// stage 2a: main loop
int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS))
* (UNROLL * THREADS); // round down
#pragma unroll 1 // don't unroll this loop
for (int idx = tid; idx < Nalign2a; idx += UNROLL * THREADS) {
ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL, THREADS>(src0, src1, dest0, dest1, idx);
}
int Ndone2a = Nalign2a * PackFactor;
Nrem -= Ndone2a;
// stage 2b: slightly less optimized for section when we don't have full
// UNROLLs
int Nalign2b = Nrem / PackFactor;
#pragma unroll 4
for (int idx = Nalign2a + tid; idx < Nalign2a + Nalign2b; idx += THREADS) {
ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, 1, 0>(src0, src1, dest0, dest1, idx);
}
int Ndone2b = Nalign2b * PackFactor;
Nrem -= Ndone2b;
int Ndone2 = Ndone2a + Ndone2b;
dest0 += Ndone2; if (HAS_DEST1) { dest1 += Ndone2; }
src0 += Ndone2; if (HAS_SRC1) { src1 += Ndone2; }
// stage 2c: tail
for (int idx = tid; idx < Nrem; idx += THREADS) {
// never ought to make it more than one time through this loop. only a
// few threads should even participate
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
}
} // done fast path
}
template <typename T>
__device__ inline void incrementOpCounter(const KernelArgs<T> *args) {
// increment comm's operation counts
__threadfence_system(); // Technically need to ensure that cleared flags
// are visible before incrementing op counter.
*args->opCounter = args->opIndex+1;
}
template <int THREADS, typename T> __device__ __forceinline__
void LoadRing(const DevRing<char>* src, DevRing<T>* dst) {
enum { NUM_WORDS = sizeof(DevRing<char>) / sizeof(long long) };
static_assert(sizeof(DevRing<char>) % sizeof(long long) == 0, "Bad alignment");
static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing");
static_assert(sizeof(DevRing<char>) == sizeof(DevRing<T>), "DevRing size mismatch");
long long* lldst = reinterpret_cast<long long*>(dst);
const long long* llsrc = reinterpret_cast<const long long*>(src);
if (threadIdx.x < NUM_WORDS) {
lldst[threadIdx.x] = llsrc[threadIdx.x];
}
}
#endif // COMMON_KERNEL_H_

View File

@ -1,55 +0,0 @@
/*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COPY_KERNEL_H_
#define COPY_KERNEL_H_
#include "common_kernel.h"
template<typename T>
struct FuncPassA {
__device__ T operator()(const T x, const T y) const {
return x;
}
};
#ifdef CUDA_HAS_HALF
template <>
struct FuncPassA<half> {
__device__ half2 operator()(const half2 x, const half2 y) const {
return x;
}
__device__ half operator()(const half x, const half y) const {
return x;
}
};
#endif
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, typename T>
__device__ void Copy(volatile T * __restrict__ const dest,
const volatile T * __restrict__ const src, const int N) {
ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, false, false>(threadIdx.x,
dest, nullptr, src, nullptr, N);
}
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, typename T>
__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1,
const volatile T * __restrict__ const src, const int N) {
ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, true, false>(threadIdx.x,
dest0, dest1, src, nullptr, N);
}
#endif // COPY_KERNEL_H_

File diff suppressed because it is too large

View File

@ -1,162 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef CORE_H_
#define CORE_H_
#include "nccl.h"
#include <cstdio>
#include <cuda_runtime.h>
#define MAXRANKS 32
#define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21)
#define NCCL_MEM_PAD_ALIGN 65536
struct ncclMem {
union { // Pad this block so that devBuff is correctly aligned
struct {
int flags[2];
void* recvPtrs;
int opCounter; // Used to determine when remote Communicators are ready.
// Only used in host memory.
};
char pad[NCCL_MEM_PAD_ALIGN];
};
// devBuff will be bigger ; we only use its offset/address.
char buff[1];
};
template <typename T>
struct alignas(long long) DevRing {
volatile int* __restrict__ prevOpCounter;
volatile int* __restrict__ nextOpCounter;
volatile int* __restrict__ sendFlagToNext;
volatile int* __restrict__ sendFlagToPrev;
volatile int* __restrict__ recvFlagFromNext;
volatile int* __restrict__ recvFlagFromPrev;
T* volatile * __restrict__ recvPtrFromNext;
T* volatile * __restrict__ sendPtrToPrev;
T* __restrict__ recvBuffer;
T* __restrict__ sendBuffer;
int userRank[MAXRANKS];
};
struct NodeRef {
ncclMem* remote; // TODO: Verify if these
ncclMem* local; // are still needed.
enum {DEVICE, HOST} type;
ncclMem* devCleanup; // Used only when remote comm uses same process & GPU
ncclMem* hostCleanup; // Used whenever target is in different process
int* opCounter; // TODO: see if this can be removed too.
};
struct ncclComm {
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
// Device and Host allocated chunks. Stored here to correctly free() memory.
ncclMem* devMem;
ncclMem* hostMem;
int hostMemState;
int opSched; // Scheduling operation index
int* opCounter; // Counter of completed operations
cudaStream_t prevStream; // cache last used stream
cudaEvent_t doneEvent; // orders operations in different streams
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userFromRing;
// copy of the above stored on each device
int* devUserFromRing;
// Ring order
int* ncclFromRing; // TODO: REMOVE IF NOT NEEDED BEYOND CORE.CU
// Size of temp buffer in bytes.
size_t buffSize;
// Whether we have remote access to the recvbuff pointers passed from remote
// GPUs. In single process mode this can be used as long as QPI links are
// not present. In multi-process, we never push to a remote recvbuff.
int globalMemSpace;
// Device copy of the communicator
struct ncclComm *devComm; // TODO: Remove this if not useful
// Device-side ring view
DevRing<char>* devRing;
// Device-to-device communication structures to access remote or local device
// memory. Actual allocation larger than 1.
NodeRef ptrs[1];
};
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;
extern DebugLevel ncclDebugLevel;
#define WARN(...) do { \
if (ncclDebugLevel >= WARN) { \
printf("WARN %s:%d ", __FILE__, __LINE__); \
printf(__VA_ARGS__); \
printf("\n"); \
fflush(stdout); \
if (ncclDebugLevel >= ABORT) abort(); \
} \
} while(0)
#define INFO(...) do { \
if (ncclDebugLevel >= INFO) { \
printf("INFO "); printf(__VA_ARGS__); printf("\n"); \
fflush(stdout); \
} \
} while(0)
// Check CUDA calls
#define CUDACHECK(cmd, retcode) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'\n", cudaGetErrorString(e)); \
return retcode; \
} \
} while(false)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
return res; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((weak)) \
ret func(args)
#else
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
ret func(args)
#endif // end PROFAPI
#endif // end include guard


@@ -1,112 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef enqueue_h_
#define enqueue_h_
#include "core.h"
#include "reduce_kernel.h"
/* Synchronize previous collective (if in different stream) and enqueue
* collective. Work is performed asynchronously with the host thread.
* The ColFunc class should be templated on the datatype and reduction
* operator (if applicable) and define a static entry() method as
* follows.
* template <typename T, template <typename> class RedOp>
* class CollectiveFunctor {
* public:
* static ncclResult_t entry(const void* sendbuff, void* recvbuff, int count,
* int root, ncclComm* comm, cudaStream_t stream);
* };
* The entry() method can assume that the appropriate cuda device has been set. */
template< template<typename, template<typename> class> class ColFunc,
typename T,
template<typename> class Op >
ncclResult_t enqueue(const void* sendbuff,
void* recvbuff,
int count,
int root,
ncclComm_t comm,
cudaStream_t stream)
{
if (stream != comm->prevStream) { // sync required for calls in different streams
comm->prevStream = stream;
CUDACHECK(cudaStreamWaitEvent(stream, comm->doneEvent, 0), ncclUnhandledCudaError);
}
ncclResult_t ret;
ret = ColFunc<T, Op>::entry(sendbuff, recvbuff, count, root, comm, stream);
// Always have to record done event because we don't know what stream next
// collective will be in.
CUDACHECK(cudaEventRecord(comm->doneEvent, stream), ncclUnhandledCudaError);
comm->opSched += 1;
return ret;
}
// This version decodes type
template< template<typename, template<typename> class> class ColFunc,
template<typename> class Op >
ncclResult_t enqueue(const void* sendbuff,
void* recvbuff,
int count,
ncclDataType_t type,
int root,
ncclComm_t comm,
cudaStream_t stream)
{
switch(type) {
case ncclChar:
return enqueue<ColFunc, char, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclInt:
return enqueue<ColFunc, int, Op>(sendbuff, recvbuff, count, root, comm, stream);
#ifdef CUDA_HAS_HALF
case ncclHalf:
return enqueue<ColFunc, half, Op>(sendbuff, recvbuff, count, root, comm, stream);
#endif
case ncclFloat:
return enqueue<ColFunc, float, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclDouble:
return enqueue<ColFunc, double, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclInt64:
return enqueue<ColFunc, long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
case ncclUint64:
return enqueue<ColFunc, unsigned long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
default:
WARN("Invalid ncclType %d", type);
return ncclInvalidType;
}
}
// This version decodes both type and reduction op
template< template<typename, template<typename> class> class ColFunc>
ncclResult_t enqueue(const void* sendbuff,
void* recvbuff,
int count,
ncclDataType_t type,
ncclRedOp_t op,
int root,
ncclComm_t comm,
cudaStream_t stream)
{
switch(op) {
case ncclSum:
return enqueue<ColFunc, FuncSum>(sendbuff, recvbuff, count, type, root, comm, stream);
case ncclProd:
return enqueue<ColFunc, FuncProd>(sendbuff, recvbuff, count, type, root, comm, stream);
case ncclMax:
return enqueue<ColFunc, FuncMax>(sendbuff, recvbuff, count, type, root, comm, stream);
case ncclMin:
return enqueue<ColFunc, FuncMin>(sendbuff, recvbuff, count, type, root, comm, stream);
default:
WARN("Invalid ncclRedOp: %d", op);
return ncclInvalidOperation;
}
}
#endif // End include guard

18
src/include/bootstrap.h Normal file

@@ -0,0 +1,18 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_BOOTSTRAP_H_
#define NCCL_BOOTSTRAP_H_
#include "nccl.h"
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapRingExchange(void* commState, void* prevNextData, int prev, int next, int size);
ncclResult_t bootstrapClose(void* commState);
#endif
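
The bootstrap layer is only declared here; below is a hedged sketch of how an init path might drive it. The helper name exampleBootstrap, the 64-rank cap, and the assumption that bootstrapAllGather operates on nranks consecutive slots of 'size' bytes (each rank having filled its own slot beforehand) are illustrative, not taken from this header.

// Hypothetical usage sketch only; NCCLCHECK/CUDACHECK come from core.h.
#include "core.h"
#include "bootstrap.h"

static ncclResult_t exampleBootstrap(ncclUniqueId* id, int rank, int nranks) {
  void* state;
  NCCLCHECK(bootstrapInit(id, rank, nranks, &state));   // id produced earlier via bootstrapGetUniqueId on one rank

  int devs[64];                                         // assumes nranks <= 64 for this sketch
  CUDACHECK(cudaGetDevice(&devs[rank]));                // each rank fills its own slot...
  NCCLCHECK(bootstrapAllGather(state, devs, sizeof(int))); // ...and the all-gather fills the others (assumed layout)

  NCCLCHECK(bootstrapClose(state));
  return ncclSuccess;
}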

195
src/include/common_coll.h Normal file

@@ -0,0 +1,195 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_COLL_H_
#define COMMON_COLL_H_
#include "core.h"
#include "enqueue.h"
#include "collectives/collectives.h"
static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
if (err != cudaSuccess || attr.devicePointer == NULL) {
WARN("%s : %s is not a valid pointer", opname, ptrname);
return ncclInvalidArgument;
}
#if __CUDACC_VER_MAJOR__ >= 10
if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#else
if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#endif
WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
return ncclInvalidArgument;
}
return ncclSuccess;
}
static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
if (ptr == NULL) {
WARN("%s : %s argument is NULL", opname, ptrname);
return ncclInvalidArgument;
}
return ncclSuccess;
}
static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
NCCLCHECK(PtrCheck(comm, opname, "comm"));
// First, the easy ones
if (root < 0 || root >= comm->nRanks) {
WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
return ncclInvalidArgument;
}
if (type < 0 || type >= ncclNumTypes) {
WARN("%s : invalid type %d", opname, type);
return ncclInvalidArgument;
}
if (op < 0 || op >= ncclNumOps) {
WARN("%s : invalid reduction operation %d", opname, op);
return ncclInvalidArgument;
}
if (comm->checkPointers) {
// Check CUDA device pointers
if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
}
if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
}
}
return ncclSuccess;
}
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
// In : comm, nbytes ; Out : nrings, nthreads, ll
// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
// This ensures we don't use a large number of rings with a small number of threads
// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
// we use NCCL_THREAD_THRESHOLD when we reach the max
// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
*ll = 0;
int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
if (comm->llThreshold >= 0) { /* user sets total LL threshold */
if (nbytes > comm->llThreshold) { /* non-LL */
*nthreads = comm->nThreads+1;
*nrings = comm->nRings;
return;
} else {
llEnforced = 1; /* user wants to use LL */
}
}
int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
size_t nr;
int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
while (nt < ll_max_nthreads && *ll == 0) {
nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
if (nr <= maxRings) { /* avoid using few threads but many rings */
nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
*ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
}
if (*ll == 0) {
nt = nt << 1;
}
}
if (*ll == 1) {
*nthreads = nt;
*nrings = (int)nr;
return; /* we can use smaller number of threads to make LL work, stop here */
}
nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
*ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
*nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
*nrings = *ll ? (int)nr : comm->nRings;
}
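
A minimal, self-contained sketch of the thread-doubling search above follows. The constants kMaxRings and kPerThread are hypothetical stand-ins for the comm-dependent limits (comm->nRings, NCCL_LL_RING_THRESHOLD), and the final comparison against comm->threadThreshold is omitted for brevity; this is an illustration, not the shipped heuristic.

#include <cstdio>
#include <cstddef>

static void llHeuristicSketch(size_t nbytes, int nranks) {
  const int kMinThreads = 64, kMaxThreads = 256;  // NCCL_LL_MIN/MAX_NTHREADS
  const size_t kPerThread = 8;                    // per-thread LL threshold (NCCL_LL_RING_THRESHOLD)
  const size_t kMaxRings = 4;                     // hypothetical ring budget
  for (int nt = kMinThreads; nt <= kMaxThreads; nt <<= 1) {
    size_t nr = (nbytes + kPerThread*nt*nranks - 1) / (kPerThread*nt*nranks);  // DIVUP
    if (nr == 0) nr = 1;
    if (nr <= kMaxRings) {                        // size fits LL at this thread count
      printf("%zu bytes -> LL, %zu ring(s), %d thread(s)\n", nbytes, nr, nt);
      return;
    }
  }
  printf("%zu bytes -> regular (non-LL) protocol\n", nbytes);
}

With these stand-in values, 64 KiB across 8 ranks would need 16 rings at 64 threads and 8 at 128, but only 4 at 256, so the sketch settles on LL with 4 rings and 256 threads, mirroring the "double the threads before adding rings" intent of the comment above.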
static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {
int llMode, nBlocks, nThreads;
ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
if (comm->userStreamSet == false) {
comm->userStream = stream;
comm->userStreamSet = true;
} else if (stream != comm->userStream) {
WARN("Error : mixing different streams within a group call is not supported.");
return ncclInvalidUsage;
}
int lastChunkSize = 0;
if (llMode == 1) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
}
for (int bid=0; bid<nBlocks; bid++) {
struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
if (ring->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
comm->myParams->gridDim.x++;
int opIndex = ring->collFifoTail;
struct ncclColl* c = ring->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (activePtr[0] != 0) sched_yield();
struct CollectiveArgs* args = &c->args;
args->root = root;
args->N = count;
args->ThisInput = sendbuff;
args->ThisOutput = recvbuff;
args->comm = comm->devComm;
args->opCount = comm->opCount;
args->bid = bid;
args->nRings = nBlocks;
args->nThreads = nThreads;
args->lastChunkSize = lastChunkSize;
c->nThreads = nThreads;
c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
ring->collFifoTail = opIndex;
ring->collCount++;
}
/*if (llMode == 0)*/ comm->opCount++;
return ncclSuccess;
}
extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
#endif

385
src/include/core.h Normal file

@@ -0,0 +1,385 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
#include "nccl.h"
#include "transport.h"
#include "debug.h"
#include <cstdio>
#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#if __CUDACC_VER_MAJOR__ < 9
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXRINGS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
// Rings / LL tuning
#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 32 // Per thread size before we switch to non-LL
#define NCCL_LL_MAX_NTHREADS 256
#define NCCL_LL_MIN_NTHREADS 64
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
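
As a hedged illustration of the comment above, the device functions below (hypothetical names, not part of this header) show why each 32-bit datum is packed next to its flag: a reader that spins on both flags can never observe a flag without its data, provided 8-byte stores are performed atomically and the platform is little-endian, which are the assumptions this sketch makes.

// Sketch only: relies on the ncclLLFifoLine definition above and little-endian layout.
__device__ void llWriteLine(union ncclLLFifoLine* line, uint32_t d1, uint32_t d2, uint32_t flag) {
  volatile uint64_t* p = line->v;
  p[0] = (uint64_t)d1 | ((uint64_t)flag << 32);   // data1+flag1 in a single 8-byte store
  p[1] = (uint64_t)d2 | ((uint64_t)flag << 32);   // data2+flag2 in a single 8-byte store
}

__device__ void llReadLine(union ncclLLFifoLine* line, uint32_t flag, uint32_t* d1, uint32_t* d2) {
  volatile uint64_t* p = line->v;
  uint64_t v0, v1;
  do {                                            // spin until both halves carry the expected flag
    v0 = p[0]; v1 = p[1];
  } while ((uint32_t)(v0 >> 32) != flag || (uint32_t)(v1 >> 32) != flag);
  *d1 = (uint32_t)v0;
  *d2 = (uint32_t)v1;
}

The flag would typically carry a step or sequence number (compare the llStep field below), so stale lines left over from a previous slice are skipped rather than re-read.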
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCount; // Local for recv, remote for send
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
// Low latency mechanism
char *llBuff; // Local for recv, remote for send
uint64_t *llHead; // Local for send, remote for recv
int *llFifo; // LL Size fifo for proxy
uint64_t llStep; // Keep where we are
uint64_t llLastCleaning;
};
struct ncclConnector {
struct transportProxyInfo* proxyInfo;
struct ncclTransport* transport;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
};
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define SIZES_FIFO_SIZE 32
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
#define NCCL_LL_CHUNKS 8
#define NUM_LINES_PER_THREAD 2
#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 64K
#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t llHead;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[SIZES_FIFO_SIZE];
int llSizesFifo[SIZES_FIFO_SIZE];
};
char pad5[MEM_ALIGN];
};
char llBuff[NCCL_LL_BUFF_SIZE];
char buff[1]; // Actually larger than that
};
struct ncclRing {
union {
struct {
int id;
int nthreads;
// Per ring resources
struct ncclSendMem* devMemSend; // CUDA-side resources
struct ncclRecvMem* devMemRecv; // CUDA-side resources
int buffSize;
int devMemSendSize; // Keep the size for IPCs
int devMemRecvSize; // Keep the size for IPCs
struct ncclConnector send;
struct ncclConnector recv;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nRings;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t nThreads;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclComm {
struct ncclRing rings[MAXRINGS];
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Rings for collectives
int nRings;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
// Device copy of the communicator
struct ncclComm *devComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
};
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int ret = -1; \
while (ret == -1) { \
SYSCHECKVAL(call, name, ret); \
if (ret == -1) { \
INFO(ALL,"Got %s, retrying", strerror(errno)); \
}\
} \
} while (0);
#define SYSCHECKVAL(call, name, retval) do { \
retval = call; \
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define SYSCHECKNTIMES(call, name, times, usec, exptype) do { \
int ret = -1; \
int count = 0; \
while (ret == -1 && count < times) { \
SYSCHECKVALEXP(call, name, ret, exptype); \
count++; \
if (ret == -1) { \
usleep(usec); \
}\
} \
if (ret == -1) { \
WARN("Call to " name " timeout : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define SYSCHECKVALEXP(call, name, retval, exptype) do { \
retval = call; \
if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && errno != exptype) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
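
To make the intended call pattern concrete, here is a hypothetical helper (not part of this header) built on the macros above; it assumes <fcntl.h> for open() and, for brevity, lets error paths skip close().

static ncclResult_t exampleReadFile(const char* path, char* buf, size_t len) {
  int fd, n;
  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);  // captures the fd, returns ncclSystemError on hard failure
  SYSCHECKVAL(read(fd, buf, len), "read", n);     // same pattern for the byte count
  SYSCHECK(close(fd), "close");                   // retrying variant when no value is needed
  return ncclSuccess;
}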
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((weak)) \
ret func(args)
#else
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
ret func(args)
#endif // end PROFAPI
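
The expansion is easiest to see on a concrete entry point. ncclFoo below is hypothetical and used only to illustrate how a PROFAPI build gets a strong pnccl* alias next to a weak, interposable public symbol.

NCCL_API(ncclResult_t, ncclFoo, ncclComm_t comm);
ncclResult_t ncclFoo(ncclComm_t comm) {
  (void)comm;
  return ncclSuccess;
}
// With PROFAPI defined, the NCCL_API line expands (roughly) to:
//   __attribute__((visibility("default"))) __attribute__((alias("ncclFoo")))
//     ncclResult_t pncclFoo(ncclComm_t comm);
//   extern "C" __attribute__((visibility("default"))) __attribute__((weak))
//     ncclResult_t ncclFoo(ncclComm_t comm);
// so a profiling shim can override the weak ncclFoo and still reach the library through pncclFoo.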
int ncclCudaCompCap();
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
return ncclSuccess;
}
#endif // end include guard
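
A short, hypothetical helper showing how the allocation wrappers and NCCLCHECKGOTO above are meant to compose; the function name and the cleanup policy are illustrative only.

static ncclResult_t exampleMirrorToDevice(const int* src, int** devDst, size_t nelem) {
  ncclResult_t res = ncclSuccess;
  int* host = NULL;
  *devDst = NULL;
  NCCLCHECKGOTO(ncclCalloc(&host, nelem), res, cleanup);          // zeroed host staging buffer
  memcpy(host, src, nelem*sizeof(int));
  NCCLCHECKGOTO(ncclCudaCalloc(devDst, nelem), res, cleanup);     // zeroed device buffer
  NCCLCHECKGOTO(ncclCudaMemcpy(*devDst, host, nelem), res, cleanup);
  free(host);
  return ncclSuccess;
cleanup:
  free(host);                     // free(NULL) is a no-op
  if (*devDst) cudaFree(*devDst);
  *devDst = NULL;
  return res;
}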

179
src/include/debug.h Normal file

@@ -0,0 +1,179 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#include <pthread.h>
#include <stdio.h>
#include <chrono>
#include <unistd.h>
#include <sys/syscall.h>
#include <limits.h>
#include <string.h>
#include "nccl.h"
#define gettid() (pid_t) syscall(SYS_gettid)
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
extern DebugLevel ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
#define WARN(...) do { \
if (ncclDebugLevel >= WARN) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
fprintf(ncclDebugFile,__VA_ARGS__); \
fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
} \
} while(0)
#define INFO(FLAGS, ...) do { \
if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
} \
} while(0)
#ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) do { \
if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
int cudaDev; \
cudaGetDevice(&cudaDev); \
pthread_mutex_lock(&ncclDebugOutputLock); \
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
fflush(ncclDebugFile); \
pthread_mutex_unlock(&ncclDebugOutputLock); \
} \
} while(0)
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#else
#define TRACE(...)
#endif
#include <stdlib.h>
static inline void initDebug() {
const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) {
ncclDebugLevel = NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
ncclDebugLevel = WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
ncclDebugLevel = INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
ncclDebugLevel = ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
ncclDebugLevel = TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
if (nccl_debug_subsys != NULL) {
char *subsys = strtok(nccl_debug_subsys, ",");
while (subsys != NULL) {
int invert = 0;
uint64_t mask = 0;
if (subsys[0] == '^') { invert = 1; subsys++; }
if (strcasecmp(subsys, "INIT") == 0) {
mask = INIT;
} else if (strcasecmp(subsys, "COLL") == 0) {
mask = COLL;
} else if (strcasecmp(subsys, "P2P") == 0) {
mask = P2P;
} else if (strcasecmp(subsys, "SHM") == 0) {
mask = SHM;
} else if (strcasecmp(subsys, "NET") == 0) {
mask = NET;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = ALL;
}
if (mask) {
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
}
subsys = strtok(NULL, ",");
}
}
/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
int c = 0;
char debug_fn[PATH_MAX+1] = "";
char *dfn = debug_fn;
while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
if (nccl_debug_file[c++] != '%') {
*dfn++ = nccl_debug_file[c-1];
continue;
}
switch (nccl_debug_file[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024);
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = nccl_debug_file[c-1];
break;
}
}
*dfn = '\0';
if (debug_fn[0] != '\0') {
FILE *file = fopen(debug_fn, "w");
if (file != NULL) {
INFO(ALL,"DEBUG file is '%s'", debug_fn);
ncclDebugFile = file;
}
}
}
pthread_mutex_init(&ncclDebugOutputLock, NULL);
#ifdef ENABLE_TRACE
ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
}
#endif
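
Typical use, as a hedged sketch: the level comes from NCCL_DEBUG, the subsystem mask from NCCL_DEBUG_SUBSYS, and NCCL_DEBUG_FILE may embed %h/%p as parsed above, e.g. NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,NET NCCL_DEBUG_FILE=nccl.%h.%p.log. The helper below is hypothetical.

static void exampleLogging(int dev) {
  initDebug();                                   // parses NCCL_DEBUG, NCCL_DEBUG_SUBSYS, NCCL_DEBUG_FILE
  INFO(INIT|NET, "using device %d", dev);        // printed only if INIT or NET is in the mask
  if (dev < 0) WARN("invalid device %d", dev);   // printed whenever the level is WARN or higher
}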

26
src/include/enqueue.h Normal file

@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
#include "core.h"
#include "group.h"
typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
ncclResult_t ncclBarrierEnqueue(ncclComm_t comm);
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm);
ncclResult_t ncclEnqueueEvents(ncclComm_t comm);
#endif // End include guard
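
As a sketch of how a collective entry point is expected to route through ncclEnqueueCheck, the pair below uses a hypothetical collective "ExampleColl"; the real work of building a ncclColl entry (see saveKernel in common_coll.h) is elided, and the division of labor described in the comments is an assumption based on the declarations above.

static ncclResult_t ncclExampleCollImpl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  // ...would compute nbytes and hand the operation to saveKernel() here...
  return ncclSuccess;
}

ncclResult_t ncclExampleColl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  // ncclEnqueueCheck is the common entry path; argument checks and group handling
  // are assumed to live there before the impl callback runs.
  return ncclEnqueueCheck(ncclExampleCollImpl, "ExampleColl", sendbuff, recvbuff,
      count, type, op, root, comm, stream);
}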

24
src/include/group.h Normal file

@@ -0,0 +1,24 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GROUP_H_
#define NCCL_GROUP_H_
#include "nccl.h"
#include "core.h"
bool ncclAsyncMode();
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAsyncColl(ncclComm_t comm);
#endif

1109
src/include/ibvwrap.h Normal file

File diff suppressed because it is too large

64
src/include/nccl_net.h Normal file

@@ -0,0 +1,64 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NET_H_
#define NCCL_NET_H_
#include "nccl.h"
#define NCCL_NET_MAJOR 1
#define NCCL_NET_MINOR 0
#define NCCL_NET_HANDLE_MAXSIZE 64
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_MAX_SCORE 0x7
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Return the number of network devices along with their scores relative to the
// current CUDA device. The per device score should be a value from 1-7 with a
// higher score representing a better choice for performance.
// This call should allocate the 'scores' array using malloc(3), and it
// will then be freed automatically by NCCL.
ncclResult_t (*devices)(int* ndev, int** scores);
// Return whether this device supports host pointers and/or CUDA pointers
// as data from the current GPU. Supported types should be composed with
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size);
// Test whether a request is complete and return the size received (can be less than requested).
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_t;
extern
#ifdef __cplusplus
"C"
#endif
ncclNet_t* ncclNet;
#endif // end include guard
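
A skeleton of what an implementation of this interface could look like follows. All names here (exampleNet, exampleDevices, examplePtrSupport) are hypothetical; only the ncclNet_t layout and the NCCL_PTR_*/NCCL_MAX_SCORE constants come from the header above. Only the first two callbacks are filled in; the rest are left NULL and would have to be implemented before the struct could actually be used.

#include <stdlib.h>
#include "nccl_net.h"

static ncclResult_t exampleDevices(int* ndev, int** scores) {
  *ndev = 1;
  *scores = (int*)malloc(sizeof(int));   // freed by NCCL, per the contract above
  if (*scores == NULL) return ncclSystemError;
  (*scores)[0] = NCCL_MAX_SCORE;
  return ncclSuccess;
}
static ncclResult_t examplePtrSupport(int dev, int* supportedTypes) {
  *supportedTypes = NCCL_PTR_HOST;       // host memory only in this sketch
  return ncclSuccess;
}
// listen/connect/accept/isend/irecv/flush/test/close* would follow the same pattern.
ncclNet_t exampleNet = {
  "Example",
  exampleDevices,
  examplePtrSupport,
  /* listen */ NULL, /* connect */ NULL, /* accept */ NULL,
  /* isend */ NULL, /* irecv */ NULL, /* flush */ NULL,
  /* test */ NULL, /* closeSend */ NULL, /* closeRecv */ NULL, /* closeListen */ NULL,
};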

40
src/include/net.h Normal file

@@ -0,0 +1,40 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INT_NET_H_
#define NCCL_INT_NET_H_
#include "nccl.h"
#include "nccl_net.h"
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
/* Socket Interface Selection type */
typedef enum { findSubnetIf = -1,
dontCareIf = -2
} ncclSocketIfSl_t;
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
extern bool ncclIbSupport();
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
#endif

155
src/include/nvlink.h Normal file

@@ -0,0 +1,155 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVLINK_H_
#define NCCL_NVLINK_H_
#include <sys/stat.h>
#include <fcntl.h>
#include "nvmlwrap.h"
#include "topo.h"
#define CONNECT_NVLINK 0x10
#define CONNECT_NVSWITCH 0x100
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
};
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
char* rPath = realpath(classPath, NULL);
int fd;
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
free(rPath);
char pciClass[9];
strncpy(pciClass, "0x000000", 9);
int len;
SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
SYSCHECK(close(fd), "close");
if (strcmp(pciClass, "0x068000") == 0) {
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
*type = ncclNvLinkDeviceSwitch;
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
*type = ncclNvLinkDeviceGpu;
} else {
// Ignore if we don't know what's on the other side.
return ncclSystemError;
}
return ncclSuccess;
}
/* Get the maximum number of NVLinks based on the GPU generation */
static ncclResult_t getMaxNvlinks(int* maxLinks) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
int ccMajor;
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
// 6 for Volta, 4 for Pascal
*maxLinks = (ccMajor > 6) ? 6 : 4;
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
return ncclSuccess;
}
static int getNvlinkGpu(const char* busId1, const char* busId2) {
// Determine if that connection is through NVLink
int links = 0;
int nvswitch_links = 0;
int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
nvmlDevice_t nvmlDev;
ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
if (res != ncclSuccess) return 0;
for(int l=0; l<maxNvLinks; ++l) {
// nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
// report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
// don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
// the POWER CPU case, so it seems best to check this as well.
unsigned canP2P;
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
// nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
// if the links don't exist, or are disabled. So checking for that return
// here would probably make the nvmlDeviceGetNvLinkCapability check above
// redundant. Presumably, we still need to check the P2P capability above,
// since even non-GPUs would possess PCI info.
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
// Old versions of NVML return a lowercase PCI ID
char* p = remoteProc.busId;
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
if (p[c] == 0) break;
p[c] = toupper(p[c]);
}
if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
links++;
} else {
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
char* p = remoteProc.busId;
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
if (p[c] == 0) break;
lowerId[c] = tolower(p[c]);
}
// Determine if the remote side is NVswitch
enum ncclNvLinkDeviceType type;
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
//TODO: we are making an assumption that all GPUs are connected to this switch
//This assumption may change for future architectures
nvswitch_links++;
}
}
}
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
}
static int getNumNvlinks(const char* busId) {
nvmlDevice_t nvmlDev;
ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
if (res != ncclSuccess) return 0;
int nvlinks = 0, nvswitch_links = 0;
int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
for(int l=0; l<maxNvLinks; ++l) {
unsigned canP2P;
nvmlEnableState_t isActive;
if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
nvlinks++;
} else {
continue;
}
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
char* p = remoteProc.busId;
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
if (p[c] == 0) break;
lowerId[c] = tolower(p[c]);
}
// Determine if the remote side is NVswitch
enum ncclNvLinkDeviceType type;
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
//TODO: we are making an assumption that all GPUs are connected to this switch
//This assumption may change for future architectures
nvswitch_links++;
}
}
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
}
#endif

149
src/include/nvmlwrap.h Normal file

@@ -0,0 +1,149 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVMLWRAP_H_
#define NCCL_NVMLWRAP_H_
#include "core.h"
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
#include "nvml.h"
#define NVMLCHECK(cmd) do { \
nvmlReturn_t e = cmd; \
if( e != NVML_SUCCESS ) { \
WARN("NVML failure '%s'", nvmlErrorString(e)); \
return ncclSystemError; \
} \
} while(false)
static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult) {
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
#else
// Dynamically handle dependencies on NVML
/* Extracted from nvml.h */
typedef struct nvmlDevice_st* nvmlDevice_t;
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
typedef enum nvmlEnableState_enum
{
NVML_FEATURE_DISABLED = 0, //!< Feature disabled
NVML_FEATURE_ENABLED = 1 //!< Feature enabled
} nvmlEnableState_t;
typedef enum nvmlNvLinkCapability_enum
{
NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported
NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
// should be last
NVML_NVLINK_CAP_COUNT
} nvmlNvLinkCapability_t;
typedef enum nvmlReturn_enum
{
NVML_SUCCESS = 0, //!< The operation was successful
NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
} nvmlReturn_t;
typedef struct nvmlPciInfo_st
{
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff
unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
unsigned int device; //!< The device's id on the bus, 0 to 31
unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
// Added in NVML 2.285 API
unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
// NVIDIA reserved for internal use only
unsigned int reserved0;
unsigned int reserved1;
unsigned int reserved2;
unsigned int reserved3;
} nvmlPciInfo_t;
/* End of nvml.h */
ncclResult_t wrapNvmlSymbols(void);
ncclResult_t wrapNvmlInit(void);
ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
#endif // NVML_DIRECT
#endif // End include guard

81
src/include/param.h Normal file

@@ -0,0 +1,81 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PARAM_H_
#define NCCL_PARAM_H_
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <pwd.h>
static const char* userHomeDir() {
struct passwd *pwUser = getpwuid(getuid());
return pwUser == NULL ? NULL : pwUser->pw_dir;
}
static void setEnvFile(const char* fileName) {
FILE * file = fopen(fileName, "r");
if (file == NULL) return;
char *line = NULL;
char envVar[1024];
char envValue[1024];
size_t n = 0;
ssize_t read;
while ((read = getline(&line, &n, file)) != -1) {
if (line[read-1] == '\n') line[read-1] = '\0';
int s=0; // Env Var Size
while (line[s] != '\0' && line[s] != '=') s++;
if (line[s] == '\0') continue;
strncpy(envVar, line, std::min(1024,s));
envVar[s] = '\0';
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);
}
static void initEnv() {
char confFilePath[1024];
const char * userDir = userHomeDir();
if (userDir) {
sprintf(confFilePath, "%s/.nccl.conf", userDir);
setEnvFile(confFilePath);
}
sprintf(confFilePath, "/etc/nccl.conf");
setEnvFile(confFilePath);
}
#define NCCL_PARAM(name, env, default_value) \
pthread_mutex_t ncclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
int64_t ncclParam##name() { \
static_assert(default_value != -1LL, "default value cannot be -1"); \
static int64_t value = -1LL; \
pthread_mutex_lock(&ncclParamMutex##name); \
if (value == -1LL) { \
value = default_value; \
char* str = getenv("NCCL_" env); \
if (str && strlen(str) > 0) { \
errno = 0; \
int64_t v = strtoll(str, NULL, 0); \
if (errno) { \
INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
} else { \
value = v; \
INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value); \
} \
} \
} \
pthread_mutex_unlock(&ncclParamMutex##name); \
return value; \
}
#endif
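
A hypothetical parameter definition makes the contract concrete; the name and default below are not taken from the code, and core.h is included first because the macro relies on INFO plus the pthread/errno headers it pulls in.

#include <stdint.h>
#include "core.h"
#include "param.h"

NCCL_PARAM(ExampleChunkSize, "EXAMPLE_CHUNK_SIZE", 1 << 20);  // reads NCCL_EXAMPLE_CHUNK_SIZE

void exampleUseParam() {
  int64_t chunk = ncclParamExampleChunkSize();  // first call parses the env var; later calls return the cached value
  (void)chunk;
}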

14
src/include/ring.h Normal file

@@ -0,0 +1,14 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_RING_H_
#define NCCL_RING_H_
#include "core.h"
ncclResult_t initRing(struct ncclComm* comm, int ringid);
ncclResult_t freeRing(struct ncclRing* ring);
#endif

17
src/include/rings.h Normal file

@@ -0,0 +1,17 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_RINGS_H_
#define NCCL_RINGS_H_
static int getDefaultThreads() {
// On Kepler, rings are doubled later.
return ncclCudaCompCap() == 3 ? 128 : 256;
}
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
#endif

76
src/include/shm.h Normal file

@@ -0,0 +1,76 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SHM_H_
#define NCCL_SHM_H_
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
*shmPtr = NULL;
int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1) {
WARN("shm_open failed to open %s : %s", shmname, strerror(errno));
return ncclSystemError;
}
if (create) {
int res = posix_fallocate(fd, 0, shmsize);
if (res != 0) {
WARN("Unable to allocate shared memory (%d bytes) : %s", shmsize, strerror(res));
shm_unlink(shmname);
close(fd);
return ncclSystemError;
}
}
void *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
close(fd);
if (ptr == MAP_FAILED) {
WARN("failure in mmap of %s (size %d) : %s", shmname, shmsize, strerror(errno));
shm_unlink(shmname);
return ncclSystemError;
}
if (create) {
memset(ptr, 0, shmsize);
}
cudaError_t e;
if ((e=cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped)) != cudaSuccess) {
WARN("failed to register host buffer %p : %s", ptr, cudaGetErrorString(e));
if (create) shm_unlink(shmname);
munmap(ptr, shmsize);
return ncclUnhandledCudaError;
}
if ((e=cudaHostGetDevicePointer(devShmPtr, ptr, 0)) != cudaSuccess) {
WARN("failed to get device pointer for local shmem %p : %s", ptr, cudaGetErrorString(e));
if (create) shm_unlink(shmname);
munmap(ptr, shmsize);
return ncclUnhandledCudaError;
}
*shmPtr = ptr;
return ncclSuccess;
}
static ncclResult_t shmUnlink(const char* shmname) {
if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
return ncclSuccess;
}
static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
CUDACHECK(cudaHostUnregister(shmPtr));
if (munmap(shmPtr, shmsize) != 0) {
WARN("munmap of shared memory failed");
return ncclSystemError;
}
return ncclSuccess;
}
#endif
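
A hedged usage sketch: two processes agree on a segment name (the one below is hypothetical), the creator opens with create=1 and the peer with create=0, and both unmap with shmClose. The sketch assumes the peer only calls shmOpen after the creator has, and that core.h (for NCCLCHECK) is included alongside this header.

static ncclResult_t exampleShmPair(int isCreator) {
  const char* name = "/nccl-example-shm";     // hypothetical name
  const int size = 1 << 20;
  void *hostPtr, *devPtr;
  NCCLCHECK(shmOpen(name, size, &hostPtr, &devPtr, isCreator));
  // ... exchange data through hostPtr (CPU side) or devPtr (mapped GPU pointer) ...
  NCCLCHECK(shmClose(hostPtr, devPtr, size));
  if (isCreator) NCCLCHECK(shmUnlink(name));  // remove the name once both sides are done
  return ncclSuccess;
}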

401
src/include/socket.h Normal file

@@ -0,0 +1,401 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SOCKET_H_
#define NCCL_SOCKET_H_
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <unistd.h>
#include <netdb.h>
#include <ifaddrs.h>
#include <net/if.h>
#include "utils.h"
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // sleep interval in usec
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
/* Format a string representation of a (struct sockaddr *) socket address using getnameinfo()
*
* Output: "IPv4/IPv6 address<port>"
*/
static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
if (buf == NULL || saddr == NULL) return NULL;
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
char host[NI_MAXHOST], service[NI_MAXSERV];
(void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
sprintf(buf, "%s<%s>", host, service);
return buf;
}
/* Allow the user to force the IPv4/IPv6 interface selection */
static inline int envSocketFamily(void) {
int family = -1; // Family selection is not forced, will use first one found
char* env = getenv("NCCL_SOCKET_FAMILY");
if (env == NULL)
return family;
if (strcmp(env, "AF_INET") == 0)
family = AF_INET; // IPv4
else if (strcmp(env, "AF_INET6") == 0)
family = AF_INET6; // IPv6
return family;
}
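/* Enumerate network interfaces matching a prefix list (a leading '^' negates the list),
 * optionally restricted to one address family. IPv6 loopback interfaces are skipped, and
 * an interface that shows up multiple times (e.g. with both an IPv4 and an IPv6 address)
 * is only reported once. Returns the number of interfaces stored in names/addrs. */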
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
char line[1024];
struct netIf userIfs[maxIfs];
bool searchNot = prefixList && prefixList[0] == '^';
int nUserIfs = parseStringList(prefixList, userIfs, maxIfs);
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
if (interface->ifa_addr == NULL) continue;
/* We only support IPv4 & IPv6 */
int family = interface->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6)
continue;
TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
/* Allow the caller to force the socket family type */
if (sock_family != -1 && family != sock_family)
continue;
/* We also need to skip IPv6 loopback interfaces */
if (family == AF_INET6) {
struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
}
// check against user specified interfaces
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
continue;
}
// Check that this interface has not already been saved
// getifaddrs() normal order appears to be: IPv4, IPv6 Global, IPv6 Link
bool duplicate = false;
for (int i = 0; i < found; i++) {
if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
}
if (!duplicate) {
// Store the interface name
strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
// Store the IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(addrs+found, interface->ifa_addr, salen);
INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
found++;
}
}
freeifaddrs(interfaces);
return found;
}
static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
/* Check family first */
int family = local_if.ifa_addr->sa_family;
if (family != remote.sa.sa_family) {
return false;
}
if (family == AF_INET) {
struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
struct sockaddr_in& remote_addr = remote.sin;
struct in_addr local_subnet, remote_subnet;
local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
} else if (family == AF_INET6) {
struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
struct sockaddr_in6& remote_addr = remote.sin6;
struct in6_addr& local_in6 = local_addr->sin6_addr;
struct in6_addr& mask_in6 = mask->sin6_addr;
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
bool same = true;
int len = 16; //IPv6 address is 16 unsigned char
for (int c = 0; c < len; c++) { //Network byte order is big-endian
char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
if (c1 ^ c2) {
same = false;
break;
}
}
// Finally, compare the scope id: two link-local addresses can have the same subnet
// address even though they are not in the same scope. For global addresses this field
// is 0, so the comparison is a no-op.
same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
return same;
} else {
WARN("Net : Unsupported address family type");
return false;
}
}
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
char line[1024], line_a[1024];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
if (interface->ifa_addr == NULL) continue;
/* We only support IPv4 & IPv6 */
int family = interface->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6)
continue;
// check against user specified interfaces
if (!matchSubnet(*interface, remoteAddr)) {
continue;
}
// Store the local IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(localAddrs+found, interface->ifa_addr, salen);
// Store the interface name
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
found++;
if (found == maxIfs) break;
}
if (found == 0) {
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
}
freeifaddrs(interfaces);
return found;
}
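/* Parse an <address>:<port> string (as passed through NCCL_COMM_ID) into a socketAddress.
 * Accepted forms: "<ipv4_or_hostname>:<port>", "[<ipv6>]:<port>" and, for link-local
 * addresses, "[<ipv6>%<ifname>]:<port>", e.g. "192.168.0.1:12345" or "[fe80::1%eth0]:12345". */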
static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
WARN("Net : string is null");
return ncclInvalidArgument;
}
bool ipv6 = ip_port_pair[0] == '[';
/* Construct the sockaddress structure */
if (!ipv6) {
struct netIf ni;
// parse <ip_or_hostname>:<port> string, expect one pair
if (parseStringList(ip_port_pair, &ni, 1) != 1) {
WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
return ncclInvalidArgument;
}
struct addrinfo hints, *p;
int rv;
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
return ncclInvalidArgument;
}
// use the first
if (p->ai_family == AF_INET) {
struct sockaddr_in& sin = ua->sin;
memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
sin.sin_family = AF_INET; // IPv4
//inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
sin.sin_port = htons(ni.port); // port
} else if (p->ai_family == AF_INET6) {
struct sockaddr_in6& sin6 = ua->sin6;
memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
sin6.sin6_family = AF_INET6; // IPv6
sin6.sin6_port = htons(ni.port); // port
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
sin6.sin6_scope_id = 0; // should be global scope, set to 0
} else {
WARN("Net : unsupported IP family");
return ncclInvalidArgument;
}
freeaddrinfo(p); // all done with this structure
} else {
int i, j = -1, len = strlen(ip_port_pair);
for (i = 1; i < len; i++) {
if (ip_port_pair[i] == '%') j = i;
if (ip_port_pair[i] == ']') break;
}
if (i == len) {
WARN("Net : No valid [IPv6]:port pair found");
return ncclInvalidArgument;
}
bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
memset(ip_str, '\0', sizeof(ip_str));
memset(port_str, '\0', sizeof(port_str));
memset(if_name, '\0', sizeof(if_name));
strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
strncpy(port_str, ip_port_pair+i+2, len-i-1);
int port = atoi(port_str);
if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
struct sockaddr_in6& sin6 = ua->sin6;
sin6.sin6_family = AF_INET6; // IPv6
inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
sin6.sin6_port = htons(port); // port
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
}
return ncclSuccess;
}
static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
int nIfs = 0;
// Allow user to force the INET socket family selection
int sock_family = envSocketFamily();
// User specified interface
char* env = getenv("NCCL_SOCKET_IFNAME");
if (env && strlen(env) > 1) {
// Specified by user : find or fail
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
} else {
// Try to automatically pick the right one
// Start with IB
nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// else see if we can get some hint from COMM ID
if (nIfs == 0) {
char* commId = getenv("NCCL_COMM_ID");
if (commId && strlen(commId) > 1) {
// Try to find interface that is in the same subnet as the IP in comm id
union socketAddress idAddr;
GetSocketAddrFromString(&idAddr, commId);
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
}
}
// Then look for anything else (but not docker or lo)
if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// Finally look for docker, then lo.
if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
}
return nIfs;
}
static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
/* IPv4/IPv6 support */
int family = localAddr->sa.sa_family;
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
/* Create socket and bind it to a port */
int sockfd = socket(family, SOCK_STREAM, 0);
if (sockfd == -1) {
WARN("Net : Socket creation failed : %s", strerror(errno));
return ncclSystemError;
}
int opt = 1;
SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
// localAddr port should be 0 (Any port)
SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
/* Get the assigned Port */
socklen_t size = salen;
SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
#ifdef ENABLE_TRACE
char line[1024];
TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
/* Put the socket in listen mode */
SYSCHECK(listen(sockfd, 128), "listen");
*fd = sockfd;
return ncclSuccess;
}
static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
/* IPv4/IPv6 support */
int family = remoteAddr->sa.sa_family;
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
/* Connect to a hostname / port */
*fd = socket(family, SOCK_STREAM, 0);
if (*fd == -1) {
WARN("Net : Socket creation failed : %s", strerror(errno));
return ncclSystemError;
}
const int one = 1;
SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
/* const int bufsize = 128*1024;
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
#ifdef ENABLE_TRACE
char line[1024];
TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
#endif
SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
return ncclSuccess;
}
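// Blocking helpers: loop until exactly 'size' bytes have been received/sent on the socket,
// retrying on transient errors. A zero-byte recv() means the peer closed the connection.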
static ncclResult_t socketReceive(int fd, void* ptr, int size) {
char* data = (char*)ptr;
int offset = 0;
while (offset < size) {
int recvsize;
SYSCHECKVAL(recv(fd, data, size-offset, 0), "recv", recvsize);
if (recvsize == 0) {
WARN("Net : Connection closed by remote peer");
return ncclSystemError;
}
if (recvsize == -1) {
INFO(NET,"Recv : got retcode %d, retrying", errno);
continue;
}
data += recvsize;
offset += recvsize;
}
return ncclSuccess;
}
static ncclResult_t socketSend(int fd, void* ptr, int size) {
char* data = (char*)ptr;
int offset = 0;
while (offset < size) {
int sendsize;
SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
if (sendsize == -1) {
INFO(NET,"Send : got retcode %d, retrying", errno);
continue;
}
data += sendsize;
offset += sendsize;
}
return ncclSuccess;
}
#endif

src/include/topo.h (new file, 83 lines)
@@ -0,0 +1,83 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TOPO_H_
#define NCCL_TOPO_H_
#include "nccl.h"
#include <ctype.h>
#define MAXPATHSIZE 1024
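// Resolve the sysfs path of the PCI device backing a CUDA device by following
// /sys/class/pci_bus/<bus>/device and appending the device's bus id. The caller owns the
// returned string (allocated by realpath()).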
static ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[16];
CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/device";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
char* cudaRpath = realpath(busPath, NULL);
char pathname[MAXPATHSIZE];
strncpy(pathname, cudaRpath, MAXPATHSIZE);
strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
free(cudaRpath);
*path = realpath(pathname, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", pathname);
return ncclSystemError;
}
return ncclSuccess;
}
static ncclResult_t getMlxPath(char* ibName, char** path) {
char devicepath[MAXPATHSIZE];
snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
*path = realpath(devicepath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", devicepath);
return ncclSystemError;
}
return ncclSuccess;
}
static ncclResult_t getSockPath(char* ifName, char** path) {
char devicepath[MAXPATHSIZE];
snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
*path = realpath(devicepath, NULL);
if (*path == NULL) {
INFO(NET|INIT, "Could not find real path of %s", devicepath);
return ncclSystemError;
}
return ncclSuccess;
}
enum ncclIbPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_SOC = 3
};
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
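// Rough PCI distance between two sysfs device paths: count how many leading path
// components the two paths share. A very short common prefix (score 3, i.e. only
// /sys/devices) maps to SOC, one more shared level to PHB, sharing everything up to the
// leaf to PIX, and anything in between to PXB.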
static int pciDistance(char* path1, char* path2) {
int score = 0;
int depth = 0;
int same = 1;
for (int i=0; i<strlen(path1); i++) {
if (path1[i] != path2[i]) same = 0;
if (path1[i] == '/') {
depth++;
if (same == 1) score++;
}
}
if (score == 3) return PATH_SOC;
if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX;
return PATH_PXB;
}
#endif

src/include/transport.h (new file, 113 lines)
@@ -0,0 +1,113 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TRANSPORT_H_
#define NCCL_TRANSPORT_H_
#include "nccl.h"
#include <stdint.h>
#define NTRANSPORTS 3
extern struct ncclTransport ncclTransports[];
// Forward declarations
struct ncclRing;
struct ncclConnector;
struct ncclComm;
#define RANK_INFO_SIZE 64
typedef char ncclTinfo_t[RANK_INFO_SIZE];
struct ncclInfo {
ncclTinfo_t tinfo[NTRANSPORTS];
};
// Used to hold the transport connection values
typedef int64_t ncclTvalue_t;
#define CONNECT_SIZE 128
struct ncclConnect {
char data[CONNECT_SIZE];
};
struct ncclProxyArgs {
struct ncclRing* ring;
int substeps;
int nsteps;
uint64_t opCount;
int llMode;
bool needProxy;
int active; // add new fields before this line -- 'active' is intentionally left out during initialization
};
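// Each transport (e.g. P2P, shared memory, network) provides a send side and a receive
// side, each with its own setup/connect/free/proxy implementation, plus global hooks used
// during initialization: fillInfo (describe this rank), canConnect (score a peer pair) and
// getRings (propose ring orderings for this transport).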
struct ncclTransportComm {
ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
};
struct ncclTransport {
const char name[4];
ncclResult_t (*fillInfo)(ncclTinfo_t*, int);
ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
};
#include <pthread.h>
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
struct transportProxyInfo {
struct ncclComm* comm;
pthread_t thread;
threadFunc_t func;
volatile int proxyReady;
struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
volatile uint64_t argsFifoHead;
volatile uint64_t argsFifoTail;
pthread_cond_t cond;
pthread_mutex_t mutex;
};
ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
enum proxyMode {
proxyRing = 0,
proxyFrom = 1,
proxyTo = 2
};
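// The proxy pattern is packed into a single int: 0 means a symmetric ring pattern,
// 1+root encodes traffic flowing from 'root' (broadcast-like), and -1-root encodes traffic
// flowing to 'root' (reduce-like). proxyPatternMode()/proxyPatternRoot() decode it.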
static int proxyPatternRing = proxyRing;
static inline int proxyPatternFrom(int root) { return 1+root; }
static inline int proxyPatternTo(int root) { return -1-root; }
static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
ncclResult_t transportStartProxies(struct ncclComm* comm);
#include <unistd.h>
// Spin wait until func evaluates to true
template<typename FUNC>
inline void transportProxyWait(const FUNC& func) {
while (!func()) {
sched_yield();
}
}
inline void transportProxyIdle(int idle) {
sched_yield();
}
#endif

src/include/utils.h (new file, 25 lines)
@@ -0,0 +1,25 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_
#include "nccl.h"
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen);
uint64_t getHostHash();
uint64_t getPidHash();
struct netIf {
char prefix[64];
int port;
};
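// Parse a comma-separated list of "<prefix>[:<port>]" entries (the syntax accepted by
// NCCL_SOCKET_IFNAME and similar variables) into netIf structures; matchIfList() then
// checks a name/port pair against such a list.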
int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
#endif

src/init.cu (new file, 815 lines)
@@ -0,0 +1,815 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "ring.h"
#include "param.h"
#include "nvmlwrap.h"
#include "rings.h"
#include "bootstrap.h"
#include "transport.h"
#include "common_coll.h"
#include "group.h"
#include "utils.h"
#include "net.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
DebugLevel ncclDebugLevel;
uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
pthread_mutex_t ncclDebugOutputLock;
FILE *ncclDebugFile = stdout;
#ifdef ENABLE_TRACE
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2 and 10.x do not need an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0/9.1 need an internal CUDA stream
#endif
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
extern "C" __attribute__ ((visibility("default")))
ncclNet_t* ncclNet = NULL;
// We define this as weak to let tests redefine their own
#pragma weak ncclCudaCompCap
int ncclCudaCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
int ccMajor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
return ccMajor;
}
int ncclCudaFullCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
int ccMajor, ccMinor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
return ccMajor*10+ccMinor;
}
void initNet() {
if (ncclNet != NULL) {
INFO(INIT,"Using external Network %s", ncclNetName());
} else {
ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
INFO(INIT,"Using internal Network %s", ncclNetName());
}
}
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", NCCL_THREAD_THRESHOLD);
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static ncclResult_t ncclInit() {
if (initialized) return ncclSuccess;
pthread_mutex_lock(&initLock);
if (!initialized) {
initEnv();
initDebug();
initNet();
initialized = true;
}
pthread_mutex_unlock(&initLock);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
ncclResult_t ncclGetVersion(int* version) {
if (version == NULL) return ncclInvalidArgument;
*version = NCCL_VERSION_CODE;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
NCCLCHECK(ncclInit());
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
return bootstrapGetUniqueId(out);
}
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
CUDACHECK(cudaFree(comm->devComm));
for (int ring=0; ring<comm->nRings; ring++)
NCCLCHECK(freeRing(comm->rings+ring));
if (comm->doneEvent != NULL)
CUDACHECK(cudaEventDestroy(comm->doneEvent));
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(cudaStreamDestroy(comm->groupStream));
}
// Last rank frees shared resources between threads
int isLast;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
if (isLast) {
free(comm->intraBarrier);
free(comm->intraParams);
free(comm->intraCudaDevs);
free(comm->intraCGMode);
free(comm->intraCC);
}
free(comm);
return ncclSuccess;
}
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
if (ndev < 1) {
WARN("invalid device count (%d) requested", ndev);
return ncclInvalidArgument;
}
if (rank >= ndev || rank < 0) {
WARN("rank %d exceeds ndev=%d", rank, ndev);
return ncclInvalidArgument;
}
// Try to create a CUDA object right away. If there is something wrong with
// the device we're on (failure cause #1), better to know it early.
cudaEvent_t doneEvent;
CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
comm->rank = rank;
comm->nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
comm->threadThreshold = ncclParamThreadThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to override the default setting in older CUDA builds
comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif
comm->argsptr = &comm->args;
*comret = comm;
return ncclSuccess;
}
static ncclResult_t devCommSetup(ncclComm_t comm) {
// Fully duplicate the comm on the device
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
// Copy the comm on the device
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
// Copy userRanks
for (int r=0; r<comm->nRings; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
}
return ncclSuccess;
}
// Prebuild the version string so that running "strings" on the library quickly reveals the version.
#define STR2(v) #v
#define STR(v) STR2(v)
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
static void showVersion() {
static int shown = 0;
if (shown == 0 && ncclDebugLevel >= VERSION) {
printf("%s\n", VERSION_STRING);
fflush(stdout);
if (ncclDebugFile != stdout)
INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
shown = 1;
}
}
static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
for (int t=0; t<NTRANSPORTS; t++) {
NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank));
}
return ncclSuccess;
}
template <int type>
static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
for (int t=0; t<NTRANSPORTS; t++) {
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
ncclTvalue_t ret = 0;
NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
if (ret > 0) {
NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
*transportRet = transport;
return ncclSuccess;
}
}
WARN("No transport found !");
*transportRet = NULL;
return ncclInternalError;
}
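// Set up one ring: rotate ringRanks so that the local rank comes first, then pick a
// transport for the receive side (towards the previous rank) and the send side (towards
// the next rank), and create the associated proxies.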
static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
NCCLCHECK(initRing(comm, ringid));
struct ncclRing* ring = comm->rings+ringid;
// Reorganize ranks to start with rank.
int shift;
for (shift = 0; shift<nranks; shift++) {
if (ringRanks[shift] == rank) {
break;
}
}
for (int i=0; i<nranks; i++) {
ring->userRanks[i] = ringRanks[(i+shift)%nranks];
}
int prev = ring->userRanks[nranks-1];
int next = ring->userRanks[1];
NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
NCCLCHECK(transportCreateProxy(0, ring, comm));
NCCLCHECK(transportCreateProxy(1, ring, comm));
return ncclSuccess;
}
static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
for (int r=0; r<nranks; r++) {
connectTransport[r] = -1;
for (int t=0; t<NTRANSPORTS; t++) {
NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
if (connectValue[r] > 0) {
connectTransport[r] = t;
break;
}
}
}
return ncclSuccess;
}
static void swap(void* mem1, void* mem2, int size) {
char tmp[size];
memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
}
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+4*MAXWIDTH)
void dumpMatrix(int* connectMatrix, int nranks) {
char line[STRLENGTH+1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
INFO(INIT,"%s", line);
for (int i=0; i<nranks; i++) {
memset(line, ' ', STRLENGTH);
sprintf(line, "%3d ", i);
for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
INFO(INIT,"%s", line);
}
}
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH+1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
INFO(INIT,"%s", line);
}
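// Materialize each ring from the prev/next tables: starting from 'rank', follow next[]
// nranks times, then verify that the walk loops back to the start and that every rank
// appears in the ring.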
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for (int r=0; r<nrings; r++) {
char prefix[30];
/*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
dumpLine(next+r*nranks, nranks, prefix);*/
int current = rank;
for (int i=0; i<nranks; i++) {
rings[r*nranks+i] = current;
current = next[r*nranks+current];
}
sprintf(prefix, "Ring %02d : ", r);
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
if (current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
return ncclInternalError;
}
// Check that all ranks are there
for (int i=0; i<nranks; i++) {
int found = 0;
for (int j=0; j<nranks; j++) {
if (rings[r*nranks+j] == i) {
found = 1;
break;
}
}
if (found == 0) {
WARN("Error : ring %d does not contain rank %d", r, i);
return ncclInternalError;
}
}
}
return ncclSuccess;
}
void* waitForNonNullPtr(void* p) {
volatile void** ptr = (volatile void**) p;
while (*ptr == NULL) sched_yield();
return (void*)*ptr;
}
ncclResult_t initParams(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
params->args = &comm->argsptr;
params->stream = NULL;
params->sharedMem = 0;
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
return ncclSuccess;
}
// Allocate/Set Intra Process Structures and set CG options
ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
comm->intraRank = rank;
comm->intraRanks = ranks;
comm->intraPhase = 0;
// Alloc shared structures
if (rank == 0) {
assert(comm == comm0);
int* bar;
NCCLCHECK(ncclCalloc(&bar, 2));
bar[0] = bar[1] = 0;
comm->intraBarrier = bar;
NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
int* CGMode;
NCCLCHECK(ncclCalloc(&CGMode, 1));
*CGMode = 0x11;
comm->intraCGMode = CGMode;
int* CC;
NCCLCHECK(ncclCalloc(&CC, 1));
*CC = ncclCudaFullCompCap();
comm->intraCC = CC;
} else {
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
}
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
NCCLCHECK(initParams(comm));
int cgMdLaunch = 0;
// Set CG Mode
comm->launchMode = ncclComm::GROUP;
char* str = getenv("NCCL_LAUNCH_MODE");
if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
comm->launchMode = ncclComm::PARALLEL;
}
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
#if __CUDACC_VER_MAJOR__ >= 9
if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
// Check whether the GPU supports Cooperative Group Multi Device Launch
(void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
}
#endif
}
// Disable cgMdLaunch if any rank does not support it
if (cgMdLaunch == 0) {
*comm->intraCGMode = 0x10;
}
return ncclSuccess;
}
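// Multi-process initialization path: bootstrap an out-of-band network from the unique id,
// all-gather the per-rank transport information, compute which transport can connect each
// pair of ranks, derive the rings (agreeing on a common ring count and thread count across
// ranks), connect prev/next for every ring, and finally group the ranks sharing a process
// to set up the intra-process launch structures.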
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
int rank = comm->rank;
int nranks = comm->nRanks;
void* commState;
NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
struct ncclInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
NCCLCHECK(fillInfo(allInfo+rank, rank));
NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
//if (rank == 0) dumpMatrix(connectTransport, nranks);
//if (rank == 0) dumpMatrix(connectValue, nranks);
// Get my rings
int nrings;
int* prev, *next;
NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
comm->nThreads = getDefaultThreads();
NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
free(connectTransport);
free(connectValue);
// Find max nThreads
int allData[nranks];
allData[rank] = comm->nThreads;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
comm->nThreads = std::max(allData[i], comm->nThreads);
if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
// Determine the minimum CUDA Compute capability of all GPUs
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
allData[rank] = myCompCap;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
minCompCap = std::min(allData[i], minCompCap);
if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
// Find min nrings across ranks
allData[rank] = nrings;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
nrings = std::min(allData[i], nrings);
// Exchange data with others to build complete rings
comm->nRings = nrings;
for (int r=0; r<nrings; r++) {
NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
}
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
free(prev);
free(next);
// Connect with prev/next for each ring
for (int r=0; r<nrings; r++) {
int* ringRanks = rings+r*nranks;
struct ncclRing *ring = comm->rings+r;
struct ncclConnect connect[2];
NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connect));
NCCLCHECK(bootstrapRingExchange(commState, connect, ring->userRanks[nranks-1], ring->userRanks[1], sizeof(struct ncclConnect)));
NCCLCHECK(ring->send.transport->send.connect(connect+1, &ring->send));
NCCLCHECK(ring->recv.transport->recv.connect(connect+0, &ring->recv));
}
free(rings);
free(allInfo);
// Intra-process barrier setup
struct rankInfo {
uint64_t hostHash;
uint64_t pidHash;
struct ncclComm* comm;
} rankInfos[nranks];
rankInfos[rank].hostHash = getHostHash();
rankInfos[rank].pidHash = getPidHash();
rankInfos[rank].comm = comm;
NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
// Compute intra ranks
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int r=0; r<nranks; r++) {
if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
(rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
if (intraRanks == 0) intraRank0 = r;
if (r == rank) intraRank = intraRanks;
intraRanks++;
}
}
TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
// Barrier
bootstrapClose(commState);
return ncclSuccess;
}
bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
WARN("Failed to set CPU affinity");
return false;
}
return true;
}
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
cpu_set_t affinitySave;
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
// Make sure all host memory allocations are close to the GPU
int cudaDev;
nvmlDevice_t nvmlDevice;
CUDACHECK(cudaGetDevice(&cudaDev));
SetCpuAffinity(cudaDev, &nvmlDevice);
ncclResult_t res;
NCCLCHECKGOTO(commAlloc(newcomm, ndev, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
return ncclSuccess;
cleanup:
*newcomm = NULL;
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
char* env = getenv("NCCL_COMM_ID");
if (env && myrank == 0) {
NCCLCHECK(bootstrapCreateRoot(&commId, true));
}
NCCLCHECK(ncclInit());
if (myrank == 0) showVersion();
INFO(INIT,"rank %d nranks %d", myrank, nranks);
// Make sure the CUDA runtime is initialized.
CUDACHECK(cudaFree(NULL));
NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
WARN("Invalid rank requested : %d/%d", myrank, nranks);
return ncclInvalidArgument;
}
if (ncclAsyncMode()) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
} else {
return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
}
}
static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
struct ncclInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
NCCLCHECK(fillInfo(allInfo+rank, rank));
}
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
for (int rank=0; rank<nranks; rank++)
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
int* prev, *prevFinal, *next, *nextFinal;
NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
int nrings = MAXRINGS;
int nthreads=0;
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
int nringsRank;
int nthreadsRank = getDefaultThreads();
myCompCap = ncclCudaCompCap();
NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
nrings = std::min(nrings, nringsRank);
nthreads = std::max(nthreads, nthreadsRank);
minCompCap = std::min(minCompCap, myCompCap);
for (int ring=0; ring<nrings; ring++) {
int index = ring*nranks+rank;
prevFinal[index] = prev[index];
nextFinal[index] = next[index];
}
}
free(connectTransport);
free(connectValue);
free(prev);
free(next);
INFO(INIT,"Using %d threads", nthreads);
INFO(INIT,"Min Comp Cap %d", minCompCap);
int* rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
free(prevFinal);
free(nextFinal);
for (int rank=0; rank<nranks; rank++) {
comms[rank]->nRings = nrings;
comms[rank]->nThreads = nthreads;
}
for (int r=0; r<nrings; r++) {
struct ncclConnect connect[2*nranks];
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
}
// RingExchange connect information
for (int rank=0; rank<nranks; rank++) {
// Swap rank->prev and prevRank->next
struct ncclRing *ring = comms[rank]->rings+r;
int prevRank = ring->userRanks[nranks-1];
struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
struct ncclConnect* rankPrevConnect = connect+2*rank;
swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
}
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
struct ncclRing *ring = comms[rank]->rings+r;
NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
}
}
free(rings);
free(allInfo);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECK(ncclInit());
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
showVersion();
INFO(INIT,"nranks %d", ndev);
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
if (ndev < 1) {
WARN("Invalid device count requested : %d", ndev);
return ncclInvalidArgument;
}
ncclResult_t res;
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
nvmlDevice_t nvmlDevice;
int ncclDevList[ndev];
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
}
cudaGetDevice(&savedDevice);
for(rank=0; rank<ndev; ++rank)
comms[rank] = NULL;
cpu_set_t affinitySave;
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
for (rank=0; rank<ndev; ++rank) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
SetCpuAffinity(cudaDev, &nvmlDevice);
NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
comms[rank] = comm;
NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
}
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
for(rank=0; rank<ndev; ++rank) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
}
res = ncclSuccess;
goto final;
cleanup:
for(rank=0; rank<ndev; ++rank) {
if(comms[rank] != NULL) {
commFree(comms[rank]);
}
}
final:
if(wrapNvmlShutdown() != ncclSuccess)
INFO(INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice);
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
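/* Illustrative usage sketch of the single-process entry points above, assuming nccl.h is
 * included and four CUDA devices exist (the device list below is only a placeholder):
 *
 *   ncclComm_t comms[4];
 *   int devs[4] = {0, 1, 2, 3};
 *   if (ncclCommInitAll(comms, 4, devs) != ncclSuccess) exit(1);
 *   // ... enqueue collectives on each comm/stream ...
 *   for (int i = 0; i < 4; i++) ncclCommDestroy(comms[i]);
 */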
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
int savedDevice;
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
}
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
return ncclSuccess;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
switch (code) {
case ncclSuccess : return "no error";
case ncclUnhandledCudaError : return "unhandled cuda error";
case ncclSystemError : return "unhandled system error";
case ncclInternalError : return "internal error";
case ncclInvalidArgument : return "invalid argument";
case ncclInvalidUsage : return "invalid usage";
default : return "unknown result code";
}
}
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
*count = comm->nRanks;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
*devid = comm->cudaDev;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
*rank = comm->rank;
return ncclSuccess;
}

(deleted file, 155 lines)
@@ -1,155 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "libwrap.h"
#include <dlfcn.h>
#include "core.h"
int symbolsLoaded = 0;
static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
ncclResult_t wrapSymbols(void) {
if (symbolsLoaded)
return ncclSuccess;
static void* nvmlhandle = NULL;
void* tmp;
void** cast;
nvmlhandle=dlopen("libnvidia-ml.so", RTLD_NOW);
if (!nvmlhandle) {
nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
if (!nvmlhandle) {
WARN("Failed to open libnvidia-ml.so[.1]");
goto teardown;
}
}
#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlsym(handle, symbol); \
if (tmp == NULL) { \
WARN("dlsym failed on %s - %s", symbol, dlerror());\
goto teardown; \
} \
*cast = tmp; \
} while (0)
LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
symbolsLoaded = 1;
return ncclSuccess;
teardown:
nvmlInternalInit = NULL;
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
nvmlInternalDeviceSetCpuAffinity = NULL;
nvmlInternalDeviceClearCpuAffinity = NULL;
if (nvmlhandle != NULL) dlclose(nvmlhandle);
return ncclSystemError;
}
ncclResult_t wrapNvmlInit(void) {
if (nvmlInternalInit == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalInit();
if (ret != NVML_SUCCESS) {
WARN("nvmlInit() failed: %s",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlShutdown(void) {
if (nvmlInternalShutdown == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalShutdown();
if (ret != NVML_SUCCESS) {
WARN("nvmlShutdown() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
if (nvmlInternalDeviceGetIndex == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetIndex() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
if (nvmlInternalDeviceSetCpuAffinity == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
if (nvmlInternalInit == NULL) {
WARN("lib wrapper not initialized.");
return ncclLibWrapperNotSet;
}
nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}

Some files were not shown because too many files have changed in this diff.