Add Fortran bindings

This commit is contained in:
Kyle Fernandes, ne Jacobs 2016-11-17 15:27:20 -08:00 committed by Sylvain Jeaugey
parent 534b9a1697
commit 5f2b32e45b
14 changed files with 2133 additions and 9 deletions

View File

@ -11,6 +11,7 @@ KEEP ?= 0
DEBUG ?= 0
PROFAPI ?= 0
BUILDDIR ?= build
BUILDDIR := $(abspath $(BUILDDIR))
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
@ -21,7 +22,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_52,code=compute_52
CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
# Use addprefix so that we can specify more than one path
LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
@ -59,7 +60,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
.PHONY : lib clean test mpitest install deb debian debclean
.PHONY : lib clean test mpitest install deb debian debclean forlib fortest forclean
.DEFAULT : lib
INCEXPORTS := nccl.h
@ -82,19 +83,19 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
@printf "Linking %-25s\n" $@
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(INCDIR)/%.h : src/%.h
@printf "Grabbing %-25s > %-25s\n" $< $@
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : src/%.cu
@printf "Compiling %-25s > %-25s\n" $< $@
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@ -147,7 +148,7 @@ MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
test : $(TESTBINS)
$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
@printf "Building %-25s > %-24s\n" $< $@
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(TSTDIR)
$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
@ -159,7 +160,7 @@ $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
mpitest : $(MPITESTBINS)
$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
@printf "Building %-25s > %-24s\n" $< $@
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(MPITSTDIR)
$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
@ -193,7 +194,7 @@ debclean :
rm -Rf $(DEBIANDIR)
$(DEBIANDIR)/% : debian/%.in
@printf "Generating %-25s > %-24s\n" $< $@
@printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@ -205,7 +206,18 @@ $(DEBIANDIR)/% : debian/%.in
$< > $@
$(DEBIANDIR)/% : debian/%
@printf "Grabbing %-25s > %-25s\n" $< $@
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBIANDIR)
cp -f $< $@
#### FORTRAN BINDINGS ####
export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
forlib : lib
$(MAKE) -C fortran lib
fortest : forlib
$(MAKE) -C fortran test
forclean :
$(MAKE) -C fortran clean

81
fortran/Makefile Normal file
View File

@ -0,0 +1,81 @@
FC := gfortran
FCNAME := $(notdir $(FC))
BUILDDIR ?= ../build
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
LIBNAME := libncclfor.so
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME))
LIBCUDAFOR := libcudafor.so
ifneq ($(filter pgf%, $(FCNAME)), )
# PGI compiler (pgfortran, pgf90, pgf95)
FCMODFLAGS := -module $(INCDIR)
FCPREFLAGS := -Mpreprocess
FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
FCFLAGS := -fast -O3
else
# non-PGI compilers do not have CUDA support, compile our own CUDA lib
CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR)
CUDALINK := -L$(CUDA_LIB) -lcudart
CUDAFORLINK := -lcudafor
ifeq ($(FCNAME), gfortran)
FCMODFLAGS := -J$(INCDIR)
FCPREFLAGS += -cpp
FCFLAGS += -ffree-line-length-none
else ifeq ($(FCNAME), ifort)
FCMODFLAGS := -module $(INCDIR)
FCPREFLAGS += -fpp
endif
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
lib: $(CUDAFORDEP)
$(MAKE) $(LIBDIR)/$(LIBTARGET)
$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
@printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@
mkdir -p $(LIBDIR)
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
$(OBJDIR)/%.o: src/%.f90
@printf "Building %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
mkdir -p $(INCDIR)
$(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
ifneq ($(filter pgf%, $(FCNAME)), )
TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
endif
TESTDIR := $(BUILDDIR)/test/fortran
TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
test: lib $(TESTBINS)
$(TESTDIR)/%: test/%.f90 lib
@printf "Building %-35s > %s\n" $< $@
@mkdir -p $(TESTDIR)
$(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
clean:
rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
rm -rf $(TESTDIR)/

164
fortran/src/cudafor.f90 Normal file
View File

@ -0,0 +1,164 @@
#ifndef _CUDA
!Start cudaFor module
module cudaFor
use iso_c_binding
implicit none
private
public :: c_devptr
public :: cudaMemcpyKind, &
cudaMemcpyHostToHost, &
cudaMemcpyHostToDevice, &
cudaMemcpyDeviceToHost, &
cudaMemcpyDeviceToDevice, &
cudaMemcpyDefault
public :: cuda_stream_kind
public :: cudaGetDeviceCount
public :: cudaSetDevice
public :: cudaMalloc
public :: cudaMemcpy
public :: cudaFree
public :: cudaStreamCreate
public :: cudaStreamSynchronize
public :: cudaStreamDestroy
!Start types
!Start c_devptr
type, bind(c) :: c_devptr
type(c_ptr) :: member
end type c_devptr
!End c_devptr
!Start cudaMemcpyKind
type, bind(c) :: cudaMemcpyKind
integer(c_int) :: member
end type cudaMemcpyKind
type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), &
cudaMemcpyHostToDevice = cudaMemcpyKind(1), &
cudaMemcpyDeviceToHost = cudaMemcpyKind(2), &
cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
cudaMemcpyDefault = cudaMemcpyKind(4)
!End cudaMemcpyKind
!Start cuda_stream_kind
integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
!End cuda_stream_kind
!End types
!Start interfaces
!Start cudaGetDeviceCount
interface cudaGetDeviceCount
integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
import :: c_int
implicit none
integer(c_int) :: count
end function cudaGetDeviceCount
end interface cudaGetDeviceCount
!End cudaGetDeviceCount
!Start cudaSetDevice
interface cudaSetDevice
integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
import :: c_int
implicit none
integer(c_int), value :: device
end function cudaSetDevice
end interface cudaSetDevice
!End cudaSetDevice
!Start cudaMalloc
interface cudaMalloc
integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
import :: c_int, c_size_t
import :: c_devptr
implicit none
type(c_devptr) :: devPtr
integer(c_size_t), value :: size
end function cudaMalloc
end interface cudaMalloc
!End cudaMalloc
!Start cudaMemcpy
interface cudaMemcpy
!Start cudaMemcpyH2D
integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
import :: c_ptr, c_int, c_size_t
import :: c_devptr, cudaMemcpyKind
implicit none
type(c_devptr), value :: dst
type(c_ptr), value :: src
integer(c_size_t), value :: count
type(cudaMemcpyKind), value :: kind
end function cudaMemcpyH2D
!End cudaMemcpyH2D
!Start cudaMemcpyD2H
integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
import :: c_ptr, c_int, c_size_t
import :: c_devptr, cudaMemcpyKind
implicit none
type(c_ptr), value :: dst
type(c_devptr), value :: src
integer(c_size_t), value :: count
type(cudaMemcpyKind), value :: kind
end function cudaMemcpyD2H
!End cudaMemcpyD2H
end interface cudaMemcpy
!End cudaMemcpy
!Start cudaFree
interface cudaFree
integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
import :: c_int
import :: c_devptr
implicit none
type(c_devptr), value :: devPtr
end function cudaFree
end interface cudaFree
!End cudaFree
!Start cudaStreamCreate
interface cudaStreamCreate
integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind) :: pStream
end function cudaStreamCreate
end interface cudaStreamCreate
!End cudaStreamCreate
!Start cudaStreamSynchronize
interface cudaStreamSynchronize
integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind), value :: stream
end function cudaStreamSynchronize
end interface cudaStreamSynchronize
!End cudaStreamSynchronize
!Start cudaStreamDestroy
interface cudaStreamDestroy
integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
import :: c_int
import :: cuda_stream_kind
implicit none
integer(cuda_stream_kind), value :: stream
end function cudaStreamDestroy
end interface cudaStreamDestroy
!End cudaStreamDestroy
!End interfaces
end module cudaFor
!End cudaFor module
#endif

305
fortran/src/ncclfor.f90 Normal file
View File

@ -0,0 +1,305 @@
!Start defines
#define NCCL_UNIQUE_ID_BYTES 128
!End defines
!Start ncclFor module
module ncclFor
use iso_c_binding
use cudaFor
implicit none
private
public :: ncclUniqueId
public :: ncclComm
public :: ncclResult, &
ncclSuccess, &
ncclUnhandledCudaError, &
ncclSystemError, &
ncclInternalError, &
ncclInvalidDevicePointer, &
ncclInvalidRank, &
ncclUnsupportedDeviceCount, &
ncclDeviceNotFound, &
ncclInvalidDeviceIndex, &
ncclLibWrapperNotSet, &
ncclCudaMallocFailed, &
ncclRankMismatch, &
ncclInvalidArgument, &
ncclInvalidType, &
ncclInvalidOperation, &
nccl_NUM_RESULTS
public :: ncclDataType, &
ncclChar, &
ncclInt, &
#ifdef CUDA_HAS_HALF
ncclHalf, &
#endif
ncclFloat, &
ncclDouble, &
ncclInt64, &
ncclUInt64, &
nccl_NUM_TYPES
public :: ncclRedOp, &
ncclSum, &
ncclProd, &
ncclMax, &
ncclMin, &
nccl_NUM_OPS
public :: ncclGetUniqueId
public :: ncclCommInitRank
public :: ncclCommInitAll
public :: ncclCommCuDevice
public :: ncclCommUserRank
public :: ncclCommCount
public :: ncclCommDestroy
public :: ncclReduce
public :: ncclAllReduce
public :: ncclReduceScatter
public :: ncclBcast
public :: ncclAllGather
!Start types
!Start ncclUniqueId
type, bind(c) :: ncclUniqueId
character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
end type ncclUniqueId
!End ncclUniqueId
!Start ncclComm
type, bind(c) :: ncclComm
type(c_ptr) :: member
end type ncclComm
!End ncclComm
!Start ncclResult
type, bind(c) :: ncclResult
integer(c_int) :: member
end type ncclResult
type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), &
ncclUnhandledCudaError = ncclResult( 1), &
ncclSystemError = ncclResult( 2), &
ncclInternalError = ncclResult( 3), &
ncclInvalidDevicePointer = ncclResult( 4), &
ncclInvalidRank = ncclResult( 5), &
ncclUnsupportedDeviceCount = ncclResult( 6), &
ncclDeviceNotFound = ncclResult( 7), &
ncclInvalidDeviceIndex = ncclResult( 8), &
ncclLibWrapperNotSet = ncclResult( 9), &
ncclCudaMallocFailed = ncclResult(10), &
ncclRankMismatch = ncclResult(11), &
ncclInvalidArgument = ncclResult(12), &
ncclInvalidType = ncclResult(13), &
ncclInvalidOperation = ncclResult(14), &
nccl_NUM_RESULTS = ncclResult(15)
!End ncclResult
!Start ncclDataType
type, bind(c) :: ncclDataType
integer(c_int) :: member
end type ncclDataType
type(ncclDataType), parameter :: ncclChar = ncclDataType(0), &
ncclInt = ncclDataType(1), &
#ifdef CUDA_HAS_HALF
ncclHalf = ncclDataType(2), &
#endif
ncclFloat = ncclDataType(3), &
ncclDouble = ncclDataType(4), &
ncclInt64 = ncclDataType(5), &
ncclUInt64 = ncclDataType(6), &
nccl_NUM_TYPES = ncclDataType(7)
!End ncclDataType
!Start ncclRedOp
type, bind(c) :: ncclRedOp
integer(c_int) :: member
end type ncclRedOp
type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), &
ncclProd = ncclRedOp(1), &
ncclMax = ncclRedOp(2), &
ncclMin = ncclRedOp(3), &
nccl_NUM_OPS = ncclRedOp(4)
!End ncclRedOp
!End types
!Start interfaces
!Start ncclGetUniqueId
interface ncclGetUniqueId
type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
import :: ncclResult, ncclUniqueId
implicit none
type(ncclUniqueId) :: uniqueId
end function ncclGetUniqueId
end interface ncclGetUniqueId
!End ncclGetUniqueId
!Start ncclCommInitRank
interface ncclCommInitRank
type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
import :: c_int
import :: ncclResult, ncclUniqueId, ncclComm
implicit none
type(ncclComm) :: comm(*)
integer(c_int), value :: ndev
type(ncclUniqueId), value :: commId
integer(c_int), value :: rank
end function ncclCommInitRank
end interface ncclCommInitRank
!End ncclCommInitRank
!Start ncclCommInitAll
interface ncclCommInitAll
type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm) :: comm(*)
integer(c_int), value :: ndev
integer(c_int) :: devlist(*)
end function ncclCommInitAll
end interface ncclCommInitAll
!End ncclCommInitAll
!Start ncclCommCuDevice
interface ncclCommCuDevice
type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: devid
end function ncclCommCuDevice
end interface ncclCommCuDevice
!End ncclCommCuDevice
!Start ncclCommUserRank
interface ncclCommUserRank
type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: rank
end function ncclCommUserRank
end interface ncclCommUserRank
!End ncclCommUserRank
!Start ncclCommCount
interface ncclCommCount
type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
import :: c_int
import :: ncclResult, ncclComm
implicit none
type(ncclComm), value :: comm
integer(c_int) :: count
end function ncclCommCount
end interface ncclCommCount
!End ncclCommCount
!Start ncclCommDestroy
interface ncclCommDestroy
subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
import :: ncclComm
implicit none
type(ncclComm), value :: comm
end subroutine ncclCommDestroy
end interface ncclCommDestroy
!End ncclCommDestroy
!Start ncclReduce
interface ncclReduce
type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
integer(c_int), value :: root
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclReduce
end interface ncclReduce
!End ncclReduce
!Start ncclAllReduce
interface ncclAllReduce
type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclAllReduce
end interface ncclAllReduce
!End ncclAllReduce
!Start ncclReduceScatter
interface ncclReduceScatter
type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
implicit none
type(c_devptr), value :: sendbuff
type(c_devptr), value :: recvbuff
integer(c_int), value :: recvcount
type(ncclDataType), value :: datatype
type(ncclRedOp), value :: op
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclReduceScatter
end interface ncclReduceScatter
!End ncclReduceScatter
!Start ncclBcast
interface ncclBcast
type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType
implicit none
type(c_devptr), value :: buff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
integer(c_int), value :: root
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclBcast
end interface ncclBcast
!End ncclBcast
!Start ncclAllGather
interface ncclAllGather
type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
import :: c_int
import :: c_devptr, cuda_stream_kind
import :: ncclResult, ncclComm, ncclDataType
implicit none
type(c_devptr), value :: sendbuff
integer(c_int), value :: count
type(ncclDataType), value :: datatype
type(c_devptr), value :: recvbuff
type(ncclComm), value :: comm
integer(cuda_stream_kind), value :: stream
end function ncclAllGather
end interface ncclAllGather
!End ncclAllGather
!End interfaces
end module ncclFor
!End nccl module

View File

@ -0,0 +1,155 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 1))
call random_number(hostBuff)
print "(a)", "before allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl * nDev))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
hostBuff(:, i) = recvBuff
end do
print "(a)", ""
print "(a)", "after allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
end do
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a)", ""
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,164 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 1))
call random_number(hostBuff)
print "(a)", "before allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
allocate(hostBuffPtr(nDev))
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
end do
print "(a)", ""
print "(a)", "after allgather:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
end do
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a)", ""
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,158 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before allreduce:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after allreduce:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,159 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before allreduce:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after allreduce:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,130 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: devBuff(:)
type(c_devptr), allocatable :: devBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 1))
call random_number(hostBuff(:, 1:nDev))
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
print "(a)", "before broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
allocate(devBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(devBuff(nEl))
devBuffPtr(i) = c_devloc(devBuff)
devBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
hostBuff(:, i) = devBuff
end do
print "(a)", ""
print "(a)", "after broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
deallocate(devBuff)
end do
deallocate(devBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,135 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: devBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 1))
call random_number(hostBuff(:, 1:nDev))
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
print "(a)", "before broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
allocate(hostBuffPtr(nDev))
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(devBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
end do
print "(a)", ""
print "(a)", "after broadcast:"
do i = 1, nDev
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(devBuffPtr(i))
end do
deallocate(devBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,168 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff(:, i)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
stat = cudaSetDevice(devList(root + 1))
call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
print "(a)", ""
print "(a)", "after reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
if (i - 1 /= root) then
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff(:, nDev + 1) = recvBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
end if
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,168 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev, root
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
! root = 0
stat = cudaGetDeviceCount(nDev)
root = nDev - 1
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
stat = cudaSetDevice(devList(root + 1))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
print "(a)", ""
print "(a)", "after reduce:"
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
if (i - 1 /= root) then
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
end if
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,158 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable :: hostBuff(:, :)
real(real32), allocatable, device :: sendBuff(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
real(real32), allocatable, device :: recvBuff(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reducescatter:"
do i = 1, nDev
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(sendBuff(nEl * nDev))
sendBuffPtr(i) = c_devloc(sendBuff)
sendBuff = hostBuff(:, i)
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
allocate(recvBuff(nEl))
recvBuffPtr(i) = c_devloc(recvBuff)
recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after reducescatter:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
hostBuff(:, nDev + 1) = sendBuff
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
deallocate(recvBuff)
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
deallocate(sendBuff)
end do
deallocate(sendBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test

View File

@ -0,0 +1,167 @@
program test
use iso_c_binding
use iso_fortran_env
use cudaFor
use ncclFor
implicit none
integer(int32) :: stat, i
real(real32) :: err
integer(int32) :: nEl, nDev
type(ncclDataType) :: dataType
type(ncclRedOp) :: redOp
type(ncclComm), allocatable :: comm(:)
integer(int32), allocatable :: devList(:)
type(ncclResult) :: res
integer(int32) :: cudaDev, rank
integer(cuda_stream_kind), allocatable :: stream(:)
integer(int32) :: time(8)
integer(int32), allocatable :: seed(:)
real(real32), allocatable, target :: hostBuff(:, :)
type(c_ptr), allocatable :: hostBuffPtr(:)
type(c_devptr), allocatable :: sendBuffPtr(:)
type(c_devptr), allocatable :: recvBuffPtr(:)
nEl = 2621440
! nDev = 2
stat = cudaGetDeviceCount(nDev)
dataType = ncclFloat
redOp = ncclProd
allocate(comm(nDev))
allocate(devList(nDev))
do i = 1, nDev
devList(i) = i - 1
end do
res = ncclCommInitAll(comm, nDev, devList)
do i = 1, nDev
res = ncclCommCuDevice(comm(i), cudaDev)
res = ncclCommUserRank(comm(i), rank)
end do
allocate(stream(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamCreate(stream(i))
end do
call date_and_time(values = time)
call random_seed(size = i)
allocate(seed(i))
call random_seed(get = seed)
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
call random_seed(put = seed)
allocate(hostBuff(nEl * nDev, nDev + 2))
call random_number(hostBuff(:, 1:nDev + 1))
hostBuff(:, nDev + 2) = hostBuff(:, 1)
do i = 2, nDev
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
end do
print "(a)", "before reducescatter:"
do i = 1, nDev
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
allocate(hostBuffPtr(nDev + 1))
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, i))
end do
allocate(sendBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
end do
do i = 1, nDev
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
end do
allocate(recvBuffPtr(nDev))
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
end do
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamSynchronize(stream(i))
end do
print "(a)", ""
print "(a)", "after reduceScatter:"
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
end do
do i = 1, nDev + 1
hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
end do
print "(a)", ""
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(recvBuffPtr(i))
end do
deallocate(recvBuffPtr)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaFree(sendBuffPtr(i))
end do
deallocate(sendBuffPtr)
deallocate(hostBuffPtr)
deallocate(hostBuff)
deallocate(seed)
do i = 1, nDev
stat = cudaSetDevice(devList(i))
stat = cudaStreamDestroy(stream(i))
end do
deallocate(stream)
do i = 1, nDev
call ncclCommDestroy(comm(i))
end do
deallocate(devList)
deallocate(comm)
end program test