Add Fortran bindings
This commit is contained in:
parent
534b9a1697
commit
5f2b32e45b
30
Makefile
30
Makefile
@ -11,6 +11,7 @@ KEEP ?= 0
|
||||
DEBUG ?= 0
|
||||
PROFAPI ?= 0
|
||||
BUILDDIR ?= build
|
||||
BUILDDIR := $(abspath $(BUILDDIR))
|
||||
|
||||
CUDA_LIB ?= $(CUDA_HOME)/lib64
|
||||
CUDA_INC ?= $(CUDA_HOME)/include
|
||||
@ -21,7 +22,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
|
||||
-gencode=arch=compute_52,code=sm_52 \
|
||||
-gencode=arch=compute_52,code=compute_52
|
||||
|
||||
CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
|
||||
CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
|
||||
# Use addprefix so that we can specify more than one path
|
||||
LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
|
||||
@ -59,7 +60,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
|
||||
|
||||
.PHONY : lib clean test mpitest install deb debian debclean
|
||||
.PHONY : lib clean test mpitest install deb debian debclean forlib fortest forclean
|
||||
.DEFAULT : lib
|
||||
|
||||
INCEXPORTS := nccl.h
|
||||
@ -82,19 +83,19 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
|
||||
-include $(DEPFILES)
|
||||
|
||||
$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
|
||||
@printf "Linking %-25s\n" $@
|
||||
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
|
||||
mkdir -p $(LIBDIR)
|
||||
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
|
||||
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
|
||||
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
|
||||
|
||||
$(INCDIR)/%.h : src/%.h
|
||||
@printf "Grabbing %-25s > %-25s\n" $< $@
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(INCDIR)
|
||||
cp -f $< $@
|
||||
|
||||
$(OBJDIR)/%.o : src/%.cu
|
||||
@printf "Compiling %-25s > %-25s\n" $< $@
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p $(OBJDIR)
|
||||
$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
|
||||
@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
|
||||
@ -147,7 +148,7 @@ MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
|
||||
test : $(TESTBINS)
|
||||
|
||||
$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
|
||||
@printf "Building %-25s > %-24s\n" $< $@
|
||||
@printf "Building %-35s > %s\n" $< $@
|
||||
mkdir -p $(TSTDIR)
|
||||
$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
|
||||
@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
|
||||
@ -159,7 +160,7 @@ $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
|
||||
mpitest : $(MPITESTBINS)
|
||||
|
||||
$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
|
||||
@printf "Building %-25s > %-24s\n" $< $@
|
||||
@printf "Building %-35s > %s\n" $< $@
|
||||
mkdir -p $(MPITSTDIR)
|
||||
$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
|
||||
@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
|
||||
@ -193,7 +194,7 @@ debclean :
|
||||
rm -Rf $(DEBIANDIR)
|
||||
|
||||
$(DEBIANDIR)/% : debian/%.in
|
||||
@printf "Generating %-25s > %-24s\n" $< $@
|
||||
@printf "Generating %-35s > %s\n" $< $@
|
||||
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
|
||||
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
|
||||
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
|
||||
@ -205,7 +206,18 @@ $(DEBIANDIR)/% : debian/%.in
|
||||
$< > $@
|
||||
|
||||
$(DEBIANDIR)/% : debian/%
|
||||
@printf "Grabbing %-25s > %-25s\n" $< $@
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(DEBIANDIR)
|
||||
cp -f $< $@
|
||||
|
||||
#### FORTRAN BINDINGS ####
|
||||
|
||||
export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
|
||||
|
||||
forlib : lib
|
||||
$(MAKE) -C fortran lib
|
||||
fortest : forlib
|
||||
$(MAKE) -C fortran test
|
||||
forclean :
|
||||
$(MAKE) -C fortran clean
|
||||
|
||||
|
81
fortran/Makefile
Normal file
81
fortran/Makefile
Normal file
@ -0,0 +1,81 @@
|
||||
FC := gfortran
|
||||
FCNAME := $(notdir $(FC))
|
||||
|
||||
BUILDDIR ?= ../build
|
||||
INCDIR := $(BUILDDIR)/include
|
||||
LIBDIR := $(BUILDDIR)/lib
|
||||
OBJDIR := $(BUILDDIR)/obj
|
||||
|
||||
LIBNAME := libncclfor.so
|
||||
LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
|
||||
LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
|
||||
LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME))
|
||||
|
||||
LIBCUDAFOR := libcudafor.so
|
||||
|
||||
ifneq ($(filter pgf%, $(FCNAME)), )
|
||||
# PGI compiler (pgfortran, pgf90, pgf95)
|
||||
FCMODFLAGS := -module $(INCDIR)
|
||||
FCPREFLAGS := -Mpreprocess
|
||||
FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
|
||||
FCFLAGS := -fast -O3
|
||||
else
|
||||
# non-PGI compilers do not have CUDA support, compile our own CUDA lib
|
||||
CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR)
|
||||
CUDALINK := -L$(CUDA_LIB) -lcudart
|
||||
CUDAFORLINK := -lcudafor
|
||||
ifeq ($(FCNAME), gfortran)
|
||||
FCMODFLAGS := -J$(INCDIR)
|
||||
FCPREFLAGS += -cpp
|
||||
FCFLAGS += -ffree-line-length-none
|
||||
else ifeq ($(FCNAME), ifort)
|
||||
FCMODFLAGS := -module $(INCDIR)
|
||||
FCPREFLAGS += -fpp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(VERBOSE), 0)
|
||||
.SILENT:
|
||||
endif
|
||||
|
||||
lib: $(CUDAFORDEP)
|
||||
$(MAKE) $(LIBDIR)/$(LIBTARGET)
|
||||
|
||||
$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
|
||||
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
|
||||
mkdir -p $(LIBDIR)
|
||||
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
|
||||
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
|
||||
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
|
||||
|
||||
$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
|
||||
@printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@
|
||||
mkdir -p $(LIBDIR)
|
||||
$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
|
||||
|
||||
$(OBJDIR)/%.o: src/%.f90
|
||||
@printf "Building %-35s > %s\n" $< $@
|
||||
mkdir -p $(OBJDIR)
|
||||
mkdir -p $(INCDIR)
|
||||
$(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
|
||||
|
||||
TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
|
||||
ifneq ($(filter pgf%, $(FCNAME)), )
|
||||
TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
|
||||
endif
|
||||
|
||||
TESTDIR := $(BUILDDIR)/test/fortran
|
||||
TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
|
||||
|
||||
test: lib $(TESTBINS)
|
||||
|
||||
$(TESTDIR)/%: test/%.f90 lib
|
||||
@printf "Building %-35s > %s\n" $< $@
|
||||
@mkdir -p $(TESTDIR)
|
||||
$(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
|
||||
|
||||
clean:
|
||||
rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
|
||||
rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
|
||||
rm -rf $(TESTDIR)/
|
||||
|
164
fortran/src/cudafor.f90
Normal file
164
fortran/src/cudafor.f90
Normal file
@ -0,0 +1,164 @@
|
||||
#ifndef _CUDA
|
||||
|
||||
!Start cudaFor module
|
||||
module cudaFor
|
||||
use iso_c_binding
|
||||
implicit none
|
||||
private
|
||||
public :: c_devptr
|
||||
public :: cudaMemcpyKind, &
|
||||
cudaMemcpyHostToHost, &
|
||||
cudaMemcpyHostToDevice, &
|
||||
cudaMemcpyDeviceToHost, &
|
||||
cudaMemcpyDeviceToDevice, &
|
||||
cudaMemcpyDefault
|
||||
public :: cuda_stream_kind
|
||||
public :: cudaGetDeviceCount
|
||||
public :: cudaSetDevice
|
||||
public :: cudaMalloc
|
||||
public :: cudaMemcpy
|
||||
public :: cudaFree
|
||||
public :: cudaStreamCreate
|
||||
public :: cudaStreamSynchronize
|
||||
public :: cudaStreamDestroy
|
||||
|
||||
!Start types
|
||||
|
||||
!Start c_devptr
|
||||
type, bind(c) :: c_devptr
|
||||
type(c_ptr) :: member
|
||||
end type c_devptr
|
||||
!End c_devptr
|
||||
|
||||
!Start cudaMemcpyKind
|
||||
type, bind(c) :: cudaMemcpyKind
|
||||
integer(c_int) :: member
|
||||
end type cudaMemcpyKind
|
||||
|
||||
type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), &
|
||||
cudaMemcpyHostToDevice = cudaMemcpyKind(1), &
|
||||
cudaMemcpyDeviceToHost = cudaMemcpyKind(2), &
|
||||
cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
|
||||
cudaMemcpyDefault = cudaMemcpyKind(4)
|
||||
!End cudaMemcpyKind
|
||||
|
||||
!Start cuda_stream_kind
|
||||
integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
|
||||
!End cuda_stream_kind
|
||||
|
||||
!End types
|
||||
|
||||
!Start interfaces
|
||||
|
||||
!Start cudaGetDeviceCount
|
||||
interface cudaGetDeviceCount
|
||||
integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
|
||||
import :: c_int
|
||||
implicit none
|
||||
integer(c_int) :: count
|
||||
end function cudaGetDeviceCount
|
||||
end interface cudaGetDeviceCount
|
||||
!End cudaGetDeviceCount
|
||||
|
||||
!Start cudaSetDevice
|
||||
interface cudaSetDevice
|
||||
integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
|
||||
import :: c_int
|
||||
implicit none
|
||||
integer(c_int), value :: device
|
||||
end function cudaSetDevice
|
||||
end interface cudaSetDevice
|
||||
!End cudaSetDevice
|
||||
|
||||
!Start cudaMalloc
|
||||
interface cudaMalloc
|
||||
integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
|
||||
import :: c_int, c_size_t
|
||||
import :: c_devptr
|
||||
implicit none
|
||||
type(c_devptr) :: devPtr
|
||||
integer(c_size_t), value :: size
|
||||
end function cudaMalloc
|
||||
end interface cudaMalloc
|
||||
!End cudaMalloc
|
||||
|
||||
!Start cudaMemcpy
|
||||
interface cudaMemcpy
|
||||
|
||||
!Start cudaMemcpyH2D
|
||||
integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
|
||||
import :: c_ptr, c_int, c_size_t
|
||||
import :: c_devptr, cudaMemcpyKind
|
||||
implicit none
|
||||
type(c_devptr), value :: dst
|
||||
type(c_ptr), value :: src
|
||||
integer(c_size_t), value :: count
|
||||
type(cudaMemcpyKind), value :: kind
|
||||
end function cudaMemcpyH2D
|
||||
!End cudaMemcpyH2D
|
||||
|
||||
!Start cudaMemcpyD2H
|
||||
integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
|
||||
import :: c_ptr, c_int, c_size_t
|
||||
import :: c_devptr, cudaMemcpyKind
|
||||
implicit none
|
||||
type(c_ptr), value :: dst
|
||||
type(c_devptr), value :: src
|
||||
integer(c_size_t), value :: count
|
||||
type(cudaMemcpyKind), value :: kind
|
||||
end function cudaMemcpyD2H
|
||||
!End cudaMemcpyD2H
|
||||
|
||||
end interface cudaMemcpy
|
||||
!End cudaMemcpy
|
||||
|
||||
!Start cudaFree
|
||||
interface cudaFree
|
||||
integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
|
||||
import :: c_int
|
||||
import :: c_devptr
|
||||
implicit none
|
||||
type(c_devptr), value :: devPtr
|
||||
end function cudaFree
|
||||
end interface cudaFree
|
||||
!End cudaFree
|
||||
|
||||
!Start cudaStreamCreate
|
||||
interface cudaStreamCreate
|
||||
integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
|
||||
import :: c_int
|
||||
import :: cuda_stream_kind
|
||||
implicit none
|
||||
integer(cuda_stream_kind) :: pStream
|
||||
end function cudaStreamCreate
|
||||
end interface cudaStreamCreate
|
||||
!End cudaStreamCreate
|
||||
|
||||
!Start cudaStreamSynchronize
|
||||
interface cudaStreamSynchronize
|
||||
integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
|
||||
import :: c_int
|
||||
import :: cuda_stream_kind
|
||||
implicit none
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function cudaStreamSynchronize
|
||||
end interface cudaStreamSynchronize
|
||||
!End cudaStreamSynchronize
|
||||
|
||||
!Start cudaStreamDestroy
|
||||
interface cudaStreamDestroy
|
||||
integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
|
||||
import :: c_int
|
||||
import :: cuda_stream_kind
|
||||
implicit none
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function cudaStreamDestroy
|
||||
end interface cudaStreamDestroy
|
||||
!End cudaStreamDestroy
|
||||
|
||||
!End interfaces
|
||||
|
||||
end module cudaFor
|
||||
!End cudaFor module
|
||||
|
||||
#endif
|
305
fortran/src/ncclfor.f90
Normal file
305
fortran/src/ncclfor.f90
Normal file
@ -0,0 +1,305 @@
|
||||
!Start defines
|
||||
#define NCCL_UNIQUE_ID_BYTES 128
|
||||
!End defines
|
||||
|
||||
!Start ncclFor module
|
||||
module ncclFor
|
||||
use iso_c_binding
|
||||
use cudaFor
|
||||
implicit none
|
||||
private
|
||||
public :: ncclUniqueId
|
||||
public :: ncclComm
|
||||
public :: ncclResult, &
|
||||
ncclSuccess, &
|
||||
ncclUnhandledCudaError, &
|
||||
ncclSystemError, &
|
||||
ncclInternalError, &
|
||||
ncclInvalidDevicePointer, &
|
||||
ncclInvalidRank, &
|
||||
ncclUnsupportedDeviceCount, &
|
||||
ncclDeviceNotFound, &
|
||||
ncclInvalidDeviceIndex, &
|
||||
ncclLibWrapperNotSet, &
|
||||
ncclCudaMallocFailed, &
|
||||
ncclRankMismatch, &
|
||||
ncclInvalidArgument, &
|
||||
ncclInvalidType, &
|
||||
ncclInvalidOperation, &
|
||||
nccl_NUM_RESULTS
|
||||
public :: ncclDataType, &
|
||||
ncclChar, &
|
||||
ncclInt, &
|
||||
#ifdef CUDA_HAS_HALF
|
||||
ncclHalf, &
|
||||
#endif
|
||||
ncclFloat, &
|
||||
ncclDouble, &
|
||||
ncclInt64, &
|
||||
ncclUInt64, &
|
||||
nccl_NUM_TYPES
|
||||
public :: ncclRedOp, &
|
||||
ncclSum, &
|
||||
ncclProd, &
|
||||
ncclMax, &
|
||||
ncclMin, &
|
||||
nccl_NUM_OPS
|
||||
public :: ncclGetUniqueId
|
||||
public :: ncclCommInitRank
|
||||
public :: ncclCommInitAll
|
||||
public :: ncclCommCuDevice
|
||||
public :: ncclCommUserRank
|
||||
public :: ncclCommCount
|
||||
public :: ncclCommDestroy
|
||||
public :: ncclReduce
|
||||
public :: ncclAllReduce
|
||||
public :: ncclReduceScatter
|
||||
public :: ncclBcast
|
||||
public :: ncclAllGather
|
||||
|
||||
!Start types
|
||||
|
||||
!Start ncclUniqueId
|
||||
type, bind(c) :: ncclUniqueId
|
||||
character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
|
||||
end type ncclUniqueId
|
||||
!End ncclUniqueId
|
||||
|
||||
!Start ncclComm
|
||||
type, bind(c) :: ncclComm
|
||||
type(c_ptr) :: member
|
||||
end type ncclComm
|
||||
!End ncclComm
|
||||
|
||||
!Start ncclResult
|
||||
type, bind(c) :: ncclResult
|
||||
integer(c_int) :: member
|
||||
end type ncclResult
|
||||
|
||||
type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), &
|
||||
ncclUnhandledCudaError = ncclResult( 1), &
|
||||
ncclSystemError = ncclResult( 2), &
|
||||
ncclInternalError = ncclResult( 3), &
|
||||
ncclInvalidDevicePointer = ncclResult( 4), &
|
||||
ncclInvalidRank = ncclResult( 5), &
|
||||
ncclUnsupportedDeviceCount = ncclResult( 6), &
|
||||
ncclDeviceNotFound = ncclResult( 7), &
|
||||
ncclInvalidDeviceIndex = ncclResult( 8), &
|
||||
ncclLibWrapperNotSet = ncclResult( 9), &
|
||||
ncclCudaMallocFailed = ncclResult(10), &
|
||||
ncclRankMismatch = ncclResult(11), &
|
||||
ncclInvalidArgument = ncclResult(12), &
|
||||
ncclInvalidType = ncclResult(13), &
|
||||
ncclInvalidOperation = ncclResult(14), &
|
||||
nccl_NUM_RESULTS = ncclResult(15)
|
||||
!End ncclResult
|
||||
|
||||
!Start ncclDataType
|
||||
type, bind(c) :: ncclDataType
|
||||
integer(c_int) :: member
|
||||
end type ncclDataType
|
||||
|
||||
type(ncclDataType), parameter :: ncclChar = ncclDataType(0), &
|
||||
ncclInt = ncclDataType(1), &
|
||||
#ifdef CUDA_HAS_HALF
|
||||
ncclHalf = ncclDataType(2), &
|
||||
#endif
|
||||
ncclFloat = ncclDataType(3), &
|
||||
ncclDouble = ncclDataType(4), &
|
||||
ncclInt64 = ncclDataType(5), &
|
||||
ncclUInt64 = ncclDataType(6), &
|
||||
nccl_NUM_TYPES = ncclDataType(7)
|
||||
!End ncclDataType
|
||||
|
||||
!Start ncclRedOp
|
||||
type, bind(c) :: ncclRedOp
|
||||
integer(c_int) :: member
|
||||
end type ncclRedOp
|
||||
|
||||
type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), &
|
||||
ncclProd = ncclRedOp(1), &
|
||||
ncclMax = ncclRedOp(2), &
|
||||
ncclMin = ncclRedOp(3), &
|
||||
nccl_NUM_OPS = ncclRedOp(4)
|
||||
!End ncclRedOp
|
||||
|
||||
!End types
|
||||
|
||||
!Start interfaces
|
||||
|
||||
!Start ncclGetUniqueId
|
||||
interface ncclGetUniqueId
|
||||
type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
|
||||
import :: ncclResult, ncclUniqueId
|
||||
implicit none
|
||||
type(ncclUniqueId) :: uniqueId
|
||||
end function ncclGetUniqueId
|
||||
end interface ncclGetUniqueId
|
||||
!End ncclGetUniqueId
|
||||
|
||||
!Start ncclCommInitRank
|
||||
interface ncclCommInitRank
|
||||
type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
|
||||
import :: c_int
|
||||
import :: ncclResult, ncclUniqueId, ncclComm
|
||||
implicit none
|
||||
type(ncclComm) :: comm(*)
|
||||
integer(c_int), value :: ndev
|
||||
type(ncclUniqueId), value :: commId
|
||||
integer(c_int), value :: rank
|
||||
end function ncclCommInitRank
|
||||
end interface ncclCommInitRank
|
||||
!End ncclCommInitRank
|
||||
|
||||
!Start ncclCommInitAll
|
||||
interface ncclCommInitAll
|
||||
type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
|
||||
import :: c_int
|
||||
import :: ncclResult, ncclComm
|
||||
implicit none
|
||||
type(ncclComm) :: comm(*)
|
||||
integer(c_int), value :: ndev
|
||||
integer(c_int) :: devlist(*)
|
||||
end function ncclCommInitAll
|
||||
end interface ncclCommInitAll
|
||||
!End ncclCommInitAll
|
||||
|
||||
!Start ncclCommCuDevice
|
||||
interface ncclCommCuDevice
|
||||
type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
|
||||
import :: c_int
|
||||
import :: ncclResult, ncclComm
|
||||
implicit none
|
||||
type(ncclComm), value :: comm
|
||||
integer(c_int) :: devid
|
||||
end function ncclCommCuDevice
|
||||
end interface ncclCommCuDevice
|
||||
!End ncclCommCuDevice
|
||||
|
||||
!Start ncclCommUserRank
|
||||
interface ncclCommUserRank
|
||||
type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
|
||||
import :: c_int
|
||||
import :: ncclResult, ncclComm
|
||||
implicit none
|
||||
type(ncclComm), value :: comm
|
||||
integer(c_int) :: rank
|
||||
end function ncclCommUserRank
|
||||
end interface ncclCommUserRank
|
||||
!End ncclCommUserRank
|
||||
|
||||
!Start ncclCommCount
|
||||
interface ncclCommCount
|
||||
type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
|
||||
import :: c_int
|
||||
import :: ncclResult, ncclComm
|
||||
implicit none
|
||||
type(ncclComm), value :: comm
|
||||
integer(c_int) :: count
|
||||
end function ncclCommCount
|
||||
end interface ncclCommCount
|
||||
!End ncclCommCount
|
||||
|
||||
!Start ncclCommDestroy
|
||||
interface ncclCommDestroy
|
||||
subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
|
||||
import :: ncclComm
|
||||
implicit none
|
||||
type(ncclComm), value :: comm
|
||||
end subroutine ncclCommDestroy
|
||||
end interface ncclCommDestroy
|
||||
!End ncclCommDestroy
|
||||
|
||||
!Start ncclReduce
|
||||
interface ncclReduce
|
||||
type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
|
||||
import :: c_int
|
||||
import :: c_devptr, cuda_stream_kind
|
||||
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
|
||||
implicit none
|
||||
type(c_devptr), value :: sendbuff
|
||||
type(c_devptr), value :: recvbuff
|
||||
integer(c_int), value :: count
|
||||
type(ncclDataType), value :: datatype
|
||||
type(ncclRedOp), value :: op
|
||||
integer(c_int), value :: root
|
||||
type(ncclComm), value :: comm
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function ncclReduce
|
||||
end interface ncclReduce
|
||||
!End ncclReduce
|
||||
|
||||
!Start ncclAllReduce
|
||||
interface ncclAllReduce
|
||||
type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
|
||||
import :: c_int
|
||||
import :: c_devptr, cuda_stream_kind
|
||||
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
|
||||
implicit none
|
||||
type(c_devptr), value :: sendbuff
|
||||
type(c_devptr), value :: recvbuff
|
||||
integer(c_int), value :: count
|
||||
type(ncclDataType), value :: datatype
|
||||
type(ncclRedOp), value :: op
|
||||
type(ncclComm), value :: comm
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function ncclAllReduce
|
||||
end interface ncclAllReduce
|
||||
!End ncclAllReduce
|
||||
|
||||
!Start ncclReduceScatter
|
||||
interface ncclReduceScatter
|
||||
type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
|
||||
import :: c_int
|
||||
import :: c_devptr, cuda_stream_kind
|
||||
import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
|
||||
implicit none
|
||||
type(c_devptr), value :: sendbuff
|
||||
type(c_devptr), value :: recvbuff
|
||||
integer(c_int), value :: recvcount
|
||||
type(ncclDataType), value :: datatype
|
||||
type(ncclRedOp), value :: op
|
||||
type(ncclComm), value :: comm
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function ncclReduceScatter
|
||||
end interface ncclReduceScatter
|
||||
!End ncclReduceScatter
|
||||
|
||||
!Start ncclBcast
|
||||
interface ncclBcast
|
||||
type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
|
||||
import :: c_int
|
||||
import :: c_devptr, cuda_stream_kind
|
||||
import :: ncclResult, ncclComm, ncclDataType
|
||||
implicit none
|
||||
type(c_devptr), value :: buff
|
||||
integer(c_int), value :: count
|
||||
type(ncclDataType), value :: datatype
|
||||
integer(c_int), value :: root
|
||||
type(ncclComm), value :: comm
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function ncclBcast
|
||||
end interface ncclBcast
|
||||
!End ncclBcast
|
||||
|
||||
!Start ncclAllGather
|
||||
interface ncclAllGather
|
||||
type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
|
||||
import :: c_int
|
||||
import :: c_devptr, cuda_stream_kind
|
||||
import :: ncclResult, ncclComm, ncclDataType
|
||||
implicit none
|
||||
type(c_devptr), value :: sendbuff
|
||||
integer(c_int), value :: count
|
||||
type(ncclDataType), value :: datatype
|
||||
type(c_devptr), value :: recvbuff
|
||||
type(ncclComm), value :: comm
|
||||
integer(cuda_stream_kind), value :: stream
|
||||
end function ncclAllGather
|
||||
end interface ncclAllGather
|
||||
!End ncclAllGather
|
||||
|
||||
!End interfaces
|
||||
|
||||
end module ncclFor
|
||||
!End nccl module
|
155
fortran/test/allgather_arr_out.f90
Normal file
155
fortran/test/allgather_arr_out.f90
Normal file
@ -0,0 +1,155 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable :: hostBuff(:, :)
|
||||
real(real32), allocatable, device :: sendBuff(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
real(real32), allocatable, device :: recvBuff(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
|
||||
dataType = ncclFloat
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl * nDev, nDev + 1))
|
||||
|
||||
call random_number(hostBuff)
|
||||
|
||||
print "(a)", "before allgather:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(sendBuff(nEl))
|
||||
sendBuffPtr(i) = c_devloc(sendBuff)
|
||||
sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(recvBuff(nEl * nDev))
|
||||
recvBuffPtr(i) = c_devloc(recvBuff)
|
||||
recvBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
|
||||
hostBuff(:, i) = recvBuff
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after allgather:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
|
||||
hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
|
||||
end do
|
||||
|
||||
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a)", ""
|
||||
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
|
||||
deallocate(recvBuff)
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
|
||||
deallocate(sendBuff)
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
164
fortran/test/allgather_ptr_out.f90
Normal file
164
fortran/test/allgather_ptr_out.f90
Normal file
@ -0,0 +1,164 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable, target :: hostBuff(:, :)
|
||||
type(c_ptr), allocatable :: hostBuffPtr(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
|
||||
dataType = ncclFloat
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl * nDev, nDev + 1))
|
||||
|
||||
call random_number(hostBuff)
|
||||
|
||||
print "(a)", "before allgather:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
|
||||
end do
|
||||
|
||||
allocate(hostBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
hostBuffPtr(i) = c_loc(hostBuff(1, i))
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
|
||||
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after allgather:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
end do
|
||||
|
||||
err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a)", ""
|
||||
print "(a, e11.4e2)", "maximum error in sendbuff = ", err
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(recvBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(sendBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
158
fortran/test/allreduce_arr_out.f90
Normal file
158
fortran/test/allreduce_arr_out.f90
Normal file
@ -0,0 +1,158 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclRedOp) :: redOp
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable :: hostBuff(:, :)
|
||||
real(real32), allocatable, device :: sendBuff(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
real(real32), allocatable, device :: recvBuff(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
|
||||
dataType = ncclFloat
|
||||
redOp = ncclProd
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl, nDev + 2))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev + 1))
|
||||
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, 1)
|
||||
do i = 2, nDev
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
|
||||
end do
|
||||
|
||||
print "(a)", "before allreduce:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(sendBuff(nEl))
|
||||
sendBuffPtr(i) = c_devloc(sendBuff)
|
||||
sendBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(recvBuff(nEl))
|
||||
recvBuffPtr(i) = c_devloc(recvBuff)
|
||||
recvBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after allreduce:"
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
|
||||
hostBuff(:, nDev + 1) = recvBuff
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
|
||||
hostBuff(:, nDev + 1) = sendBuff
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
|
||||
deallocate(recvBuff)
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
|
||||
deallocate(sendBuff)
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
159
fortran/test/allreduce_ptr_out.f90
Normal file
159
fortran/test/allreduce_ptr_out.f90
Normal file
@ -0,0 +1,159 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclRedOp) :: redOp
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable, target :: hostBuff(:, :)
|
||||
type(c_ptr), allocatable :: hostBuffPtr(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
|
||||
dataType = ncclFloat
|
||||
redOp = ncclProd
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl, nDev + 2))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev + 1))
|
||||
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, 1)
|
||||
do i = 2, nDev
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
|
||||
end do
|
||||
|
||||
print "(a)", "before allreduce:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
allocate(hostBuffPtr(nDev + 1))
|
||||
|
||||
do i = 1, nDev + 1
|
||||
hostBuffPtr(i) = c_loc(hostBuff(1, i))
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after allreduce:"
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(recvBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(sendBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
130
fortran/test/broadcast_arr.f90
Normal file
130
fortran/test/broadcast_arr.f90
Normal file
@ -0,0 +1,130 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev, root
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable :: hostBuff(:, :)
|
||||
real(real32), allocatable, device :: devBuff(:)
|
||||
type(c_devptr), allocatable :: devBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
! root = 0
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
root = nDev - 1
|
||||
|
||||
dataType = ncclFloat
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl, nDev + 1))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev))
|
||||
|
||||
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
|
||||
|
||||
print "(a)", "before broadcast:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
|
||||
end do
|
||||
|
||||
allocate(devBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(devBuff(nEl))
|
||||
devBuffPtr(i) = c_devloc(devBuff)
|
||||
devBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
|
||||
hostBuff(:, i) = devBuff
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after broadcast:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
|
||||
deallocate(devBuff)
|
||||
end do
|
||||
|
||||
deallocate(devBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
135
fortran/test/broadcast_ptr.f90
Normal file
135
fortran/test/broadcast_ptr.f90
Normal file
@ -0,0 +1,135 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev, root
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable, target :: hostBuff(:, :)
|
||||
type(c_ptr), allocatable :: hostBuffPtr(:)
|
||||
type(c_devptr), allocatable :: devBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
! root = 0
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
root = nDev - 1
|
||||
|
||||
dataType = ncclFloat
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl, nDev + 1))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev))
|
||||
|
||||
hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
|
||||
|
||||
print "(a)", "before broadcast:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
|
||||
end do
|
||||
|
||||
allocate(hostBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
hostBuffPtr(i) = c_loc(hostBuff(1, i))
|
||||
end do
|
||||
|
||||
allocate(devBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after broadcast:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
|
||||
print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(devBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(devBuffPtr)
|
||||
|
||||
deallocate(hostBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
168
fortran/test/reduce_arr_out.f90
Normal file
168
fortran/test/reduce_arr_out.f90
Normal file
@ -0,0 +1,168 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev, root
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclRedOp) :: redOp
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable :: hostBuff(:, :)
|
||||
real(real32), allocatable, device :: sendBuff(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
real(real32), allocatable, device :: recvBuff(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
! root = 0
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
root = nDev - 1
|
||||
|
||||
dataType = ncclFloat
|
||||
redOp = ncclProd
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl, nDev + 2))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev + 1))
|
||||
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, 1)
|
||||
do i = 2, nDev
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
|
||||
end do
|
||||
|
||||
print "(a)", "before reduce:"
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(sendBuff(nEl))
|
||||
sendBuffPtr(i) = c_devloc(sendBuff)
|
||||
sendBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(recvBuff(nEl))
|
||||
recvBuffPtr(i) = c_devloc(recvBuff)
|
||||
recvBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
stat = cudaSetDevice(devList(root + 1))
|
||||
call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
|
||||
hostBuff(:, nDev + 1) = recvBuff
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after reduce:"
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
|
||||
hostBuff(:, nDev + 1) = sendBuff
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
if (i - 1 /= root) then
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
|
||||
hostBuff(:, nDev + 1) = recvBuff
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
|
||||
end if
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
|
||||
deallocate(recvBuff)
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
|
||||
deallocate(sendBuff)
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
168
fortran/test/reduce_ptr_out.f90
Normal file
168
fortran/test/reduce_ptr_out.f90
Normal file
@ -0,0 +1,168 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev, root
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclRedOp) :: redOp
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable, target :: hostBuff(:, :)
|
||||
type(c_ptr), allocatable :: hostBuffPtr(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
! root = 0
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
root = nDev - 1
|
||||
|
||||
dataType = ncclFloat
|
||||
redOp = ncclProd
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl, nDev + 2))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev + 1))
|
||||
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, 1)
|
||||
do i = 2, nDev
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
|
||||
end do
|
||||
|
||||
print "(a)", "before reduce:"
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
|
||||
|
||||
allocate(hostBuffPtr(nDev + 1))
|
||||
|
||||
do i = 1, nDev + 1
|
||||
hostBuffPtr(i) = c_loc(hostBuff(1, i))
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
stat = cudaSetDevice(devList(root + 1))
|
||||
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after reduce:"
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
if (i - 1 /= root) then
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
|
||||
end if
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(recvBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(sendBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
158
fortran/test/reducescatter_arr_out.f90
Normal file
158
fortran/test/reducescatter_arr_out.f90
Normal file
@ -0,0 +1,158 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclRedOp) :: redOp
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable :: hostBuff(:, :)
|
||||
real(real32), allocatable, device :: sendBuff(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
real(real32), allocatable, device :: recvBuff(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
|
||||
dataType = ncclFloat
|
||||
redOp = ncclProd
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl * nDev, nDev + 2))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev + 1))
|
||||
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, 1)
|
||||
do i = 2, nDev
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
|
||||
end do
|
||||
|
||||
print "(a)", "before reducescatter:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(sendBuff(nEl * nDev))
|
||||
sendBuffPtr(i) = c_devloc(sendBuff)
|
||||
sendBuff = hostBuff(:, i)
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
allocate(recvBuff(nEl))
|
||||
recvBuffPtr(i) = c_devloc(recvBuff)
|
||||
recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after reducescatter:"
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
|
||||
hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
|
||||
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
|
||||
hostBuff(:, nDev + 1) = sendBuff
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
|
||||
deallocate(recvBuff)
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
|
||||
deallocate(sendBuff)
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
167
fortran/test/reducescatter_ptr_out.f90
Normal file
167
fortran/test/reducescatter_ptr_out.f90
Normal file
@ -0,0 +1,167 @@
|
||||
program test
|
||||
use iso_c_binding
|
||||
use iso_fortran_env
|
||||
use cudaFor
|
||||
use ncclFor
|
||||
implicit none
|
||||
integer(int32) :: stat, i
|
||||
real(real32) :: err
|
||||
integer(int32) :: nEl, nDev
|
||||
type(ncclDataType) :: dataType
|
||||
type(ncclRedOp) :: redOp
|
||||
type(ncclComm), allocatable :: comm(:)
|
||||
integer(int32), allocatable :: devList(:)
|
||||
type(ncclResult) :: res
|
||||
integer(int32) :: cudaDev, rank
|
||||
integer(cuda_stream_kind), allocatable :: stream(:)
|
||||
integer(int32) :: time(8)
|
||||
integer(int32), allocatable :: seed(:)
|
||||
real(real32), allocatable, target :: hostBuff(:, :)
|
||||
type(c_ptr), allocatable :: hostBuffPtr(:)
|
||||
type(c_devptr), allocatable :: sendBuffPtr(:)
|
||||
type(c_devptr), allocatable :: recvBuffPtr(:)
|
||||
|
||||
nEl = 2621440
|
||||
|
||||
! nDev = 2
|
||||
stat = cudaGetDeviceCount(nDev)
|
||||
|
||||
dataType = ncclFloat
|
||||
redOp = ncclProd
|
||||
|
||||
allocate(comm(nDev))
|
||||
allocate(devList(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
devList(i) = i - 1
|
||||
end do
|
||||
|
||||
res = ncclCommInitAll(comm, nDev, devList)
|
||||
|
||||
do i = 1, nDev
|
||||
res = ncclCommCuDevice(comm(i), cudaDev)
|
||||
res = ncclCommUserRank(comm(i), rank)
|
||||
end do
|
||||
|
||||
allocate(stream(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamCreate(stream(i))
|
||||
end do
|
||||
|
||||
call date_and_time(values = time)
|
||||
call random_seed(size = i)
|
||||
allocate(seed(i))
|
||||
call random_seed(get = seed)
|
||||
seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
|
||||
call random_seed(put = seed)
|
||||
|
||||
allocate(hostBuff(nEl * nDev, nDev + 2))
|
||||
|
||||
call random_number(hostBuff(:, 1:nDev + 1))
|
||||
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, 1)
|
||||
do i = 2, nDev
|
||||
hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
|
||||
end do
|
||||
|
||||
print "(a)", "before reducescatter:"
|
||||
do i = 1, nDev
|
||||
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
allocate(hostBuffPtr(nDev + 1))
|
||||
|
||||
do i = 1, nDev + 1
|
||||
hostBuffPtr(i) = c_loc(hostBuff(1, i))
|
||||
end do
|
||||
|
||||
allocate(sendBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
|
||||
stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
|
||||
end do
|
||||
|
||||
allocate(recvBuffPtr(nDev))
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
|
||||
stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
|
||||
end do
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamSynchronize(stream(i))
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
print "(a)", "after reduceScatter:"
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
|
||||
err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
|
||||
end do
|
||||
|
||||
do i = 1, nDev + 1
|
||||
hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
|
||||
end do
|
||||
|
||||
print "(a)", ""
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
|
||||
err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
|
||||
print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
|
||||
end do
|
||||
print "(a)", ""
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(recvBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(recvBuffPtr)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaFree(sendBuffPtr(i))
|
||||
end do
|
||||
|
||||
deallocate(sendBuffPtr)
|
||||
|
||||
deallocate(hostBuffPtr)
|
||||
|
||||
deallocate(hostBuff)
|
||||
|
||||
deallocate(seed)
|
||||
|
||||
do i = 1, nDev
|
||||
stat = cudaSetDevice(devList(i))
|
||||
stat = cudaStreamDestroy(stream(i))
|
||||
end do
|
||||
|
||||
deallocate(stream)
|
||||
|
||||
do i = 1, nDev
|
||||
call ncclCommDestroy(comm(i))
|
||||
end do
|
||||
|
||||
deallocate(devList)
|
||||
deallocate(comm)
|
||||
|
||||
end program test
|
Loading…
x
Reference in New Issue
Block a user