NCCL 2.4.6-1
Added detection of IBM/Power NVLink bridge device. Add NUMA support to PCI distance calculations. Added NCCL_IGNORE_CPU_AFFINITY env var. Fix memory leaks; GithubIssue#180 Compiler warning fix; GithubIssue#178 Replace non-standard variable length arrays. GithubIssue#171 Fix Tree+Shared Memory crash. GithubPR#185 Fix LL cleanup hang during long running DL jobs. Fix NCCL_RINGS environment variable handling. Added extra checks to catch repeat calls to ncclCommDestroy() GithubIssue#191 Improve bootstrap socket connection reliability at scale. Fix hostname hashing issue. GithubIssue#187 Code cleanup to rename all non device files from *.cu to *.cc
This commit is contained in:
parent
14e0cf644b
commit
f40ce73e89
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions
|
modification, are permitted provided that the following conditions
|
||||||
|
2
Makefile
2
Makefile
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
|
|||||||
|
|
||||||
## Copyright
|
## Copyright
|
||||||
|
|
||||||
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
@ -15,6 +15,7 @@ PROFAPI ?= 0
|
|||||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||||
|
|
||||||
CUDA_LIB ?= $(CUDA_HOME)/lib64
|
CUDA_LIB ?= $(CUDA_HOME)/lib64
|
||||||
|
CUDA_INC ?= $(CUDA_HOME)/include
|
||||||
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||||
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
|
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
|
||||||
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||||
@ -43,7 +44,8 @@ endif
|
|||||||
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
|
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
|
||||||
|
|
||||||
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
||||||
CXXFLAGS += -Wall -Wno-sign-compare
|
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
|
||||||
|
CXXFLAGS += -I $(CUDA_INC)
|
||||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||||
# Use addprefix so that we can specify more than one path
|
# Use addprefix so that we can specify more than one path
|
||||||
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
|
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
|
||||||
@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
ifneq ($(VERBOSE), 0)
|
ifneq ($(VERBOSE), 0)
|
||||||
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
|
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
|
||||||
CXXFLAGS += -Wall -Wextra
|
CXXFLAGS += -Wall -Wextra
|
||||||
else
|
else
|
||||||
.SILENT:
|
.SILENT:
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
##### version
|
##### version
|
||||||
NCCL_MAJOR := 2
|
NCCL_MAJOR := 2
|
||||||
NCCL_MINOR := 4
|
NCCL_MINOR := 4
|
||||||
NCCL_PATCH := 2
|
NCCL_PATCH := 6
|
||||||
NCCL_SUFFIX :=
|
NCCL_SUFFIX :=
|
||||||
PKG_REVISION := 1
|
PKG_REVISION := 1
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
20
src/Makefile
20
src/Makefile
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
@ -9,10 +9,10 @@ include ../makefiles/version.mk
|
|||||||
|
|
||||||
##### src files
|
##### src files
|
||||||
INCEXPORTS := nccl.h nccl_net.h
|
INCEXPORTS := nccl.h nccl_net.h
|
||||||
LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
|
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
|
||||||
misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
|
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
|
||||||
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
|
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
|
||||||
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
|
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
|
||||||
|
|
||||||
##### lib files
|
##### lib files
|
||||||
LIBNAME := libnccl.so
|
LIBNAME := libnccl.so
|
||||||
@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
|
|||||||
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
|
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
|
||||||
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
|
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
|
||||||
STATICLIBTARGET := $(STATICLIBNAME)
|
STATICLIBTARGET := $(STATICLIBNAME)
|
||||||
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
|
LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
|
||||||
DEPFILES := $(LIBOBJ:%.o=%.d)
|
DEPFILES := $(LIBOBJ:%.o=%.d)
|
||||||
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
|
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
|
||||||
|
|
||||||
@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
|
|||||||
mkdir -p $(INCDIR)
|
mkdir -p $(INCDIR)
|
||||||
cp -f $< $@
|
cp -f $< $@
|
||||||
|
|
||||||
$(OBJDIR)/%.o : %.cu
|
$(OBJDIR)/%.o : %.cc
|
||||||
@printf "Compiling %-35s > %s\n" $< $@
|
@printf "Compiling %-35s > %s\n" $< $@
|
||||||
mkdir -p `dirname $@`
|
mkdir -p `dirname $@`
|
||||||
$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
|
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
|
||||||
@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
|
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
|
||||||
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
|
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
|
||||||
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
|
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
|
||||||
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
|
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
|
||||||
@ -107,7 +107,7 @@ install : lib
|
|||||||
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
|
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
|
||||||
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
|
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
|
||||||
|
|
||||||
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
|
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
|
||||||
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
|
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
|
||||||
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
|
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
|
||||||
# as the BUILDDIR variable.
|
# as the BUILDDIR variable.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
|||||||
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
|
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
|
||||||
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
|
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Free the peer structures.
|
||||||
|
CUDACHECK(cudaFree(channel->devPeers));
|
||||||
|
free(channel->peers);
|
||||||
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "primitives.h"
|
#include "primitives.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = blockDim.x - 1;
|
const int nthreads = blockDim.x - 1;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
const int nthreads = args->nThreads;
|
const int nthreads = args->nThreads;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "primitives.h"
|
#include "primitives.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = blockDim.x - 1;
|
const int nthreads = blockDim.x - 1;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = blockDim.x - 1;
|
const int nthreads = blockDim.x - 1;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclTree* tree = &channel->tree;
|
struct ncclTree* tree = &channel->tree;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
const int nthreads = args->nThreads;
|
const int nthreads = args->nThreads;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
|
|
||||||
@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = args->nThreads;
|
const int nthreads = args->nThreads;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclTree* tree = &channel->tree;
|
struct ncclTree* tree = &channel->tree;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "primitives.h"
|
#include "primitives.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = blockDim.x - 1;
|
const int nthreads = blockDim.x - 1;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
const int nthreads = args->nThreads;
|
const int nthreads = args->nThreads;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -8,7 +8,7 @@
|
|||||||
#define NCCL_DEVICE_COMMON_H_
|
#define NCCL_DEVICE_COMMON_H_
|
||||||
|
|
||||||
#include "../collectives.h"
|
#include "../collectives.h"
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
|
||||||
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
|
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
|
||||||
@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
|||||||
int bid = blockIdx.x; \
|
int bid = blockIdx.x; \
|
||||||
__shared__ struct ncclColl localColl; \
|
__shared__ struct ncclColl localColl; \
|
||||||
\
|
\
|
||||||
struct ncclComm* comm = firstColl.args.comm; \
|
struct ncclDevComm* comm = firstColl.args.comm; \
|
||||||
struct ncclChannel* channel = comm->channels+bid; \
|
struct ncclChannel* channel = comm->channels+bid; \
|
||||||
struct ncclColl* c; \
|
struct ncclColl* c; \
|
||||||
if (bid == 0) { \
|
if (bid == 0) { \
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -7,7 +7,7 @@
|
|||||||
#ifndef NCCL_COMMON_KERNEL_H_
|
#ifndef NCCL_COMMON_KERNEL_H_
|
||||||
#define NCCL_COMMON_KERNEL_H_
|
#define NCCL_COMMON_KERNEL_H_
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
|
@ -50,7 +50,7 @@ class ncclPrimitives {
|
|||||||
T* sendDirectBuff[NSEND];
|
T* sendDirectBuff[NSEND];
|
||||||
const T* recvBuff[NRECV];
|
const T* recvBuff[NRECV];
|
||||||
T* sendBuff[NSEND];
|
T* sendBuff[NSEND];
|
||||||
struct ncclComm* comm;
|
struct ncclDevComm* comm;
|
||||||
|
|
||||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||||
@ -239,7 +239,7 @@ class ncclPrimitives {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
|
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||||
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
|
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
|
||||||
// Make sure step is updated before we read it
|
// Make sure step is updated before we read it
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@ -329,14 +329,14 @@ class ncclLLPrimitives {
|
|||||||
uint64_t sendConnHead;
|
uint64_t sendConnHead;
|
||||||
union ncclLLFifoLine* recvBuff[NRECV];
|
union ncclLLFifoLine* recvBuff[NRECV];
|
||||||
union ncclLLFifoLine* sendBuff[NSEND];
|
union ncclLLFifoLine* sendBuff[NSEND];
|
||||||
struct ncclComm* comm;
|
struct ncclDevComm* comm;
|
||||||
|
|
||||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||||
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
|
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
|
||||||
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
|
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
|
||||||
inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
|
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
|
||||||
inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
|
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
|
||||||
|
|
||||||
// Exit If Abort Barrier : make sure all threads exit consistently
|
// Exit If Abort Barrier : make sure all threads exit consistently
|
||||||
// Each thread sets a predicate to true if val == 1
|
// Each thread sets a predicate to true if val == 1
|
||||||
@ -393,7 +393,10 @@ class ncclLLPrimitives {
|
|||||||
sendConnHead = *waitPtr;
|
sendConnHead = *waitPtr;
|
||||||
if (checkAbort(sendConn[i]->opCountRem)) break;
|
if (checkAbort(sendConn[i]->opCountRem)) break;
|
||||||
}
|
}
|
||||||
if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
|
if (fifoPtr) {
|
||||||
|
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
|
||||||
|
fifoPtr[sendStep[i]%NCCL_STEPS] = size;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -402,7 +405,12 @@ class ncclLLPrimitives {
|
|||||||
if (tid == i) *postPtr = recvStep[i];
|
if (tid == i) *postPtr = recvStep[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __device__ void postSend(int i) {
|
inline __device__ void postSend(int i, int offset) {
|
||||||
|
// LL Cleanup : write all flags in the slice to make sure we don't have
|
||||||
|
// data corruption when flag loops over.
|
||||||
|
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
|
||||||
|
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
|
||||||
|
}
|
||||||
sendStep[i]++;
|
sendStep[i]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -443,9 +451,10 @@ class ncclLLPrimitives {
|
|||||||
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
|
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
|
||||||
uint64_t* srcPack = (uint64_t*)srcPtr;
|
uint64_t* srcPack = (uint64_t*)srcPtr;
|
||||||
uint64_t* dstPack = (uint64_t*)dstPtr;
|
uint64_t* dstPack = (uint64_t*)dstPtr;
|
||||||
|
int offset = tid;
|
||||||
// Do multiples of 64 bits
|
// Do multiples of 64 bits
|
||||||
#pragma unroll 2
|
#pragma unroll 2
|
||||||
for (int offset=tid; offset<npack; offset+=nthreads) {
|
for (; offset<npack; offset+=nthreads) {
|
||||||
// Recv : local, then intra-node, then inter-node
|
// Recv : local, then intra-node, then inter-node
|
||||||
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
|
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
|
||||||
if (RECV) {
|
if (RECV) {
|
||||||
@ -471,7 +480,7 @@ class ncclLLPrimitives {
|
|||||||
}
|
}
|
||||||
exitIfAbortLocalBarrier();
|
exitIfAbortLocalBarrier();
|
||||||
FOR_RECV(postRecv);
|
FOR_RECV(postRecv);
|
||||||
FOR_SEND(postSend);
|
FOR_SEND(postSend, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
|
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
|
||||||
@ -514,32 +523,9 @@ class ncclLLPrimitives {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void llSendCleaning(int i) {
|
|
||||||
if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
|
|
||||||
/* Reset all flags */
|
|
||||||
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
|
|
||||||
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
|
|
||||||
for (int s=0; s<NCCL_STEPS; s++) {
|
|
||||||
waitSend(i, 0);
|
|
||||||
for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
|
|
||||||
const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
|
|
||||||
sendPtr(i)[o].i4 = resetLine.i4;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ __forceinline__ void llRecvCleaning(int i) {
|
|
||||||
if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
|
|
||||||
recvStep[i] += NCCL_STEPS;
|
|
||||||
if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
|
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||||
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
|
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
|
||||||
// Make sure step is updated before we read it.
|
// Make sure step is updated before we read it.
|
||||||
barrier();
|
barrier();
|
||||||
@ -577,8 +563,6 @@ class ncclLLPrimitives {
|
|||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ ~ncclLLPrimitives() {
|
__device__ __forceinline__ ~ncclLLPrimitives() {
|
||||||
for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
|
|
||||||
for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
|
|
||||||
// Save steps for the next operation
|
// Save steps for the next operation
|
||||||
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
|
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
|
||||||
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
|
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "primitives.h"
|
#include "primitives.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = blockDim.x - 1;
|
const int nthreads = blockDim.x - 1;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
const int nthreads = args->nThreads;
|
const int nthreads = args->nThreads;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#include "devcomm.h"
|
||||||
#include "primitives.h"
|
#include "primitives.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int nthreads = blockDim.x - 1;
|
const int nthreads = blockDim.x - 1;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
const ssize_t size = args->N;
|
const ssize_t size = args->N;
|
||||||
@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
|
|||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int bid = args->bid;
|
const int bid = args->bid;
|
||||||
const int nthreads = args->nThreads;
|
const int nthreads = args->nThreads;
|
||||||
struct ncclComm* comm = args->comm;
|
struct ncclDevComm* comm = args->comm;
|
||||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||||
struct ncclRing* ring = &channel->ring;
|
struct ncclRing* ring = &channel->ring;
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
|
|||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
|
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
|
||||||
params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
|
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
|
||||||
|
|
||||||
// Set active = 2 for the last operation
|
// Set active = 2 for the last operation
|
||||||
for (int r=0; r<params->gridDim.x; r++) {
|
for (int r=0; r<params->gridDim.x; r++) {
|
||||||
@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
|||||||
|
|
||||||
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
|
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
|
||||||
// Compute thresholds and limits that users can override
|
// Compute thresholds and limits that users can override
|
||||||
int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
|
ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
|
||||||
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
|
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
|
||||||
|
|
||||||
// First compute nThreads
|
// First compute nThreads
|
||||||
@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
|
|||||||
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
|
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
|
||||||
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
|
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
|
||||||
|
|
||||||
info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
|
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
|
||||||
if (info->comm->userStreamSet == false) {
|
if (info->comm->userStreamSet == false) {
|
||||||
info->comm->userStream = info->stream;
|
info->comm->userStream = info->stream;
|
||||||
info->comm->userStreamSet = true;
|
info->comm->userStreamSet = true;
|
51
src/include/alloc.h
Normal file
51
src/include/alloc.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NCCL_ALLOC_H_
|
||||||
|
#define NCCL_ALLOC_H_
|
||||||
|
|
||||||
|
#include "nccl.h"
|
||||||
|
#include "checks.h"
|
||||||
|
#include <sys/mman.h>
|
||||||
|
|
||||||
|
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
|
||||||
|
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
|
||||||
|
memset(*ptr, 0, size);
|
||||||
|
*devPtr = *ptr;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||||
|
CUDACHECK(cudaFreeHost(ptr));
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||||
|
void* p = malloc(nelem*sizeof(T));
|
||||||
|
if (p == NULL) {
|
||||||
|
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
|
memset(p, 0, nelem*sizeof(T));
|
||||||
|
*ptr = (T*)p;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
|
||||||
|
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
|
||||||
|
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||||
|
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
15
src/include/argcheck.h
Normal file
15
src/include/argcheck.h
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NCCL_ARGCHECK_H_
|
||||||
|
#define NCCL_ARGCHECK_H_
|
||||||
|
|
||||||
|
#include "core.h"
|
||||||
|
|
||||||
|
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
||||||
|
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
||||||
|
|
||||||
|
#endif
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,10 +1,73 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "core.h"
|
#ifndef NCCL_CHECKS_H_
|
||||||
|
#define NCCL_CHECKS_H_
|
||||||
|
|
||||||
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
#include "debug.h"
|
||||||
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
|
||||||
|
// Check CUDA calls
|
||||||
|
#define CUDACHECK(cmd) do { \
|
||||||
|
cudaError_t e = cmd; \
|
||||||
|
if( e != cudaSuccess ) { \
|
||||||
|
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
|
||||||
|
return ncclUnhandledCudaError; \
|
||||||
|
} \
|
||||||
|
} while(false)
|
||||||
|
|
||||||
|
#define CUDACHECKGOTO(cmd, res, label) do { \
|
||||||
|
cudaError_t e = cmd; \
|
||||||
|
if( e != cudaSuccess ) { \
|
||||||
|
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
|
||||||
|
res = ncclUnhandledCudaError; \
|
||||||
|
goto label; \
|
||||||
|
} \
|
||||||
|
} while(false)
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
|
// Check system calls
|
||||||
|
#define SYSCHECK(call, name) do { \
|
||||||
|
int retval; \
|
||||||
|
SYSCHECKVAL(call, name, retval); \
|
||||||
|
} while (false)
|
||||||
|
|
||||||
|
#define SYSCHECKVAL(call, name, retval) do { \
|
||||||
|
SYSCHECKSYNC(call, name, retval); \
|
||||||
|
if (retval == -1) { \
|
||||||
|
WARN("Call to " name " failed : %s", strerror(errno)); \
|
||||||
|
return ncclSystemError; \
|
||||||
|
} \
|
||||||
|
} while (false)
|
||||||
|
|
||||||
|
#define SYSCHECKSYNC(call, name, retval) do { \
|
||||||
|
retval = call; \
|
||||||
|
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
|
||||||
|
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
|
||||||
|
} else { \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
} while(true)
|
||||||
|
|
||||||
|
// Propagate errors up
|
||||||
|
#define NCCLCHECK(call) do { \
|
||||||
|
ncclResult_t res = call; \
|
||||||
|
if (res != ncclSuccess) { \
|
||||||
|
/* Print the back trace*/ \
|
||||||
|
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||||
|
return res; \
|
||||||
|
} \
|
||||||
|
} while (0);
|
||||||
|
|
||||||
|
#define NCCLCHECKGOTO(call, res, label) do { \
|
||||||
|
res = call; \
|
||||||
|
if (res != ncclSuccess) { \
|
||||||
|
/* Print the back trace*/ \
|
||||||
|
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||||
|
goto label; \
|
||||||
|
} \
|
||||||
|
} while (0);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
127
src/include/comm.h
Normal file
127
src/include/comm.h
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NCCL_COMM_H_
|
||||||
|
#define NCCL_COMM_H_
|
||||||
|
|
||||||
|
#if CUDART_VERSION < 9000
|
||||||
|
struct cudaLaunchParams {
|
||||||
|
void *func;
|
||||||
|
dim3 gridDim;
|
||||||
|
dim3 blockDim;
|
||||||
|
void **args;
|
||||||
|
size_t sharedMem;
|
||||||
|
cudaStream_t stream;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MAXCHANNELS 16
|
||||||
|
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
||||||
|
|
||||||
|
#define CACHE_LINE_SIZE 128
|
||||||
|
#define MEM_ALIGN 4096
|
||||||
|
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
|
||||||
|
|
||||||
|
struct ncclSendMem {
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
uint64_t head;
|
||||||
|
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||||
|
void* ptrExchange;
|
||||||
|
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
|
||||||
|
uint64_t opCount;
|
||||||
|
};
|
||||||
|
char pad3[MEM_ALIGN];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ncclRecvMem {
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
uint64_t tail;
|
||||||
|
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||||
|
uint64_t opCount;
|
||||||
|
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||||
|
int sizesFifo[NCCL_STEPS];
|
||||||
|
};
|
||||||
|
char pad4[MEM_ALIGN];
|
||||||
|
};
|
||||||
|
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
|
||||||
|
char buff[1]; // Actually larger than that
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ncclComm {
|
||||||
|
struct ncclChannel channels[MAXCHANNELS];
|
||||||
|
|
||||||
|
struct ncclPeerInfo* peerInfo;
|
||||||
|
|
||||||
|
void* bootstrap;
|
||||||
|
|
||||||
|
int rank; // my rank in the communicator
|
||||||
|
int nRanks; // number of GPUs in communicator
|
||||||
|
int cudaDev; // my cuda device index
|
||||||
|
int nvmlDev; // my NVML device number
|
||||||
|
|
||||||
|
enum { GROUP, PARALLEL } launchMode;
|
||||||
|
cudaStream_t userStream;
|
||||||
|
bool userStreamSet;
|
||||||
|
cudaEvent_t doneEvent;
|
||||||
|
bool checkPointers;
|
||||||
|
|
||||||
|
// Counter to make sure collectives match (needed for bcast/reduce
|
||||||
|
// where syncs are not symmetric).
|
||||||
|
uint64_t opCount;
|
||||||
|
|
||||||
|
// Channels for collectives
|
||||||
|
int nChannels;
|
||||||
|
int nThreads;
|
||||||
|
|
||||||
|
// Low-latency algorithm threshold
|
||||||
|
ssize_t llThreshold;
|
||||||
|
ssize_t threadThreshold;
|
||||||
|
|
||||||
|
// Tree algorithm threshold
|
||||||
|
ssize_t treeThreshold;
|
||||||
|
|
||||||
|
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||||
|
int groupCudaStream;
|
||||||
|
cudaStream_t groupStream;
|
||||||
|
|
||||||
|
// Whether there has been a fatal error in this communicator.
|
||||||
|
ncclResult_t fatalError;
|
||||||
|
|
||||||
|
// Error reported by GPU
|
||||||
|
volatile ncclDevError_t* fatalDevError;
|
||||||
|
|
||||||
|
// Flag to ask NCCL kernels to abort
|
||||||
|
volatile uint32_t *abortFlag;
|
||||||
|
|
||||||
|
// Device side of the communicator
|
||||||
|
struct ncclDevComm *devComm;
|
||||||
|
// Host copy of the devComm (to free CUDA allocs)
|
||||||
|
struct ncclDevComm hostDevComm;
|
||||||
|
|
||||||
|
// Intra-process sync
|
||||||
|
int intraRank;
|
||||||
|
int intraRanks;
|
||||||
|
int* intraBarrier;
|
||||||
|
int intraPhase;
|
||||||
|
|
||||||
|
// Storage for deferred intra-process launch
|
||||||
|
struct cudaLaunchParams * intraParams;
|
||||||
|
struct cudaLaunchParams *myParams;
|
||||||
|
int* intraCudaDevs;
|
||||||
|
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||||
|
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||||
|
struct ncclColl args;
|
||||||
|
void* argsptr;
|
||||||
|
|
||||||
|
// Global proxy thread
|
||||||
|
pthread_t proxyThread;
|
||||||
|
struct ncclProxyState proxyState;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -7,385 +7,20 @@
|
|||||||
#ifndef NCCL_CORE_H_
|
#ifndef NCCL_CORE_H_
|
||||||
#define NCCL_CORE_H_
|
#define NCCL_CORE_H_
|
||||||
|
|
||||||
#define NCCL_MAX_OPS 2048
|
#include <pthread.h>
|
||||||
#define NCCL_STEPS 8
|
#include <algorithm>
|
||||||
|
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
#include "transport.h"
|
|
||||||
#include "debug.h"
|
#include "debug.h"
|
||||||
|
#include "checks.h"
|
||||||
|
#include "alloc.h"
|
||||||
|
#include "transport.h"
|
||||||
|
#include "devcomm.h"
|
||||||
|
#include "comm.h"
|
||||||
|
#include "info.h"
|
||||||
|
#include "argcheck.h"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <algorithm> // std::min/std::max
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <cuda_runtime.h>
|
|
||||||
|
|
||||||
#if CUDART_VERSION < 9000
|
|
||||||
struct cudaLaunchParams {
|
|
||||||
void *func;
|
|
||||||
dim3 gridDim;
|
|
||||||
dim3 blockDim;
|
|
||||||
void **args;
|
|
||||||
size_t sharedMem;
|
|
||||||
cudaStream_t stream;
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define MAXCHANNELS 16
|
|
||||||
#define MAXTHREADS 256
|
|
||||||
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
|
||||||
|
|
||||||
// Channels / LL tuning
|
|
||||||
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
|
|
||||||
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
|
|
||||||
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
|
|
||||||
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
|
|
||||||
#define NCCL_LL_MIN_NTHREADS 64
|
|
||||||
|
|
||||||
#define DIVUP(x, y) \
|
|
||||||
(((x)+(y)-1)/(y))
|
|
||||||
#define ROUNDUP(x, y) \
|
|
||||||
(DIVUP((x), (y))*(y))
|
|
||||||
|
|
||||||
#define ALIGN_SIZE(size, align) \
|
|
||||||
size = ((size + (align) - 1) / (align)) * (align);
|
|
||||||
|
|
||||||
union ncclLLFifoLine {
|
|
||||||
/* Flags have to be *after* data, because otherwise, an incomplete receive
|
|
||||||
from the network may receive the flag but not the data.
|
|
||||||
Note this is assuming that either we receive contiguous chunks of data
|
|
||||||
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
|
|
||||||
struct {
|
|
||||||
uint32_t data1;
|
|
||||||
uint32_t flag1;
|
|
||||||
uint32_t data2;
|
|
||||||
uint32_t flag2;
|
|
||||||
};
|
|
||||||
uint64_t v[2];
|
|
||||||
int4 i4;
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
|
|
||||||
|
|
||||||
typedef enum {
|
|
||||||
ncclPatternRing,
|
|
||||||
ncclPatternRingTwice,
|
|
||||||
ncclPatternPipelineFrom,
|
|
||||||
ncclPatternPipelineTo,
|
|
||||||
ncclPatternTreeUp,
|
|
||||||
ncclPatternTreeDown,
|
|
||||||
ncclPatternTreeUpDown
|
|
||||||
} ncclPattern_t;
|
|
||||||
|
|
||||||
typedef enum {
|
|
||||||
ncclDevSuccess,
|
|
||||||
ncclDevAssertedMismatch,
|
|
||||||
ncclDevSuspectedMismatch
|
|
||||||
} ncclDevError_t;
|
|
||||||
|
|
||||||
// Used to pass NCCL call information between functions
|
|
||||||
struct ncclInfo {
|
|
||||||
ncclColl_t coll;
|
|
||||||
const char* opName;
|
|
||||||
// NCCL Coll Args
|
|
||||||
const void* sendbuff;
|
|
||||||
void* recvbuff;
|
|
||||||
size_t count;
|
|
||||||
ncclDataType_t datatype;
|
|
||||||
ncclRedOp_t op;
|
|
||||||
int root;
|
|
||||||
ncclComm_t comm;
|
|
||||||
cudaStream_t stream;
|
|
||||||
// Algorithm details
|
|
||||||
int chunkSteps;
|
|
||||||
int sliceSteps;
|
|
||||||
// Computed later
|
|
||||||
ncclPattern_t pattern;
|
|
||||||
size_t nBytes;
|
|
||||||
int nstepsPerLoop;
|
|
||||||
int nchunksPerLoop;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ncclConnInfo {
|
|
||||||
// Regular comm mechanism
|
|
||||||
char *buff; // Local for recv, remote for send
|
|
||||||
uint64_t *tail; // Local for recv, remote for send
|
|
||||||
uint64_t *head; // Local for send, remote for recv
|
|
||||||
uint64_t *opCountLoc; // opCount of local rank
|
|
||||||
uint64_t *opCountRem; // opCount of remote rank
|
|
||||||
|
|
||||||
int direct; // Direct communication
|
|
||||||
void **ptrExchange; // Pointer exchange for direct communication
|
|
||||||
|
|
||||||
int *fifo; // Size fifo for proxy
|
|
||||||
|
|
||||||
uint64_t step; // Keep where we are
|
|
||||||
|
|
||||||
// Low latency mechanism
|
|
||||||
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
|
|
||||||
uint64_t llLastCleaning;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ncclConnector {
|
|
||||||
int connected;
|
|
||||||
struct ncclProxyArgs *proxyAppend;
|
|
||||||
struct ncclTransportComm* transportComm;
|
|
||||||
void* transportResources; // Host-side resources
|
|
||||||
struct ncclConnInfo conn;
|
|
||||||
struct ncclComm *comm;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define CACHE_LINE_SIZE 128
|
|
||||||
#define MEM_ALIGN 4096
|
|
||||||
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
|
|
||||||
|
|
||||||
#define NUM_LINES_PER_THREAD 8
|
|
||||||
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
|
|
||||||
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
|
|
||||||
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
|
|
||||||
#define NCCL_LL_CLEAN_FREQ 0x10000000
|
|
||||||
|
|
||||||
struct ncclSendMem {
|
|
||||||
union {
|
|
||||||
struct {
|
|
||||||
uint64_t head;
|
|
||||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
|
||||||
void* ptrExchange;
|
|
||||||
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
|
|
||||||
uint64_t opCount;
|
|
||||||
};
|
|
||||||
char pad3[MEM_ALIGN];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ncclRecvMem {
|
|
||||||
union {
|
|
||||||
struct {
|
|
||||||
uint64_t tail;
|
|
||||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
|
||||||
uint64_t opCount;
|
|
||||||
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
|
||||||
int sizesFifo[NCCL_STEPS];
|
|
||||||
};
|
|
||||||
char pad4[MEM_ALIGN];
|
|
||||||
};
|
|
||||||
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
|
|
||||||
char buff[1]; // Actually larger than that
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ncclRing {
|
|
||||||
// Shortcuts for userRanks[1] and userRanks[n-1]
|
|
||||||
int prev;
|
|
||||||
int next;
|
|
||||||
|
|
||||||
// Maps an internal nccl index to user-specified rank order. This is necessary
|
|
||||||
// since we need to know how the user expects data to be ordered across
|
|
||||||
// devices. Ordered from current device.
|
|
||||||
int* userRanks;
|
|
||||||
int* devUserRanks;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define NCCL_MAX_TREE_ARITY 3
|
|
||||||
struct ncclTree {
|
|
||||||
int depth;
|
|
||||||
int up;
|
|
||||||
int down[NCCL_MAX_TREE_ARITY];
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ncclPeer {
|
|
||||||
struct ncclConnector send;
|
|
||||||
struct ncclConnector recv;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ncclChannel {
|
|
||||||
union {
|
|
||||||
struct {
|
|
||||||
struct ncclRing ring;
|
|
||||||
struct ncclTree tree;
|
|
||||||
|
|
||||||
int id;
|
|
||||||
int nthreads;
|
|
||||||
int buffSize;
|
|
||||||
|
|
||||||
// Communication structures
|
|
||||||
struct ncclPeer* peers;
|
|
||||||
struct ncclPeer* devPeers;
|
|
||||||
|
|
||||||
// Operation list for aggregation
|
|
||||||
struct ncclColl* collectives;
|
|
||||||
struct ncclColl* devCollectives;
|
|
||||||
int collStart;
|
|
||||||
int collCount;
|
|
||||||
int collFifoHead; // Only used by GPU
|
|
||||||
int collFifoTail; // Only used by CPU
|
|
||||||
};
|
|
||||||
int data[0x80];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
|
||||||
|
|
||||||
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
|
||||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
|
||||||
/* Make sure to adjust padding at the end of ncclColl. */
|
|
||||||
struct CollectiveArgs {
|
|
||||||
struct ncclComm* comm;
|
|
||||||
uint64_t opCount;
|
|
||||||
|
|
||||||
// local and remote input, output, and buffer
|
|
||||||
const void * ThisInput;
|
|
||||||
void * ThisOutput;
|
|
||||||
|
|
||||||
// general parameters
|
|
||||||
size_t N;
|
|
||||||
uint32_t root;
|
|
||||||
uint8_t bid;
|
|
||||||
uint8_t nChannels;
|
|
||||||
uint16_t nThreads;
|
|
||||||
|
|
||||||
int lastChunkSize;
|
|
||||||
};
|
|
||||||
struct ncclColl {
|
|
||||||
union {
|
|
||||||
struct {
|
|
||||||
struct CollectiveArgs args;
|
|
||||||
uint16_t funcIndex;
|
|
||||||
uint16_t nextIndex;
|
|
||||||
uint8_t active;
|
|
||||||
};
|
|
||||||
int data[0x10];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
|
|
||||||
|
|
||||||
struct ncclComm {
|
|
||||||
struct ncclChannel channels[MAXCHANNELS];
|
|
||||||
|
|
||||||
struct ncclPeerInfo* peerInfo;
|
|
||||||
|
|
||||||
void* bootstrap;
|
|
||||||
|
|
||||||
int rank; // my rank in the communicator
|
|
||||||
int nRanks; // number of GPUs in communicator
|
|
||||||
int cudaDev; // my cuda device index
|
|
||||||
int nvmlDev; // my NVML device number
|
|
||||||
|
|
||||||
enum { GROUP, PARALLEL } launchMode;
|
|
||||||
cudaStream_t userStream;
|
|
||||||
bool userStreamSet;
|
|
||||||
cudaEvent_t doneEvent;
|
|
||||||
bool checkPointers;
|
|
||||||
|
|
||||||
// Counter to make sure collectives match (needed for bcast/reduce
|
|
||||||
// where syncs are not symmetric).
|
|
||||||
uint64_t opCount;
|
|
||||||
|
|
||||||
// Channels for collectives
|
|
||||||
int nChannels;
|
|
||||||
int nThreads;
|
|
||||||
|
|
||||||
// Low-latency algorithm threshold
|
|
||||||
ssize_t llThreshold;
|
|
||||||
ssize_t threadThreshold;
|
|
||||||
|
|
||||||
// Tree algorithm threshold
|
|
||||||
ssize_t treeThreshold;
|
|
||||||
|
|
||||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
|
||||||
int groupCudaStream;
|
|
||||||
cudaStream_t groupStream;
|
|
||||||
|
|
||||||
// Whether there has been a fatal error in this communicator.
|
|
||||||
ncclResult_t fatalError;
|
|
||||||
|
|
||||||
// Error reported by GPU
|
|
||||||
volatile ncclDevError_t* fatalDevError;
|
|
||||||
|
|
||||||
// On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
|
|
||||||
// On device: this pointer has been obtained from cudaHostGetDevicePointer()
|
|
||||||
volatile uint32_t *abortFlag;
|
|
||||||
|
|
||||||
// Device copy of the communicator
|
|
||||||
struct ncclComm *devComm;
|
|
||||||
|
|
||||||
// Intra-process sync
|
|
||||||
int intraRank;
|
|
||||||
int intraRanks;
|
|
||||||
int* intraBarrier;
|
|
||||||
int intraPhase;
|
|
||||||
|
|
||||||
// Storage for deferred intra-process launch
|
|
||||||
struct cudaLaunchParams * intraParams;
|
|
||||||
struct cudaLaunchParams *myParams;
|
|
||||||
int* intraCudaDevs;
|
|
||||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
|
||||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
|
||||||
struct ncclColl args;
|
|
||||||
void* argsptr;
|
|
||||||
|
|
||||||
// Global proxy thread
|
|
||||||
pthread_t proxyThread;
|
|
||||||
struct ncclProxyState proxyState;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Check CUDA calls
|
|
||||||
#define CUDACHECK(cmd) do { \
|
|
||||||
cudaError_t e = cmd; \
|
|
||||||
if( e != cudaSuccess ) { \
|
|
||||||
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
|
|
||||||
return ncclUnhandledCudaError; \
|
|
||||||
} \
|
|
||||||
} while(false)
|
|
||||||
|
|
||||||
#define CUDACHECKGOTO(cmd, res, label) do { \
|
|
||||||
cudaError_t e = cmd; \
|
|
||||||
if( e != cudaSuccess ) { \
|
|
||||||
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
|
|
||||||
res = ncclUnhandledCudaError; \
|
|
||||||
goto label; \
|
|
||||||
} \
|
|
||||||
} while(false)
|
|
||||||
|
|
||||||
#include <errno.h>
|
|
||||||
// Check system calls
|
|
||||||
#define SYSCHECK(call, name) do { \
|
|
||||||
int retval; \
|
|
||||||
SYSCHECKVAL(call, name, retval); \
|
|
||||||
} while (false)
|
|
||||||
|
|
||||||
#define SYSCHECKVAL(call, name, retval) do { \
|
|
||||||
SYSCHECKSYNC(call, name, retval); \
|
|
||||||
if (retval == -1) { \
|
|
||||||
WARN("Call to " name " failed : %s", strerror(errno)); \
|
|
||||||
return ncclSystemError; \
|
|
||||||
} \
|
|
||||||
} while (false)
|
|
||||||
|
|
||||||
#define SYSCHECKSYNC(call, name, retval) do { \
|
|
||||||
retval = call; \
|
|
||||||
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
|
|
||||||
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
|
|
||||||
} else { \
|
|
||||||
break; \
|
|
||||||
} \
|
|
||||||
} while(true)
|
|
||||||
|
|
||||||
// Propagate errors up
|
|
||||||
#define NCCLCHECK(call) do { \
|
|
||||||
ncclResult_t res = call; \
|
|
||||||
if (res != ncclSuccess) { \
|
|
||||||
/* Print the back trace*/ \
|
|
||||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
|
||||||
return res; \
|
|
||||||
} \
|
|
||||||
} while (0);
|
|
||||||
|
|
||||||
#define NCCLCHECKGOTO(call, res, label) do { \
|
|
||||||
res = call; \
|
|
||||||
if (res != ncclSuccess) { \
|
|
||||||
/* Print the back trace*/ \
|
|
||||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
|
||||||
goto label; \
|
|
||||||
} \
|
|
||||||
} while (0);
|
|
||||||
|
|
||||||
#ifdef PROFAPI
|
#ifdef PROFAPI
|
||||||
#define NCCL_API(ret, func, args...) \
|
#define NCCL_API(ret, func, args...) \
|
||||||
@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#include <sys/mman.h>
|
|
||||||
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
|
|
||||||
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
|
|
||||||
memset(*ptr, 0, size);
|
|
||||||
*devPtr = *ptr;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
|
||||||
CUDACHECK(cudaFreeHost(ptr));
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
|
||||||
void* p = malloc(nelem*sizeof(T));
|
|
||||||
if (p == NULL) {
|
|
||||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
memset(p, 0, nelem*sizeof(T));
|
|
||||||
*ptr = (T*)p;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
|
|
||||||
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
|
|
||||||
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
|
||||||
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // end include guard
|
#endif // end include guard
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -24,7 +24,7 @@ extern int ncclDebugLevel;
|
|||||||
extern uint64_t ncclDebugMask;
|
extern uint64_t ncclDebugMask;
|
||||||
extern pthread_mutex_t ncclDebugOutputLock;
|
extern pthread_mutex_t ncclDebugOutputLock;
|
||||||
extern FILE *ncclDebugFile;
|
extern FILE *ncclDebugFile;
|
||||||
extern ncclResult_t getHostName(char* hostname, int maxlen);
|
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||||
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
|
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
|
||||||
|
|
||||||
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||||
@ -108,7 +108,7 @@ static inline void initDebug() {
|
|||||||
break;
|
break;
|
||||||
case 'h': // %h = hostname
|
case 'h': // %h = hostname
|
||||||
char hostname[1024];
|
char hostname[1024];
|
||||||
getHostName(hostname, 1024);
|
getHostName(hostname, 1024, '.');
|
||||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||||
break;
|
break;
|
||||||
case 'p': // %p = pid
|
case 'p': // %p = pid
|
||||||
|
194
src/include/devcomm.h
Normal file
194
src/include/devcomm.h
Normal file
@ -0,0 +1,194 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NCCL_DEVICE_H_
|
||||||
|
#define NCCL_DEVICE_H_
|
||||||
|
|
||||||
|
#include "nccl.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define NCCL_MAX_OPS 2048
|
||||||
|
#define NCCL_STEPS 8
|
||||||
|
|
||||||
|
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
|
||||||
|
|
||||||
|
#define DIVUP(x, y) \
|
||||||
|
(((x)+(y)-1)/(y))
|
||||||
|
#define ROUNDUP(x, y) \
|
||||||
|
(DIVUP((x), (y))*(y))
|
||||||
|
|
||||||
|
#define ALIGN_SIZE(size, align) \
|
||||||
|
size = ((size + (align) - 1) / (align)) * (align);
|
||||||
|
|
||||||
|
union ncclLLFifoLine {
|
||||||
|
/* Flags have to be *after* data, because otherwise, an incomplete receive
|
||||||
|
from the network may receive the flag but not the data.
|
||||||
|
Note this is assuming that either we receive contiguous chunks of data
|
||||||
|
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
|
||||||
|
struct {
|
||||||
|
uint32_t data1;
|
||||||
|
uint32_t flag1;
|
||||||
|
uint32_t data2;
|
||||||
|
uint32_t flag2;
|
||||||
|
};
|
||||||
|
uint64_t v[2];
|
||||||
|
int4 i4;
|
||||||
|
};
|
||||||
|
|
||||||
|
#define MAXTHREADS 256
|
||||||
|
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
|
||||||
|
#define NUM_LINES_PER_THREAD 8
|
||||||
|
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
|
||||||
|
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
|
||||||
|
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
|
||||||
|
#ifdef DEBUG_LL
|
||||||
|
#define NCCL_LL_CLEAN_MASK 0x00000ff8
|
||||||
|
#define NCCL_LL_FLAG_MAX 0x00001000
|
||||||
|
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
|
||||||
|
#else
|
||||||
|
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
|
||||||
|
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
|
||||||
|
#endif
|
||||||
|
// Make sure the clean mask will last for at least NCCL_NSTEPS
|
||||||
|
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
||||||
|
|
||||||
|
struct ncclConnInfo {
|
||||||
|
// Regular comm mechanism
|
||||||
|
char *buff; // Local for recv, remote for send
|
||||||
|
uint64_t *tail; // Local for recv, remote for send
|
||||||
|
uint64_t *head; // Local for send, remote for recv
|
||||||
|
uint64_t *opCountLoc; // opCount of local rank
|
||||||
|
uint64_t *opCountRem; // opCount of remote rank
|
||||||
|
|
||||||
|
int direct; // Direct communication
|
||||||
|
void **ptrExchange; // Pointer exchange for direct communication
|
||||||
|
|
||||||
|
int *fifo; // Size fifo for proxy
|
||||||
|
|
||||||
|
uint64_t step; // Keep where we are
|
||||||
|
|
||||||
|
// Low latency mechanism
|
||||||
|
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
|
||||||
|
uint64_t llLastCleaning;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ncclConnector {
|
||||||
|
int connected;
|
||||||
|
struct ncclProxyArgs *proxyAppend;
|
||||||
|
struct ncclTransportComm* transportComm;
|
||||||
|
void* transportResources; // Host-side resources
|
||||||
|
struct ncclConnInfo conn;
|
||||||
|
struct ncclComm *comm;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ncclRing {
|
||||||
|
// Shortcuts for userRanks[1] and userRanks[n-1]
|
||||||
|
int prev;
|
||||||
|
int next;
|
||||||
|
|
||||||
|
// Maps an internal nccl index to user-specified rank order. This is necessary
|
||||||
|
// since we need to know how the user expects data to be ordered across
|
||||||
|
// devices. Ordered from current device.
|
||||||
|
int* userRanks;
|
||||||
|
int* devUserRanks;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#define NCCL_MAX_TREE_ARITY 3
|
||||||
|
struct ncclTree {
|
||||||
|
int depth;
|
||||||
|
int up;
|
||||||
|
int down[NCCL_MAX_TREE_ARITY];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ncclPeer {
|
||||||
|
struct ncclConnector send;
|
||||||
|
struct ncclConnector recv;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ncclDevComm;
|
||||||
|
|
||||||
|
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
||||||
|
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||||
|
/* Make sure to adjust padding at the end of ncclColl. */
|
||||||
|
struct CollectiveArgs {
|
||||||
|
struct ncclDevComm* comm;
|
||||||
|
uint64_t opCount;
|
||||||
|
|
||||||
|
// local and remote input, output, and buffer
|
||||||
|
const void * ThisInput;
|
||||||
|
void * ThisOutput;
|
||||||
|
|
||||||
|
// general parameters
|
||||||
|
size_t N;
|
||||||
|
uint32_t root;
|
||||||
|
uint8_t bid;
|
||||||
|
uint8_t nChannels;
|
||||||
|
uint16_t nThreads;
|
||||||
|
|
||||||
|
int lastChunkSize;
|
||||||
|
};
|
||||||
|
struct ncclColl {
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
struct CollectiveArgs args;
|
||||||
|
uint16_t funcIndex;
|
||||||
|
uint16_t nextIndex;
|
||||||
|
uint8_t active;
|
||||||
|
};
|
||||||
|
int data[0x10];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
|
||||||
|
|
||||||
|
struct ncclChannel {
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
struct ncclRing ring;
|
||||||
|
struct ncclTree tree;
|
||||||
|
|
||||||
|
int id;
|
||||||
|
int nthreads;
|
||||||
|
int buffSize;
|
||||||
|
|
||||||
|
// Communication structures
|
||||||
|
struct ncclPeer* peers;
|
||||||
|
struct ncclPeer* devPeers;
|
||||||
|
|
||||||
|
// Operation list for aggregation
|
||||||
|
struct ncclColl* collectives;
|
||||||
|
struct ncclColl* devCollectives;
|
||||||
|
int collStart;
|
||||||
|
int collCount;
|
||||||
|
int collFifoHead; // Only used by GPU
|
||||||
|
int collFifoTail; // Only used by CPU
|
||||||
|
};
|
||||||
|
int data[0x80];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
||||||
|
|
||||||
|
#define MAXCHANNELS 16
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
ncclDevSuccess,
|
||||||
|
ncclDevAssertedMismatch,
|
||||||
|
ncclDevSuspectedMismatch
|
||||||
|
} ncclDevError_t;
|
||||||
|
|
||||||
|
struct ncclDevComm {
|
||||||
|
int rank;
|
||||||
|
int nRanks;
|
||||||
|
|
||||||
|
// Flag to ask NCCL kernels to abort
|
||||||
|
volatile uint32_t *abortFlag;
|
||||||
|
volatile ncclDevError_t *fatalDevError;
|
||||||
|
|
||||||
|
// Channels, device side
|
||||||
|
struct ncclChannel* channels;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -10,6 +10,12 @@
|
|||||||
#include "core.h"
|
#include "core.h"
|
||||||
#include "group.h"
|
#include "group.h"
|
||||||
|
|
||||||
|
// Channels / LL tuning
|
||||||
|
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
|
||||||
|
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
|
||||||
|
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
|
||||||
|
#define NCCL_LL_MIN_NTHREADS 64
|
||||||
|
|
||||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||||
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
|
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
|
||||||
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
|
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
45
src/include/info.h
Normal file
45
src/include/info.h
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NCCL_INFO_H_
|
||||||
|
#define NCCL_INFO_H_
|
||||||
|
|
||||||
|
#include "nccl.h"
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
ncclPatternRing,
|
||||||
|
ncclPatternRingTwice,
|
||||||
|
ncclPatternPipelineFrom,
|
||||||
|
ncclPatternPipelineTo,
|
||||||
|
ncclPatternTreeUp,
|
||||||
|
ncclPatternTreeDown,
|
||||||
|
ncclPatternTreeUpDown
|
||||||
|
} ncclPattern_t;
|
||||||
|
|
||||||
|
// Used to pass NCCL call information between functions
|
||||||
|
struct ncclInfo {
|
||||||
|
ncclColl_t coll;
|
||||||
|
const char* opName;
|
||||||
|
// NCCL Coll Args
|
||||||
|
const void* sendbuff;
|
||||||
|
void* recvbuff;
|
||||||
|
size_t count;
|
||||||
|
ncclDataType_t datatype;
|
||||||
|
ncclRedOp_t op;
|
||||||
|
int root;
|
||||||
|
ncclComm_t comm;
|
||||||
|
cudaStream_t stream;
|
||||||
|
// Algorithm details
|
||||||
|
int chunkSteps;
|
||||||
|
int sliceSteps;
|
||||||
|
// Computed later
|
||||||
|
ncclPattern_t pattern;
|
||||||
|
size_t nBytes;
|
||||||
|
int nstepsPerLoop;
|
||||||
|
int nchunksPerLoop;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -80,12 +80,13 @@ typedef struct {
|
|||||||
// Finalize connection establishment after remote peer has called connectHandle
|
// Finalize connection establishment after remote peer has called connectHandle
|
||||||
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||||
|
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||||
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
// Asynchronous send to a peer.
|
||||||
// May return request == NULL if the call cannot be performed (or would block)
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
|
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
|
||||||
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
// Asynchronous recv from a peer.
|
||||||
// May return request == NULL if the call cannot be performed (or would block)
|
// May return request == NULL if the call cannot be performed (or would block)
|
||||||
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -18,6 +18,7 @@
|
|||||||
enum ncclNvLinkDeviceType {
|
enum ncclNvLinkDeviceType {
|
||||||
ncclNvLinkDeviceGpu,
|
ncclNvLinkDeviceGpu,
|
||||||
ncclNvLinkDeviceSwitch,
|
ncclNvLinkDeviceSwitch,
|
||||||
|
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
|
||||||
};
|
};
|
||||||
|
|
||||||
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
|
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
|
||||||
@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
|
|||||||
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
|
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
|
||||||
char* rPath = realpath(classPath, NULL);
|
char* rPath = realpath(classPath, NULL);
|
||||||
int fd;
|
int fd;
|
||||||
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
|
if ((fd = open(rPath, O_RDONLY)) == -1) {
|
||||||
|
// Could not find device. It might be because we're in a VM and
|
||||||
|
// we don't see the whole machine. This is handled silently so
|
||||||
|
// we don't want to print an INFO error.
|
||||||
|
TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
free(rPath);
|
free(rPath);
|
||||||
char pciClass[9];
|
char pciClass[9];
|
||||||
strncpy(pciClass, "0x000000", 9);
|
strncpy(pciClass, "0x000000", 9);
|
||||||
@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
|
|||||||
if (strcmp(pciClass, "0x068000") == 0) {
|
if (strcmp(pciClass, "0x068000") == 0) {
|
||||||
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
|
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
|
||||||
*type = ncclNvLinkDeviceSwitch;
|
*type = ncclNvLinkDeviceSwitch;
|
||||||
|
} else if (strcmp(pciClass, "0x068001") == 0) {
|
||||||
|
// PCI device is of type "Bridge: IBM Device 04ea"
|
||||||
|
*type = ncclNvLinkDeviceBridge;
|
||||||
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|
||||||
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
|
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
|
||||||
*type = ncclNvLinkDeviceGpu;
|
*type = ncclNvLinkDeviceGpu;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
|
|||||||
s++;
|
s++;
|
||||||
strncpy(envValue, line+s, 1024);
|
strncpy(envValue, line+s, 1024);
|
||||||
setenv(envVar, envValue, 0);
|
setenv(envVar, envValue, 0);
|
||||||
char *str = getenv(envVar);
|
|
||||||
}
|
}
|
||||||
if (line) free(line);
|
if (line) free(line);
|
||||||
fclose(file);
|
fclose(file);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -18,8 +18,9 @@
|
|||||||
|
|
||||||
#define MAX_IFS 16
|
#define MAX_IFS 16
|
||||||
#define MAX_IF_NAME_SIZE 16
|
#define MAX_IF_NAME_SIZE 16
|
||||||
#define SLEEP_INT 1000 // sleep interval in usec
|
#define SLEEP_INT 1000 // connection retry sleep interval in usec
|
||||||
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
|
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
||||||
|
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
||||||
|
|
||||||
/* Common socket address storage structure for IPv4/IPv6 */
|
/* Common socket address storage structure for IPv4/IPv6 */
|
||||||
union socketAddress {
|
union socketAddress {
|
||||||
@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
int ret;
|
int ret;
|
||||||
int retries = 0;
|
int timedout_retries = 0;
|
||||||
|
int refused_retries = 0;
|
||||||
retry:
|
retry:
|
||||||
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
|
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
|
||||||
if (ret == 0) return ncclSuccess;
|
if (ret == 0) return ncclSuccess;
|
||||||
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
|
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
|
||||||
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
|
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||||
usleep(SLEEP_INT);
|
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
||||||
goto retry;
|
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
|
||||||
|
usleep(SLEEP_INT);
|
||||||
|
goto retry;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
|
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -11,49 +11,35 @@
|
|||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
ncclResult_t getCudaPath(int cudaDev, char** path);
|
||||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
|
||||||
|
|
||||||
static ncclResult_t getCudaPath(int cudaDev, char** path) {
|
static int getNumaId(char *path) {
|
||||||
char busId[BUSID_SIZE];
|
char npath[PATH_MAX];
|
||||||
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
|
snprintf(npath, PATH_MAX, "%s/numa_node", path);
|
||||||
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
|
npath[PATH_MAX-1] = '\0';
|
||||||
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
|
|
||||||
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
|
int numaId = -1;
|
||||||
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
|
FILE *file = fopen(npath, "r");
|
||||||
*path = realpath(busPath, NULL);
|
if (file == NULL) return -1;
|
||||||
if (*path == NULL) {
|
if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
|
||||||
WARN("Could not find real path of %s", busPath);
|
fclose(file);
|
||||||
return ncclSystemError;
|
|
||||||
}
|
return numaId;
|
||||||
return ncclSuccess;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ncclPathDist {
|
enum ncclPathDist {
|
||||||
PATH_PIX = 0,
|
PATH_PIX = 0,
|
||||||
PATH_PXB = 1,
|
PATH_PXB = 1,
|
||||||
PATH_PHB = 2,
|
PATH_PHB = 2,
|
||||||
PATH_SOC = 3
|
PATH_NODE = 3,
|
||||||
|
PATH_SYS = 4,
|
||||||
|
PATH_ARRAY_SIZE = 5
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
|
extern const char* pathDists[PATH_ARRAY_SIZE];
|
||||||
|
|
||||||
static int pciDistance(char* path1, char* path2) {
|
int pciDistance(char* path1, char* path2);
|
||||||
int score = 0;
|
|
||||||
int depth = 0;
|
|
||||||
int same = 1;
|
|
||||||
for (int i=0; i<strlen(path1); i++) {
|
|
||||||
if (path1[i] != path2[i]) same = 0;
|
|
||||||
if (path1[i] == '/') {
|
|
||||||
depth++;
|
|
||||||
if (same == 1) score++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (score <= 3) return PATH_SOC;
|
|
||||||
if (score == 4) return PATH_PHB;
|
|
||||||
if (score == depth-1) return PATH_PIX;
|
|
||||||
return PATH_PXB;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
#define NCCL_TRANSPORT_H_
|
#define NCCL_TRANSPORT_H_
|
||||||
|
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
#include "devcomm.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "nvmlwrap.h"
|
#include "nvmlwrap.h"
|
||||||
|
|
||||||
@ -37,7 +38,7 @@ struct ncclConnect {
|
|||||||
char data[CONNECT_SIZE];
|
char data[CONNECT_SIZE];
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
|
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||||
|
|
||||||
struct ncclProxyArgs;
|
struct ncclProxyArgs;
|
||||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
||||||
@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void transportProxyIdle(int idle) {
|
|
||||||
sched_yield();
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -10,7 +10,7 @@
|
|||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
ncclResult_t getHostName(char* hostname, int maxlen);
|
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||||
uint64_t getHostHash();
|
uint64_t getHostHash();
|
||||||
uint64_t getPidHash();
|
uint64_t getPidHash();
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout;
|
|||||||
std::chrono::high_resolution_clock::time_point ncclEpoch;
|
std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if CUDART_VERSION >= 9200
|
#if CUDART_VERSION >= 9020
|
||||||
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
|
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
|
||||||
#else
|
#else
|
||||||
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
|
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
|
||||||
@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
|
|||||||
return bootstrapGetUniqueId(out);
|
return bootstrapGetUniqueId(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prevent compiler from optimizing out these operations
|
||||||
|
void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
|
||||||
|
comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
|
||||||
|
}
|
||||||
|
|
||||||
static ncclResult_t commFree(ncclComm_t comm) {
|
static ncclResult_t commFree(ncclComm_t comm) {
|
||||||
if (comm == NULL)
|
if (comm == NULL)
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
|||||||
if (comm->bootstrap)
|
if (comm->bootstrap)
|
||||||
NCCLCHECK(bootstrapClose(comm->bootstrap));
|
NCCLCHECK(bootstrapClose(comm->bootstrap));
|
||||||
|
|
||||||
|
CUDACHECK(cudaFree(comm->hostDevComm.channels));
|
||||||
CUDACHECK(cudaFree(comm->devComm));
|
CUDACHECK(cudaFree(comm->devComm));
|
||||||
|
|
||||||
for (int channel=0; channel<comm->nChannels; channel++)
|
for (int channel=0; channel<comm->nChannels; channel++)
|
||||||
@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
|||||||
CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
|
CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
|
||||||
CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
|
CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
|
||||||
|
|
||||||
|
// Poison comm to try and catch a double free
|
||||||
|
commPoison(comm);
|
||||||
|
|
||||||
free(comm);
|
free(comm);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
struct ncclComm* comm;
|
struct ncclComm* comm;
|
||||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||||
|
|
||||||
comm->rank = rank;
|
comm->rank = comm->hostDevComm.rank =rank;
|
||||||
comm->nRanks = ndev;
|
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
||||||
cudaGetDevice(&comm->cudaDev);
|
cudaGetDevice(&comm->cudaDev);
|
||||||
getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
|
getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
|
||||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
|
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
|
||||||
|
|
||||||
comm->doneEvent = doneEvent;
|
comm->doneEvent = doneEvent;
|
||||||
comm->llThreshold = ncclParamLlThreshold();
|
comm->llThreshold = ncclParamLlThreshold();
|
||||||
comm->treeThreshold = ncclParamTreeThreshold();
|
comm->treeThreshold = ncclParamTreeThreshold();
|
||||||
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
||||||
#if CUDART_VERSION >= 9200
|
#if CUDART_VERSION >= 9020
|
||||||
comm->groupCudaStream = ncclParamGroupCudaStream();
|
comm->groupCudaStream = ncclParamGroupCudaStream();
|
||||||
#else
|
#else
|
||||||
// Don't allow the user to overload the default setting in older CUDA builds
|
// Don't allow the user to overload the default setting in older CUDA builds
|
||||||
@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
#endif
|
#endif
|
||||||
comm->fatalError = ncclSuccess;
|
comm->fatalError = ncclSuccess;
|
||||||
|
|
||||||
CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
|
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
|
||||||
*comm->fatalDevError = ncclDevSuccess;
|
*comm->fatalDevError = ncclDevSuccess;
|
||||||
|
|
||||||
CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
|
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
|
||||||
*comm->abortFlag = 0;
|
*comm->abortFlag = 0;
|
||||||
|
|
||||||
comm->argsptr = &comm->args;
|
comm->argsptr = &comm->args;
|
||||||
@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||||
// Fully duplicate the comm on the device
|
// Duplicate the channels on the device
|
||||||
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
|
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
|
||||||
// Copy the comm on the device
|
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
|
||||||
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
|
|
||||||
// Copy userRanks
|
// Copy userRanks and peers
|
||||||
for (int r=0; r<comm->nChannels; r++) {
|
for (int r=0; r<comm->nChannels; r++) {
|
||||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
|
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
|
||||||
}
|
}
|
||||||
// Copy the device-accessible pointer to comm->abortFlag
|
|
||||||
void *devAbortFlag;
|
// Duplicate the dev comm on the device
|
||||||
CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
|
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
|
||||||
CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
|
NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
|
||||||
// Copy the device-accessible pointer to comm->fatalDevError
|
|
||||||
void *devFatalError;
|
|
||||||
CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
|
|
||||||
CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int ranks[nMasters];
|
int* ranks;
|
||||||
|
NCCLCHECK(ncclCalloc(&ranks, nMasters));
|
||||||
int i = 0, masterIndex = -1;
|
int i = 0, masterIndex = -1;
|
||||||
// Build binary tree
|
// Build binary tree
|
||||||
for (int r=0; r<nranks; r++) {
|
for (int r=0; r<nranks; r++) {
|
||||||
@ -455,6 +461,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
|
|||||||
tree->up = prev;
|
tree->up = prev;
|
||||||
if (treeMasters[next] == 0) tree->down[0] = next;
|
if (treeMasters[next] == 0) tree->down[0] = next;
|
||||||
}
|
}
|
||||||
|
free(ranks);
|
||||||
}
|
}
|
||||||
|
|
||||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||||
@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
|
|||||||
if (peer == -1) continue;
|
if (peer == -1) continue;
|
||||||
conn = &channel->peers[peer].recv;
|
conn = &channel->peers[peer].recv;
|
||||||
if (conn->connected) { ++nSkippedRecv; continue; }
|
if (conn->connected) { ++nSkippedRecv; continue; }
|
||||||
|
memset(&connect, 0, sizeof(connect));
|
||||||
NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
||||||
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||||
}
|
}
|
||||||
@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
|
|||||||
if (peer == -1) continue;
|
if (peer == -1) continue;
|
||||||
conn = &channel->peers[peer].send;
|
conn = &channel->peers[peer].send;
|
||||||
if (conn->connected) { ++nSkippedSend; continue; }
|
if (conn->connected) { ++nSkippedSend; continue; }
|
||||||
|
memset(&connect, 0, sizeof(connect));
|
||||||
NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
||||||
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||||
}
|
}
|
||||||
@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
|
|||||||
if (peer == -1) continue;
|
if (peer == -1) continue;
|
||||||
conn = &channel->peers[peer].send;
|
conn = &channel->peers[peer].send;
|
||||||
if (conn->connected) {++nSkippedSend; continue; }
|
if (conn->connected) {++nSkippedSend; continue; }
|
||||||
|
memset(&connect, 0, sizeof(connect));
|
||||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||||
NCCLCHECK(conn->transportComm->connect(&connect, conn));
|
NCCLCHECK(conn->transportComm->connect(&connect, conn));
|
||||||
conn->connected = 1;
|
conn->connected = 1;
|
||||||
@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
|
|||||||
if (peer == -1) continue;
|
if (peer == -1) continue;
|
||||||
conn = &channel->peers[peer].recv;
|
conn = &channel->peers[peer].recv;
|
||||||
if (conn->connected) {++nSkippedRecv; continue; }
|
if (conn->connected) {++nSkippedRecv; continue; }
|
||||||
|
memset(&connect, 0, sizeof(connect));
|
||||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||||
NCCLCHECK(conn->transportComm->connect(&connect, conn));
|
NCCLCHECK(conn->transportComm->connect(&connect, conn));
|
||||||
conn->connected = 1;
|
conn->connected = 1;
|
||||||
@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
|
||||||
|
|
||||||
static ncclResult_t setCpuAffinity(int cudaDev) {
|
static ncclResult_t setCpuAffinity(int cudaDev) {
|
||||||
// Work within the enveloppe we were provided
|
// Query the CPU affinity set we were provided
|
||||||
cpu_set_t mask;
|
cpu_set_t mask;
|
||||||
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
|
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
|
||||||
|
|
||||||
// Find the subpart that is local to our GPU
|
#ifdef ENABLE_TRACE
|
||||||
|
{
|
||||||
|
char affinityStr[sizeof(cpu_set_t)*2];
|
||||||
|
NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
|
||||||
|
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Find the CPUs that are local to the supplied GPU
|
||||||
cpu_set_t gpuMask;
|
cpu_set_t gpuMask;
|
||||||
NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
|
NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
|
||||||
cpu_set_t finalMask;
|
|
||||||
CPU_AND(&finalMask, &mask, &gpuMask);
|
|
||||||
|
|
||||||
// If those are not disjoint, try to stay local
|
#ifdef ENABLE_TRACE
|
||||||
|
{
|
||||||
|
char affinityStr[sizeof(cpu_set_t)*2];
|
||||||
|
NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
|
||||||
|
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cpu_set_t finalMask;
|
||||||
|
if (ncclParamIgnoreCpuAffinity())
|
||||||
|
// Ignore the CPU affinity set and use the GPU one instead
|
||||||
|
finalMask = gpuMask;
|
||||||
|
else
|
||||||
|
// Use a subset of the GPU affinity set
|
||||||
|
CPU_AND(&finalMask, &mask, &gpuMask);
|
||||||
|
|
||||||
|
// If there is a non empty set, use it to set affinity
|
||||||
if (CPU_COUNT(&finalMask)) {
|
if (CPU_COUNT(&finalMask)) {
|
||||||
char affinityStr[sizeof(cpu_set_t)*2];
|
char affinityStr[sizeof(cpu_set_t)*2];
|
||||||
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
|
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
|
||||||
@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
|
|||||||
comms[rank]->threadThreshold = threadThreshold;
|
comms[rank]->threadThreshold = threadThreshold;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ncclConnect* connect;
|
||||||
|
NCCLCHECK(ncclCalloc(&connect, 2*nranks));
|
||||||
for (int r=0; r<nrings; r++) {
|
for (int r=0; r<nrings; r++) {
|
||||||
struct ncclConnect connect[2*nranks];
|
|
||||||
int* ringRanks = rings+r*nranks;
|
int* ringRanks = rings+r*nranks;
|
||||||
for (int rank=0; rank<nranks; rank++) {
|
for (int rank=0; rank<nranks; rank++) {
|
||||||
CUDACHECK(cudaSetDevice(devs[rank]));
|
CUDACHECK(cudaSetDevice(devs[rank]));
|
||||||
@ -1045,6 +1081,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
|
|||||||
NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
|
NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
free(connect);
|
||||||
free(allInfo);
|
free(allInfo);
|
||||||
free(rings);
|
free(rings);
|
||||||
free(treeIn);
|
free(treeIn);
|
||||||
@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
|||||||
int savedDevice;
|
int savedDevice;
|
||||||
int rank, cudaDev;
|
int rank, cudaDev;
|
||||||
ncclComm_t comm = NULL;
|
ncclComm_t comm = NULL;
|
||||||
int ncclDevList[ndev];
|
int* ncclDevList = NULL;
|
||||||
|
NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
|
||||||
for (int i=0; i<ndev; i++) {
|
for (int i=0; i<ndev; i++) {
|
||||||
ncclDevList[i] = devlist ? devlist[i] : i;
|
ncclDevList[i] = devlist ? devlist[i] : i;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaGetDevice(&savedDevice);
|
CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
|
||||||
|
|
||||||
for(rank=0; rank<ndev; ++rank)
|
for(rank=0; rank<ndev; ++rank)
|
||||||
comms[rank] = NULL;
|
comms[rank] = NULL;
|
||||||
@ -1118,6 +1156,7 @@ cleanup:
|
|||||||
}
|
}
|
||||||
|
|
||||||
final:
|
final:
|
||||||
|
free(ncclDevList);
|
||||||
if(wrapNvmlShutdown() != ncclSuccess)
|
if(wrapNvmlShutdown() != ncclSuccess)
|
||||||
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
|
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
|
||||||
cudaSetDevice(savedDevice);
|
cudaSetDevice(savedDevice);
|
||||||
@ -1128,9 +1167,11 @@ final:
|
|||||||
|
|
||||||
static ncclResult_t commDestroy(ncclComm_t comm) {
|
static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||||
int savedDevice;
|
int savedDevice;
|
||||||
|
#ifdef ENABLE_TRACE
|
||||||
|
int rank = comm->rank;
|
||||||
|
#endif
|
||||||
CUDACHECK(cudaGetDevice(&savedDevice));
|
CUDACHECK(cudaGetDevice(&savedDevice));
|
||||||
int commDevice = comm->cudaDev;
|
int commDevice = comm->cudaDev;
|
||||||
int rank = comm->rank;
|
|
||||||
|
|
||||||
if (savedDevice != commDevice) {
|
if (savedDevice != commDevice) {
|
||||||
CUDACHECK(cudaSetDevice(commDevice));
|
CUDACHECK(cudaSetDevice(commDevice));
|
||||||
@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
|||||||
if (savedDevice != commDevice)
|
if (savedDevice != commDevice)
|
||||||
CUDACHECK(cudaSetDevice(savedDevice));
|
CUDACHECK(cudaSetDevice(savedDevice));
|
||||||
|
|
||||||
INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
|
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
|
||||||
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
|||||||
if (comm == NULL)
|
if (comm == NULL)
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
|
||||||
|
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
|
||||||
|
|
||||||
|
// Try and prevent a double free of the comm struct (user error)
|
||||||
|
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
|
||||||
|
WARN("comm %p has already been destroyed", comm);
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
|
||||||
return commDestroy(comm);
|
return commDestroy(comm);
|
||||||
}
|
}
|
||||||
|
|
@ -1,10 +1,10 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "checks.h"
|
#include "argcheck.h"
|
||||||
|
|
||||||
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
||||||
cudaPointerAttributes attr;
|
cudaPointerAttributes attr;
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() {
|
|||||||
int savedDev;
|
int savedDev;
|
||||||
CUDACHECK(cudaGetDevice(&savedDev));
|
CUDACHECK(cudaGetDevice(&savedDev));
|
||||||
int done = ncclGroupIndex;
|
int done = ncclGroupIndex;
|
||||||
int doneArray[ncclGroupIndex];
|
int doneArray[MAX_ASYNC_OPS];
|
||||||
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
|
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
|
||||||
|
|
||||||
ncclResult_t ret = ncclGroupError;
|
ncclResult_t ret = ncclGroupError;
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -208,8 +208,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
|
|||||||
NCCLCHECK(getEnvThreads(nthreads));
|
NCCLCHECK(getEnvThreads(nthreads));
|
||||||
for (int r = 0; r<*nrings; r++) {
|
for (int r = 0; r<*nrings; r++) {
|
||||||
for (int i = 0; i<nranks; i++) {
|
for (int i = 0; i<nranks; i++) {
|
||||||
if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
|
if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
|
||||||
if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
|
if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
51
src/misc/topo.cc
Normal file
51
src/misc/topo.cc
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#include "core.h"
|
||||||
|
#include "topo.h"
|
||||||
|
|
||||||
|
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||||
|
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||||
|
|
||||||
|
ncclResult_t getCudaPath(int cudaDev, char** path) {
|
||||||
|
char busId[BUSID_SIZE];
|
||||||
|
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
|
||||||
|
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
|
||||||
|
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
|
||||||
|
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
|
||||||
|
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
|
||||||
|
*path = realpath(busPath, NULL);
|
||||||
|
if (*path == NULL) {
|
||||||
|
WARN("Could not find real path of %s", busPath);
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
|
||||||
|
|
||||||
|
int pciDistance(char* path1, char* path2) {
|
||||||
|
int score = 0;
|
||||||
|
int depth = 0;
|
||||||
|
int same = 1;
|
||||||
|
for (int i=0; i<strlen(path1); i++) {
|
||||||
|
if (path1[i] != path2[i]) same = 0;
|
||||||
|
if (path1[i] == '/') {
|
||||||
|
depth++;
|
||||||
|
if (same == 1) score++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (score <= 3) {
|
||||||
|
/* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
|
||||||
|
int numaId1 = getNumaId(path1);
|
||||||
|
int numaId2 = getNumaId(path2);
|
||||||
|
TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
|
||||||
|
return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
|
||||||
|
}
|
||||||
|
if (score == 4) return PATH_PHB;
|
||||||
|
if (score == depth-1) return PATH_PIX;
|
||||||
|
return PATH_PXB;
|
||||||
|
}
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -29,13 +29,13 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t getHostName(char* hostname, int maxlen) {
|
ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
|
||||||
if (gethostname(hostname, maxlen) != 0) {
|
if (gethostname(hostname, maxlen) != 0) {
|
||||||
strncpy(hostname, "unknown", maxlen);
|
strncpy(hostname, "unknown", maxlen);
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
|
while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
|
||||||
hostname[i] = '\0';
|
hostname[i] = '\0';
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -48,7 +48,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
|||||||
if (ncclDebugLevel <= NCCL_LOG_NONE) return;
|
if (ncclDebugLevel <= NCCL_LOG_NONE) return;
|
||||||
|
|
||||||
char hostname[1024];
|
char hostname[1024];
|
||||||
getHostName(hostname, 1024);
|
getHostName(hostname, 1024, '.');
|
||||||
int cudaDev;
|
int cudaDev;
|
||||||
cudaGetDevice(&cudaDev);
|
cudaGetDevice(&cudaDev);
|
||||||
|
|
||||||
@ -104,8 +104,8 @@ uint64_t getHash(const char* string) {
|
|||||||
*/
|
*/
|
||||||
uint64_t getHostHash(void) {
|
uint64_t getHostHash(void) {
|
||||||
char uname[1024];
|
char uname[1024];
|
||||||
// Start off with the hostname
|
// Start off with the full hostname
|
||||||
(void) getHostName(uname, sizeof(uname));
|
(void) getHostName(uname, sizeof(uname), '\0');
|
||||||
int offset = strlen(uname);
|
int offset = strlen(uname);
|
||||||
int len;
|
int len;
|
||||||
// $(readlink /proc/self/ns/uts)
|
// $(readlink /proc/self/ns/uts)
|
@ -28,7 +28,7 @@ static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF
|
|||||||
static ncclTvalue_t getTvalue(short* distances, int ndev) {
|
static ncclTvalue_t getTvalue(short* distances, int ndev) {
|
||||||
ncclTvalue_t tvalue = 0;
|
ncclTvalue_t tvalue = 0;
|
||||||
for (int d=0; d<ndev; d++) {
|
for (int d=0; d<ndev; d++) {
|
||||||
int score = 1 + PATH_SOC - distances[d];
|
int score = 1 + PATH_SYS - distances[d];
|
||||||
// Keep 3 bits of score info per dev
|
// Keep 3 bits of score info per dev
|
||||||
tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
|
tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
|
||||||
}
|
}
|
||||||
@ -81,7 +81,7 @@ static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
|
|||||||
ncclResult_t err;
|
ncclResult_t err;
|
||||||
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
|
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
|
||||||
err = ncclNetPciPath(dev, &nicPath);
|
err = ncclNetPciPath(dev, &nicPath);
|
||||||
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
|
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
|
||||||
if (nicPath) free(nicPath);
|
if (nicPath) free(nicPath);
|
||||||
if (cudaPath) free(cudaPath);
|
if (cudaPath) free(cudaPath);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -173,19 +173,19 @@ static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroup
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
|
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
|
||||||
int nGroups = groups[nranks-1] + 1;
|
int nGroups = groups[nranks-1] + 1;
|
||||||
int cardUsed[NET_MAX_IFS*nGroups];
|
int *cardUsed, *starts, *ends;
|
||||||
for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
|
NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
|
||||||
|
NCCLCHECK(ncclCalloc(&starts, nGroups));
|
||||||
|
NCCLCHECK(ncclCalloc(&ends, nGroups));
|
||||||
|
|
||||||
for (int ring = 0; ring<*nringsRet; ring++) {
|
for (int ring = 0; ring<*nringsRet; ring++) {
|
||||||
int starts[nGroups];
|
|
||||||
int ends[nGroups];
|
|
||||||
for (int group = 0; group<nGroups; group++) {
|
for (int group = 0; group<nGroups; group++) {
|
||||||
int nranksInGroup = 0;
|
int nranksInGroup = 0;
|
||||||
int nsubGroups = 0;
|
int nsubGroups = 0;
|
||||||
for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
|
for (int rank=0; rank<nranks; rank++)
|
||||||
|
if (groups[rank] == group) {
|
||||||
nranksInGroup++;
|
nranksInGroup++;
|
||||||
nsubGroups = std::max(subgroups[rank], nsubGroups);
|
nsubGroups = std::max(subgroups[rank], nsubGroups);
|
||||||
}
|
}
|
||||||
@ -207,7 +207,7 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
|
|||||||
}
|
}
|
||||||
if (starts[group] == -1 || ends[group] == -1) {
|
if (starts[group] == -1 || ends[group] == -1) {
|
||||||
*nringsRet = ring;
|
*nringsRet = ring;
|
||||||
return ncclSuccess;
|
goto done;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Link groups together
|
// Link groups together
|
||||||
@ -217,6 +217,10 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
|
|||||||
prev[ring*nranks+starts[nextGroup]] = ends[group];
|
prev[ring*nranks+starts[nextGroup]] = ends[group];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
done:
|
||||||
|
free(cardUsed);
|
||||||
|
free(starts);
|
||||||
|
free(ends);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -432,11 +436,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
if (args->head < args->end) {
|
if (args->head < args->end) {
|
||||||
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
|
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
|
||||||
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
|
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
|
||||||
|
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
|
||||||
if (args->llMode) {
|
if (args->llMode) {
|
||||||
int buffSlot = args->tail%NCCL_STEPS;
|
int buffSlot = args->tail%NCCL_STEPS;
|
||||||
int size = sizesFifo[buffSlot];
|
int size = sizesFifo[buffSlot];
|
||||||
if (size != -1) {
|
if (size != -1) {
|
||||||
uint32_t flag = args->tail + 1;
|
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
|
||||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||||
size = nFifoLines * sizeof(union ncclLLFifoLine);
|
size = nFifoLines * sizeof(union ncclLLFifoLine);
|
||||||
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
|
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
|
||||||
@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (args->tail < resources->hostRecvMem->tail) {
|
} else if (args->tail < *recvTail) {
|
||||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||||
int stepSize = args->channel->buffSize/NCCL_STEPS;
|
int stepSize = args->channel->buffSize/NCCL_STEPS;
|
||||||
// Send through network
|
// Send through network
|
||||||
@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
|||||||
if (args->head == args->end) {
|
if (args->head == args->end) {
|
||||||
resources->step = args->end;
|
resources->step = args->end;
|
||||||
args->idle = 0;
|
args->idle = 0;
|
||||||
args->state = ncclProxyOpDone;
|
args->state = ncclProxyOpNone;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (args->state == ncclProxyOpDone) {
|
|
||||||
union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
|
|
||||||
if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
|
|
||||||
for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
|
|
||||||
resources->step += NCCL_STEPS;
|
|
||||||
resources->hostSendMem->head = resources->step;
|
|
||||||
resources->llLastCleaning = resources->step;
|
|
||||||
}
|
|
||||||
args->state = ncclProxyOpNone;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
|||||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||||
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
|
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
|
||||||
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
|
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
|
||||||
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
|
volatile uint64_t* sendHead = &resources->hostSendMem->head;
|
||||||
|
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
|
||||||
int buffSlot = args->tail%NCCL_STEPS;
|
int buffSlot = args->tail%NCCL_STEPS;
|
||||||
int sliceSize = stepSize * args->sliceSteps;
|
int sliceSize = stepSize * args->sliceSteps;
|
||||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
|
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
|
||||||
@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
|||||||
if (args->head == args->end) {
|
if (args->head == args->end) {
|
||||||
resources->step = args->end;
|
resources->step = args->end;
|
||||||
args->idle = 0;
|
args->idle = 0;
|
||||||
args->state = ncclProxyOpDone;
|
args->state = ncclProxyOpNone;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (args->state == ncclProxyOpDone) {
|
|
||||||
if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
|
|
||||||
resources->step += NCCL_STEPS;
|
|
||||||
while (resources->hostSendMem->head < resources->step);
|
|
||||||
resources->llLastCleaning = resources->step;
|
|
||||||
}
|
|
||||||
args->state = ncclProxyOpNone;
|
|
||||||
}
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
|||||||
}
|
}
|
||||||
int found = 0;
|
int found = 0;
|
||||||
struct ibv_device_attr devAttr;
|
struct ibv_device_attr devAttr;
|
||||||
|
memset(&devAttr, 0, sizeof(devAttr));
|
||||||
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
|
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
|
||||||
WARN("NET/IB : Unable to query device %s", devices[d]->name);
|
WARN("NET/IB : Unable to query device %s", devices[d]->name);
|
||||||
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
|
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) {
|
|||||||
/* Determine if we can communicate with the peer through p2p */
|
/* Determine if we can communicate with the peer through p2p */
|
||||||
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
|
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
|
||||||
// Do not use P2P across root complexes by default (provided CUDA permits it)
|
// Do not use P2P across root complexes by default (provided CUDA permits it)
|
||||||
int p2pLevel = PATH_SOC;
|
int p2pLevel = PATH_NODE;
|
||||||
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
|
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
|
||||||
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
|
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
|
||||||
|
|
||||||
@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
|
|||||||
|
|
||||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||||
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
|
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
|
||||||
if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
|
if (peerCudaDev == -1) {
|
||||||
|
// Peer's CUDA device is not visible in this process
|
||||||
|
#if CUDART_VERSION >= 10010
|
||||||
|
// But in CUDA 10.1 we can still communicate with 'invisible' devices
|
||||||
|
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
|
||||||
|
// Check for NVLink/NVswitch including P2P access
|
||||||
|
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
|
||||||
|
if (nvlinkp2p > 0) {
|
||||||
|
*ret = nvlinkp2p;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
|
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
|
||||||
|
|
||||||
// Do not detect topology if we're on the same GPU. Note this is not really supported.
|
// Do not detect topology if we're on the same GPU. Note this is not really supported.
|
||||||
if (myInfo->cudaDev == peerCudaDev) {
|
if (myInfo->cudaDev == peerCudaDev) {
|
||||||
*ret = 1 + PATH_SOC;
|
*ret = 1 + PATH_SYS;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
|
|||||||
if (err1 == ncclSuccess && err2 == ncclSuccess) {
|
if (err1 == ncclSuccess && err2 == ncclSuccess) {
|
||||||
int distance = pciDistance(myPath, peerPath);
|
int distance = pciDistance(myPath, peerPath);
|
||||||
if (distance < p2pLevel) {
|
if (distance < p2pLevel) {
|
||||||
*ret = 1 + PATH_SOC - distance;
|
*ret = 1 + PATH_SYS - distance;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (err1 == ncclSuccess) free(myPath);
|
if (err1 == ncclSuccess) free(myPath);
|
||||||
@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
|
||||||
|
#define MAXGPUS_PCI 64
|
||||||
|
|
||||||
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
|
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
|
||||||
int nrings = 0;
|
int nrings = 0;
|
||||||
ncclTvalue_t* line = matrix+current*n;
|
ncclTvalue_t* line = matrix+current*n;
|
||||||
@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int ringsSave[nRingsMax*n];
|
int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
|
||||||
int maxStep = 0;
|
int maxStep = 0;
|
||||||
for (int i=0; i<n; i++) {
|
for (int i=0; i<n; i++) {
|
||||||
if (inTheRing[i] == 0 && line[i] > 0) {
|
if (inTheRing[i] == 0 && line[i] > 0) {
|
||||||
@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
|
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
|
||||||
for (int score = PATH_SOC+1; score >= minScore; score--) {
|
for (int score = PATH_SYS+1; score >= minScore; score--) {
|
||||||
int best = -1;
|
int best = -1;
|
||||||
int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
|
int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
|
||||||
for (int n = 0; n < nranks; n++) {
|
for (int n = 0; n < nranks; n++) {
|
||||||
if (inRing[n]) continue;
|
if (inRing[n]) continue;
|
||||||
if (values[rank*nranks+n] == score) {
|
if (values[rank*nranks+n] == score) {
|
||||||
@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
|
|||||||
int start = findConnect(nranks, prev+r*nranks);
|
int start = findConnect(nranks, prev+r*nranks);
|
||||||
int end = findConnect(nranks, next+r*nranks);
|
int end = findConnect(nranks, next+r*nranks);
|
||||||
|
|
||||||
int inRing[nranks];
|
int inRing[MAXGPUS_PCI];
|
||||||
for (int i=0; i<nranks; i++) inRing[i] = 0;
|
for (int i=0; i<nranks; i++) inRing[i] = 0;
|
||||||
|
|
||||||
if (start == -1 && end == -1) {
|
if (start == -1 && end == -1) {
|
||||||
@ -405,10 +421,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
|
|||||||
links += val/CONNECT_NVLINK;
|
links += val/CONNECT_NVLINK;
|
||||||
}
|
}
|
||||||
if (rank == 0) directLinks = links;
|
if (rank == 0) directLinks = links;
|
||||||
else directLinks = std::min(directLinks, links);
|
else directLinks = std::min(directLinks, links);
|
||||||
}
|
}
|
||||||
if (directLinks > 0) {
|
if (directLinks > 0) {
|
||||||
// NVLink : Connect rings or create new ones
|
// NVLink : Connect rings or create new ones
|
||||||
|
if (nranks > MAXGPUS_NVLINKP2P) {
|
||||||
|
WARN("Recursive P2P computation cannot work for >8 GPUs");
|
||||||
|
return ncclInternalError;
|
||||||
|
}
|
||||||
nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
|
nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
|
||||||
goto end;
|
goto end;
|
||||||
}
|
}
|
||||||
@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) {
|
|||||||
if (sendRes->ipcPtr)
|
if (sendRes->ipcPtr)
|
||||||
CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
|
CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
|
||||||
CUDACHECK(cudaFree(sendRes->devMem));
|
CUDACHECK(cudaFree(sendRes->devMem));
|
||||||
|
free(sendRes);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) {
|
|||||||
if (recvRes->ipcPtr)
|
if (recvRes->ipcPtr)
|
||||||
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
|
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
|
||||||
CUDACHECK(cudaFree(recvRes->devMem));
|
CUDACHECK(cudaFree(recvRes->devMem));
|
||||||
|
free(recvRes);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define MAXGROUPS 16
|
||||||
|
|
||||||
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
|
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
|
||||||
if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
|
if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
|
||||||
int nGroups = groups[nranks-1] + 1;
|
int nGroups = groups[nranks-1] + 1;
|
||||||
int starts[nGroups];
|
int starts[MAXGROUPS];
|
||||||
int ends[nGroups];
|
int ends[MAXGROUPS];
|
||||||
for (int ring = 0; ring<*nringsRet; ring++) {
|
for (int ring = 0; ring<*nringsRet; ring++) {
|
||||||
int startGroup = -1, endGroup = -1;
|
int startGroup = -1, endGroup = -1;
|
||||||
for (int group = 0; group<nGroups; group++) {
|
for (int group = 0; group<nGroups; group++) {
|
Loading…
x
Reference in New Issue
Block a user