2.16.2-1
Add support for CUDA 12.0, drop Kepler (sm_35). Support for H100 features. Make socket code more robust and protected. Solves #555. Improve performance on large CUDA graphs, reducing dependencies. Reduce inter-socket bandwidth on AMD CPUs to favor better paths. Various fixes to ncclCommAbort. Make service thread polling resistant to EINTR. Compile with profiling API by default. Extend NVTX instrumentation with call arguments.
This commit is contained in:
parent
614b49f0de
commit
28189e2df8
@ -10,7 +10,7 @@ VERBOSE ?= 0
|
|||||||
KEEP ?= 0
|
KEEP ?= 0
|
||||||
DEBUG ?= 0
|
DEBUG ?= 0
|
||||||
TRACE ?= 0
|
TRACE ?= 0
|
||||||
PROFAPI ?= 0
|
PROFAPI ?= 1
|
||||||
NVTX ?= 1
|
NVTX ?= 1
|
||||||
|
|
||||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||||
@ -25,22 +25,26 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
|||||||
|
|
||||||
# You should define NVCC_GENCODE in your environment to the minimal set
|
# You should define NVCC_GENCODE in your environment to the minimal set
|
||||||
# of archs to reduce compile time.
|
# of archs to reduce compile time.
|
||||||
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
|
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
|
||||||
-gencode=arch=compute_50,code=sm_50 \
|
|
||||||
-gencode=arch=compute_60,code=sm_60 \
|
-gencode=arch=compute_60,code=sm_60 \
|
||||||
-gencode=arch=compute_61,code=sm_61
|
-gencode=arch=compute_61,code=sm_61
|
||||||
|
ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
|
||||||
|
# SM35 is deprecated from CUDA12.0 onwards
|
||||||
|
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
|
||||||
|
endif
|
||||||
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
||||||
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
||||||
CUDA11_8_GENCODE = -gencode=arch=compute_90,code=sm_90
|
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
|
||||||
|
|
||||||
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
||||||
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
||||||
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
|
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
|
||||||
CUDA11_8_PTX = -gencode=arch=compute_90,code=compute_90
|
CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
|
||||||
|
|
||||||
|
|
||||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
||||||
# Include Hopper support if we're using CUDA11.8 or above
|
# Include Hopper support if we're using CUDA11.8 or above
|
||||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_8_GENCODE) $(CUDA11_8_PTX)
|
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
|
||||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
|
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
|
||||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
|
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
|
||||||
# Include Volta support if we're using CUDA9 or above
|
# Include Volta support if we're using CUDA9 or above
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
##### version
|
##### version
|
||||||
NCCL_MAJOR := 2
|
NCCL_MAJOR := 2
|
||||||
NCCL_MINOR := 15
|
NCCL_MINOR := 16
|
||||||
NCCL_PATCH := 5
|
NCCL_PATCH := 2
|
||||||
NCCL_SUFFIX :=
|
NCCL_SUFFIX :=
|
||||||
PKG_REVISION := 1
|
PKG_REVISION := 1
|
||||||
|
@ -9,7 +9,7 @@ include ../makefiles/version.mk
|
|||||||
|
|
||||||
##### src files
|
##### src files
|
||||||
INCEXPORTS := nccl.h nccl_net.h
|
INCEXPORTS := nccl.h nccl_net.h
|
||||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
|
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
|
||||||
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
|
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
|
||||||
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
|
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
|
||||||
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
|
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
|
||||||
|
191
src/bootstrap.cc
191
src/bootstrap.cc
@ -13,6 +13,11 @@
|
|||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include "proxy.h"
|
#include "proxy.h"
|
||||||
|
|
||||||
|
struct bootstrapRootArgs {
|
||||||
|
struct ncclSocket* listenSock;
|
||||||
|
uint64_t magic;
|
||||||
|
};
|
||||||
|
|
||||||
/* Init functions */
|
/* Init functions */
|
||||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
||||||
static union ncclSocketAddress bootstrapNetIfAddr;
|
static union ncclSocketAddress bootstrapNetIfAddr;
|
||||||
@ -26,7 +31,7 @@ ncclResult_t bootstrapNetInit() {
|
|||||||
char* env = getenv("NCCL_COMM_ID");
|
char* env = getenv("NCCL_COMM_ID");
|
||||||
if (env) {
|
if (env) {
|
||||||
union ncclSocketAddress remoteAddr;
|
union ncclSocketAddress remoteAddr;
|
||||||
if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
|
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
|
||||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||||
return ncclInvalidArgument;
|
return ncclInvalidArgument;
|
||||||
}
|
}
|
||||||
@ -89,8 +94,10 @@ static ncclResult_t setFilesLimit() {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void *bootstrapRoot(void* args) {
|
static void *bootstrapRoot(void* rargs) {
|
||||||
struct ncclSocket* listenSock = (struct ncclSocket*)args;
|
struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
|
||||||
|
struct ncclSocket* listenSock = args->listenSock;
|
||||||
|
uint64_t magic = args->magic;
|
||||||
ncclResult_t res = ncclSuccess;
|
ncclResult_t res = ncclSuccess;
|
||||||
int nranks = 0, c = 0;
|
int nranks = 0, c = 0;
|
||||||
struct extInfo info;
|
struct extInfo info;
|
||||||
@ -104,11 +111,10 @@ static void *bootstrapRoot(void* args) {
|
|||||||
/* Receive addresses from all ranks */
|
/* Receive addresses from all ranks */
|
||||||
do {
|
do {
|
||||||
struct ncclSocket sock;
|
struct ncclSocket sock;
|
||||||
/* bootstrap root thread always uses blocking ncclSocketAccept. */
|
NCCLCHECKGOTO(ncclSocketInit(&sock), res, out);
|
||||||
NCCLCHECKGOTO(ncclSocketInit(&sock, NULL, NULL, 0), res, out);
|
|
||||||
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
|
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
|
||||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
|
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
|
||||||
close(sock.fd);
|
NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
|
||||||
|
|
||||||
if (c == 0) {
|
if (c == 0) {
|
||||||
nranks = info.nranks;
|
nranks = info.nranks;
|
||||||
@ -139,54 +145,60 @@ static void *bootstrapRoot(void* args) {
|
|||||||
for (int r=0; r<nranks; ++r) {
|
for (int r=0; r<nranks; ++r) {
|
||||||
int next = (r+1) % nranks;
|
int next = (r+1) % nranks;
|
||||||
struct ncclSocket sock;
|
struct ncclSocket sock;
|
||||||
sock.abortFlag = NULL;
|
NCCLCHECKGOTO(ncclSocketInit(&sock, rankAddressesRoot+r, magic, ncclSocketTypeBootstrap), res, out);
|
||||||
sock.asyncFlag = 0;
|
|
||||||
memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
|
|
||||||
NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
|
NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
|
||||||
NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
|
NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
|
||||||
close(sock.fd);
|
NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
|
||||||
}
|
}
|
||||||
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
|
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
close(listenSock->fd);
|
if (listenSock != NULL) {
|
||||||
|
ncclSocketClose(listenSock);
|
||||||
free(listenSock);
|
free(listenSock);
|
||||||
|
}
|
||||||
if (rankAddresses) free(rankAddresses);
|
if (rankAddresses) free(rankAddresses);
|
||||||
if (rankAddressesRoot) free(rankAddressesRoot);
|
if (rankAddressesRoot) free(rankAddressesRoot);
|
||||||
if (zero) free(zero);
|
if (zero) free(zero);
|
||||||
|
free(rargs);
|
||||||
|
|
||||||
TRACE(NCCL_INIT, "DONE");
|
TRACE(NCCL_INIT, "DONE");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
|
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) {
|
||||||
struct ncclSocket* listenSock;
|
struct ncclSocket* listenSock;
|
||||||
NCCLCHECK(ncclCalloc(&listenSock, 1));
|
struct bootstrapRootArgs* args;
|
||||||
memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress));
|
|
||||||
NCCLCHECK(ncclSocketListen(listenSock));
|
|
||||||
memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
|
|
||||||
pthread_t thread;
|
pthread_t thread;
|
||||||
pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
|
|
||||||
|
NCCLCHECK(ncclCalloc(&listenSock, 1));
|
||||||
|
NCCLCHECK(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0));
|
||||||
|
NCCLCHECK(ncclSocketListen(listenSock));
|
||||||
|
NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr));
|
||||||
|
|
||||||
|
NCCLCHECK(ncclCalloc(&args, 1));
|
||||||
|
args->listenSock = listenSock;
|
||||||
|
args->magic = handle->magic;
|
||||||
|
NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0);
|
||||||
ncclSetThreadName(thread, "NCCL BootstrapR");
|
ncclSetThreadName(thread, "NCCL BootstrapR");
|
||||||
pthread_detach(thread); // will not be pthread_join()'d
|
NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
|
||||||
static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
memset(handle, 0, sizeof(ncclBootstrapHandle));
|
||||||
memset(id, 0, sizeof(ncclUniqueId));
|
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
|
||||||
union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id;
|
|
||||||
|
|
||||||
char* env = getenv("NCCL_COMM_ID");
|
char* env = getenv("NCCL_COMM_ID");
|
||||||
if (env) {
|
if (env) {
|
||||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||||
if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
|
if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) {
|
||||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||||
return ncclInvalidArgument;
|
return ncclInvalidArgument;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
|
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
|
||||||
NCCLCHECK(bootstrapCreateRoot(id, false));
|
NCCLCHECK(bootstrapCreateRoot(handle, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -209,38 +221,39 @@ struct bootstrapState {
|
|||||||
int cudaDev;
|
int cudaDev;
|
||||||
int rank;
|
int rank;
|
||||||
int nranks;
|
int nranks;
|
||||||
|
uint64_t magic;
|
||||||
volatile uint32_t *abortFlag;
|
volatile uint32_t *abortFlag;
|
||||||
};
|
};
|
||||||
|
|
||||||
ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) {
|
||||||
int rank = comm->rank;
|
int rank = comm->rank;
|
||||||
int nranks = comm->nRanks;
|
int nranks = comm->nRanks;
|
||||||
struct bootstrapState* state;
|
struct bootstrapState* state;
|
||||||
|
struct ncclSocket* proxySocket;
|
||||||
|
ncclSocketAddress nextAddr;
|
||||||
|
struct ncclSocket sock, listenSockRoot;
|
||||||
|
struct extInfo info = { 0 };
|
||||||
|
|
||||||
NCCLCHECK(ncclCalloc(&state, 1));
|
NCCLCHECK(ncclCalloc(&state, 1));
|
||||||
state->rank = rank;
|
state->rank = rank;
|
||||||
state->nranks = nranks;
|
state->nranks = nranks;
|
||||||
state->abortFlag = comm->abortFlag;
|
state->abortFlag = comm->abortFlag;
|
||||||
comm->bootstrap = state;
|
comm->bootstrap = state;
|
||||||
|
comm->magic = state->magic = handle->magic;
|
||||||
|
|
||||||
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
|
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
|
||||||
|
|
||||||
struct extInfo info = { 0 };
|
|
||||||
info.rank = rank;
|
info.rank = rank;
|
||||||
info.nranks = nranks;
|
info.nranks = nranks;
|
||||||
struct ncclSocket sock, listenSockRoot;
|
|
||||||
|
|
||||||
NCCLCHECK(ncclSocketInit(&sock, (union ncclSocketAddress*) id, comm->abortFlag, 0));
|
|
||||||
NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->abortFlag, 0));
|
|
||||||
NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->abortFlag, 0));
|
|
||||||
NCCLCHECK(ncclSocketInit(&state->ringSendSocket, NULL, comm->abortFlag, 0));
|
|
||||||
NCCLCHECK(ncclSocketInit(&state->ringRecvSocket, NULL, comm->abortFlag, 0));
|
|
||||||
// Create socket for other ranks to contact me
|
// Create socket for other ranks to contact me
|
||||||
|
NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
|
||||||
NCCLCHECK(ncclSocketListen(&state->listenSock));
|
NCCLCHECK(ncclSocketListen(&state->listenSock));
|
||||||
memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
|
NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen));
|
||||||
|
|
||||||
// Create socket for root to contact me
|
// Create socket for root to contact me
|
||||||
|
NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
|
||||||
NCCLCHECK(ncclSocketListen(&listenSockRoot));
|
NCCLCHECK(ncclSocketListen(&listenSockRoot));
|
||||||
memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
|
NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot));
|
||||||
|
|
||||||
// stagger connection times to avoid an overload of the root
|
// stagger connection times to avoid an overload of the root
|
||||||
if (nranks > 128) {
|
if (nranks > 128) {
|
||||||
@ -253,32 +266,37 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// send info on my listening socket to root
|
// send info on my listening socket to root
|
||||||
|
NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
|
||||||
NCCLCHECK(ncclSocketConnect(&sock));
|
NCCLCHECK(ncclSocketConnect(&sock));
|
||||||
NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
|
NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
|
||||||
close(sock.fd);
|
NCCLCHECK(ncclSocketClose(&sock));
|
||||||
|
|
||||||
// get info on my "next" rank in the bootstrap ring from root
|
// get info on my "next" rank in the bootstrap ring from root
|
||||||
|
NCCLCHECK(ncclSocketInit(&sock));
|
||||||
NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
|
NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
|
||||||
NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress)));
|
NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress)));
|
||||||
close(sock.fd);
|
NCCLCHECK(ncclSocketClose(&sock));
|
||||||
close(listenSockRoot.fd);
|
NCCLCHECK(ncclSocketClose(&listenSockRoot));
|
||||||
|
|
||||||
|
NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
|
||||||
NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
|
NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
|
||||||
// Accept the connect request from the previous rank in the AllGather ring
|
// Accept the connect request from the previous rank in the AllGather ring
|
||||||
|
NCCLCHECK(ncclSocketInit(&state->ringRecvSocket));
|
||||||
NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
|
NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
|
||||||
|
|
||||||
// AllGather all listen handlers
|
// AllGather all listen handlers
|
||||||
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
|
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
|
||||||
memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress));
|
NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank));
|
||||||
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
|
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
|
||||||
|
|
||||||
// Create the service proxy
|
// Create the service proxy
|
||||||
NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
|
NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
|
||||||
struct ncclSocket* proxySocket;
|
|
||||||
|
// proxy is aborted through a message; don't set abortFlag
|
||||||
NCCLCHECK(ncclCalloc(&proxySocket, 1));
|
NCCLCHECK(ncclCalloc(&proxySocket, 1));
|
||||||
NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, NULL, 0));
|
NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
|
||||||
NCCLCHECK(ncclSocketListen(proxySocket));
|
NCCLCHECK(ncclSocketListen(proxySocket));
|
||||||
memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
|
NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
|
||||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
|
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
|
||||||
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
|
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
|
||||||
|
|
||||||
@ -314,16 +332,21 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
|
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||||
struct ncclSocket sock;
|
struct ncclSocket sock;
|
||||||
|
|
||||||
NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, 1));
|
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
|
||||||
NCCLCHECK(ncclSocketConnect(&sock));
|
NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
|
||||||
NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
|
NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
|
||||||
NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
|
NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
|
||||||
NCCLCHECK(bootstrapNetSend(&sock, data, size));
|
NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail);
|
||||||
close(sock.fd);
|
|
||||||
return ncclSuccess;
|
exit:
|
||||||
|
NCCLCHECK(ncclSocketClose(&sock));
|
||||||
|
return ret;
|
||||||
|
fail:
|
||||||
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
|
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
|
||||||
@ -382,9 +405,10 @@ ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag,
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
|
ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) {
|
||||||
struct unexConn* elem = state->unexpectedConnections;
|
struct unexConn* elem = state->unexpectedConnections;
|
||||||
struct unexConn* prev = NULL;
|
struct unexConn* prev = NULL;
|
||||||
|
*found = 0;
|
||||||
while (elem) {
|
while (elem) {
|
||||||
if (elem->peer == peer && elem->tag == tag) {
|
if (elem->peer == peer && elem->tag == tag) {
|
||||||
if (prev == NULL) {
|
if (prev == NULL) {
|
||||||
@ -394,54 +418,75 @@ ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag,
|
|||||||
}
|
}
|
||||||
memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
|
memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
|
||||||
free(elem);
|
free(elem);
|
||||||
|
*found = 1;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
prev = elem;
|
prev = elem;
|
||||||
elem = elem->next;
|
elem = elem->next;
|
||||||
}
|
}
|
||||||
sock->fd = -1;
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void unexpectedFree(struct bootstrapState* state) {
|
||||||
|
struct unexConn* elem = state->unexpectedConnections;
|
||||||
|
struct unexConn* prev = NULL;
|
||||||
|
|
||||||
|
while (elem) {
|
||||||
|
prev = elem;
|
||||||
|
elem = elem->next;
|
||||||
|
free(prev);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// We can't know who we'll receive from, so we need to receive everything at once
|
// We can't know who we'll receive from, so we need to receive everything at once
|
||||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
|
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||||
struct ncclSocket sock;
|
struct ncclSocket sock;
|
||||||
|
int newPeer, newTag;
|
||||||
|
|
||||||
// Search unexpected connections first
|
// Search unexpected connections first
|
||||||
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
|
int found;
|
||||||
if (sock.fd != -1) {
|
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
|
||||||
NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
|
if (found) {
|
||||||
close(sock.fd);
|
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
|
||||||
return ncclSuccess;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Then look for new connections
|
// Then look for new connections
|
||||||
NCCLCHECK(ncclSocketInit(&sock, NULL, state->listenSock.abortFlag, 0));
|
|
||||||
while (1) {
|
while (1) {
|
||||||
NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
|
NCCLCHECKGOTO(ncclSocketInit(&sock), ret, fail);
|
||||||
int newPeer, newTag;
|
NCCLCHECKGOTO(ncclSocketAccept(&sock, &state->listenSock), ret, fail);
|
||||||
NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int)));
|
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail);
|
||||||
NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int)));
|
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail);
|
||||||
if (newPeer == peer && newTag == tag) {
|
if (newPeer == peer && newTag == tag) {
|
||||||
NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
|
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
|
||||||
close(sock.fd);
|
goto exit;
|
||||||
return ncclSuccess;
|
|
||||||
}
|
}
|
||||||
// Unexpected connection. Save for later.
|
// Unexpected connection. Save for later.
|
||||||
NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock));
|
NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail);
|
||||||
}
|
}
|
||||||
|
exit:
|
||||||
|
NCCLCHECK(ncclSocketClose(&sock));
|
||||||
|
return ret;
|
||||||
|
fail:
|
||||||
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t bootstrapClose(void* commState) {
|
ncclResult_t bootstrapClose(void* commState) {
|
||||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||||
if (state->unexpectedConnections != NULL) {
|
if (state->unexpectedConnections != NULL) {
|
||||||
|
unexpectedFree(state);
|
||||||
|
if (*state->abortFlag == 0) {
|
||||||
WARN("Unexpected connections are not empty");
|
WARN("Unexpected connections are not empty");
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
if (state->listenSock.fd >= 0) close(state->listenSock.fd);
|
}
|
||||||
if (state->ringSendSocket.fd >= 0) close(state->ringSendSocket.fd);
|
|
||||||
if (state->ringRecvSocket.fd >= 0) close(state->ringRecvSocket.fd);
|
NCCLCHECK(ncclSocketClose(&state->listenSock));
|
||||||
|
NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
|
||||||
|
NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
|
||||||
|
|
||||||
free(state->peerCommAddresses);
|
free(state->peerCommAddresses);
|
||||||
free(state);
|
free(state);
|
||||||
@ -452,9 +497,9 @@ ncclResult_t bootstrapClose(void* commState) {
|
|||||||
ncclResult_t bootstrapAbort(void* commState) {
|
ncclResult_t bootstrapAbort(void* commState) {
|
||||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||||
if (commState == NULL) return ncclSuccess;
|
if (commState == NULL) return ncclSuccess;
|
||||||
if (state->listenSock.fd) close(state->listenSock.fd);
|
NCCLCHECK(ncclSocketClose(&state->listenSock));
|
||||||
if (state->ringSendSocket.fd) close(state->ringSendSocket.fd);
|
NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
|
||||||
if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd);
|
NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
|
||||||
free(state->peerCommAddresses);
|
free(state->peerCommAddresses);
|
||||||
free(state->peerProxyAddresses);
|
free(state->peerProxyAddresses);
|
||||||
free(state);
|
free(state);
|
||||||
|
@ -11,7 +11,13 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
|
|||||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
// Just pass the size of one message and not the total bytes sent/received.
|
||||||
|
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
|
||||||
|
};
|
||||||
|
size_t msgsize = sendcount * ncclTypeSize(datatype);
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
||||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||||
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
||||||
|
@ -5,12 +5,25 @@
|
|||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "enqueue.h"
|
#include "enqueue.h"
|
||||||
|
#include "nccl.h"
|
||||||
|
|
||||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
struct NvtxParamsAllReduce {
|
||||||
|
size_t bytes;
|
||||||
|
ncclRedOp_t op;
|
||||||
|
};
|
||||||
|
// Just pass the size of one message and not the total bytes sent/received.
|
||||||
|
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
||||||
|
offsetof(NvtxParamsAllReduce, op)}
|
||||||
|
};
|
||||||
|
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
||||||
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
||||||
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
||||||
|
@ -11,7 +11,17 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
|
|||||||
ncclComm_t comm, cudaStream_t stream);
|
ncclComm_t comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||||
ncclComm_t comm, cudaStream_t stream) {
|
ncclComm_t comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
struct NvtxParamsBroadcast {
|
||||||
|
size_t bytes;
|
||||||
|
int root;
|
||||||
|
};
|
||||||
|
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
|
||||||
|
};
|
||||||
|
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
||||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||||
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
||||||
|
@ -37,6 +37,7 @@ struct ncclShmemData {
|
|||||||
};
|
};
|
||||||
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
|
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
|
||||||
int channelId;
|
int channelId;
|
||||||
|
int aborted;
|
||||||
alignas(16) struct ncclDevComm comm;
|
alignas(16) struct ncclDevComm comm;
|
||||||
alignas(16) struct ncclDevChannel channel;
|
alignas(16) struct ncclDevChannel channel;
|
||||||
alignas(16) struct ncclWork work;
|
alignas(16) struct ncclWork work;
|
||||||
@ -135,6 +136,8 @@ __device__ void ncclKernel(
|
|||||||
}
|
}
|
||||||
__syncthreads(); // publish ncclShmem.channelId
|
__syncthreads(); // publish ncclShmem.channelId
|
||||||
int channelId = ncclShmem.channelId;
|
int channelId = ncclShmem.channelId;
|
||||||
|
/* set abort flag to 0 */
|
||||||
|
if (tid == 0) ncclShmem.aborted = 0;
|
||||||
|
|
||||||
if (true) {
|
if (true) {
|
||||||
void *dst, *src;
|
void *dst, *src;
|
||||||
|
@ -62,7 +62,10 @@ class Primitives<
|
|||||||
inline __device__ bool checkAbort(int &spins) {
|
inline __device__ bool checkAbort(int &spins) {
|
||||||
spins++;
|
spins++;
|
||||||
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||||
flags |= *ncclShmem.comm.abortFlag ? Aborted : 0;
|
if (*ncclShmem.comm.abortFlag) {
|
||||||
|
flags |= Aborted;
|
||||||
|
ncclShmem.aborted = 1;
|
||||||
|
}
|
||||||
spins = 0;
|
spins = 0;
|
||||||
}
|
}
|
||||||
return flags & Aborted;
|
return flags & Aborted;
|
||||||
@ -176,6 +179,9 @@ class Primitives<
|
|||||||
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
|
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
|
||||||
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
|
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
|
||||||
subBarrier();
|
subBarrier();
|
||||||
|
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
|
||||||
|
* to 0 to avoid unnecessary workload. */
|
||||||
|
size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
|
||||||
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
|
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
|
||||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||||
if (Send) {
|
if (Send) {
|
||||||
@ -184,7 +190,7 @@ class Primitives<
|
|||||||
(tid, nworkers, nullptr, false,
|
(tid, nworkers, nullptr, false,
|
||||||
1, (T const**)ncclShmem.groups[group].srcs,
|
1, (T const**)ncclShmem.groups[group].srcs,
|
||||||
fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
|
fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
|
||||||
sliceSize);
|
workSize);
|
||||||
}
|
}
|
||||||
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
|
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
|
||||||
// For broadcast in CollNet to do empty send
|
// For broadcast in CollNet to do empty send
|
||||||
@ -192,7 +198,7 @@ class Primitives<
|
|||||||
(tid, nworkers, ncclShmem.redOpArgs, postOp,
|
(tid, nworkers, ncclShmem.redOpArgs, postOp,
|
||||||
Recv, (T const**)ncclShmem.groups[group].srcs,
|
Recv, (T const**)ncclShmem.groups[group].srcs,
|
||||||
Dst, (T**)ncclShmem.groups[group].dsts,
|
Dst, (T**)ncclShmem.groups[group].dsts,
|
||||||
sliceSize);
|
workSize);
|
||||||
} else {
|
} else {
|
||||||
constexpr int PreOpN = SrcBuf != Input ? 0 :
|
constexpr int PreOpN = SrcBuf != Input ? 0 :
|
||||||
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
|
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
|
||||||
@ -200,7 +206,7 @@ class Primitives<
|
|||||||
(tid, nworkers, ncclShmem.redOpArgs, postOp,
|
(tid, nworkers, ncclShmem.redOpArgs, postOp,
|
||||||
Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
|
Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
|
||||||
Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
|
Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
|
||||||
sliceSize);
|
workSize);
|
||||||
}
|
}
|
||||||
barrier(); // This barrier has a counterpart in following loop
|
barrier(); // This barrier has a counterpart in following loop
|
||||||
if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
|
if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
|
||||||
|
@ -6,12 +6,26 @@
|
|||||||
|
|
||||||
#include "enqueue.h"
|
#include "enqueue.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
#include "nccl.h"
|
||||||
|
|
||||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
struct NvtxParamsReduce {
|
||||||
|
size_t bytes;
|
||||||
|
int root;
|
||||||
|
ncclRedOp_t op;
|
||||||
|
};
|
||||||
|
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
||||||
|
offsetof(NvtxParamsReduce, op)}
|
||||||
|
};
|
||||||
|
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
||||||
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
||||||
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
||||||
|
@ -6,12 +6,24 @@
|
|||||||
|
|
||||||
#include "enqueue.h"
|
#include "enqueue.h"
|
||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
|
#include "nccl.h"
|
||||||
|
|
||||||
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
struct NvtxParamsReduceScatter {
|
||||||
|
size_t bytes;
|
||||||
|
ncclRedOp_t op;
|
||||||
|
};
|
||||||
|
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
||||||
|
offsetof(NvtxParamsReduceScatter, op)}
|
||||||
|
};
|
||||||
|
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
||||||
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
||||||
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
||||||
|
@ -8,11 +8,22 @@
|
|||||||
#include "collectives.h"
|
#include "collectives.h"
|
||||||
#include "argcheck.h" // Need some checks here since we access comm
|
#include "argcheck.h" // Need some checks here since we access comm
|
||||||
|
|
||||||
|
struct NvtxParamsSendRecv {
|
||||||
|
size_t bytes;
|
||||||
|
int peer;
|
||||||
|
};
|
||||||
|
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
||||||
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
|
||||||
|
};
|
||||||
|
|
||||||
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||||
ncclComm_t comm, cudaStream_t stream);
|
ncclComm_t comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||||
ncclComm_t comm, cudaStream_t stream) {
|
ncclComm_t comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncSend, "Send",
|
struct ncclInfo info = { ncclFuncSend, "Send",
|
||||||
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||||
1, 1 };
|
1, 1 };
|
||||||
@ -27,7 +38,9 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
|
|||||||
ncclComm_t comm, cudaStream_t stream);
|
ncclComm_t comm, cudaStream_t stream);
|
||||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||||
ncclComm_t comm, cudaStream_t stream) {
|
ncclComm_t comm, cudaStream_t stream) {
|
||||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
|
||||||
|
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
|
||||||
|
|
||||||
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
||||||
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||||
1, 1 };
|
1, 1 };
|
||||||
|
@ -609,8 +609,7 @@ static ncclResult_t scheduleP2pTasksToPlan(
|
|||||||
|
|
||||||
// Compute how much to split operations
|
// Compute how much to split operations
|
||||||
// Natural step size matching buffer steps.
|
// Natural step size matching buffer steps.
|
||||||
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
ssize_t stepSize = comm->p2pChunkSize;
|
||||||
if (comm->nNodes > 1) stepSize = comm->p2pNetChunkSize;
|
|
||||||
// Try to use all channels
|
// Try to use all channels
|
||||||
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
||||||
int nChannelsMin = nChannelsMax;
|
int nChannelsMin = nChannelsMax;
|
||||||
@ -1008,7 +1007,13 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
|
|||||||
|
|
||||||
#if CUDART_VERSION >= 11080
|
#if CUDART_VERSION >= 11080
|
||||||
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
|
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
|
||||||
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", 0);
|
#define NCCL_CGA_CLUSTER_SIZE_SM90 4
|
||||||
|
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if CUDART_VERSION >= 12000
|
||||||
|
// NCCL uses the "Remote" Mem Sync domain by default
|
||||||
|
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||||
@ -1022,8 +1027,10 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
|||||||
#if CUDART_VERSION >= 11080
|
#if CUDART_VERSION >= 11080
|
||||||
int driverVersion;
|
int driverVersion;
|
||||||
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
|
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
|
||||||
|
if (driverVersion >= 11080) {
|
||||||
unsigned int clusterSize = 0;
|
int compCap = comm->compCap;
|
||||||
|
unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0;
|
||||||
|
if (ncclParamCGAClusterSize() != -2) {
|
||||||
clusterSize = ncclParamCGAClusterSize();
|
clusterSize = ncclParamCGAClusterSize();
|
||||||
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
|
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
|
||||||
static bool warned = false;
|
static bool warned = false;
|
||||||
@ -1034,10 +1041,11 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
|||||||
}
|
}
|
||||||
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
|
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (clusterSize && driverVersion >= 11080) {
|
|
||||||
cudaLaunchConfig_t launchConfig = {0};
|
cudaLaunchConfig_t launchConfig = {0};
|
||||||
cudaLaunchAttribute launchAttrs[2];
|
cudaLaunchAttribute launchAttrs[3];
|
||||||
|
int attrs = 0;
|
||||||
/* Cooperative Group Array (CGA)
|
/* Cooperative Group Array (CGA)
|
||||||
* On sm90 and later we have an extra level of hierarchy where we
|
* On sm90 and later we have an extra level of hierarchy where we
|
||||||
* can group together several blocks within the Grid, called
|
* can group together several blocks within the Grid, called
|
||||||
@ -1048,17 +1056,25 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
|||||||
* concurrently scheduled onto a group of SMs.
|
* concurrently scheduled onto a group of SMs.
|
||||||
* The maximum value is 8 and it must be divisible into the grid dimensions
|
* The maximum value is 8 and it must be divisible into the grid dimensions
|
||||||
*/
|
*/
|
||||||
|
if (clusterSize) {
|
||||||
// Grid dimension must be divisible by clusterSize
|
// Grid dimension must be divisible by clusterSize
|
||||||
if (grid.x % clusterSize) clusterSize = 1;
|
if (grid.x % clusterSize) clusterSize = 1;
|
||||||
launchAttrs[0].id = cudaLaunchAttributeClusterDimension;
|
launchAttrs[attrs].id = cudaLaunchAttributeClusterDimension;
|
||||||
launchAttrs[0].val.clusterDim = {clusterSize, 1, 1};
|
launchAttrs[attrs++].val.clusterDim = {clusterSize, 1, 1};
|
||||||
launchAttrs[1].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
|
launchAttrs[attrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
|
||||||
launchAttrs[1].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
|
launchAttrs[attrs++].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
|
||||||
|
}
|
||||||
|
#if CUDART_VERSION >= 12000
|
||||||
|
if (compCap >= 90 && driverVersion >= 12000) {
|
||||||
|
// Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90)
|
||||||
|
launchAttrs[attrs].id = cudaLaunchAttributeMemSyncDomain;
|
||||||
|
launchAttrs[attrs++].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
launchConfig.gridDim = grid;
|
launchConfig.gridDim = grid;
|
||||||
launchConfig.blockDim = block;
|
launchConfig.blockDim = block;
|
||||||
launchConfig.attrs = launchAttrs;
|
launchConfig.attrs = launchAttrs;
|
||||||
launchConfig.numAttrs = sizeof(launchAttrs)/sizeof(launchAttrs[0]);
|
launchConfig.numAttrs = attrs;
|
||||||
launchConfig.stream = launchStream;
|
launchConfig.stream = launchStream;
|
||||||
|
|
||||||
CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
|
CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
|
||||||
@ -1093,14 +1109,18 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
|||||||
// back to us for reclaiming via callbackQueue.
|
// back to us for reclaiming via callbackQueue.
|
||||||
ncclIntruQueueConstruct(&comm->planQueue);
|
ncclIntruQueueConstruct(&comm->planQueue);
|
||||||
cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch
|
cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch
|
||||||
// Create dependency for deviceStream on launchStream.
|
// Create dependency for deviceStream on launchStream. We know that deviceStream
|
||||||
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream), result, resume1);
|
// hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
|
||||||
|
// so we can say that launchStream subsumes it.
|
||||||
|
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
|
||||||
resume1:
|
resume1:
|
||||||
// Create dependency for other user streams (skip launch stream).
|
// Create dependency for other user streams (skip launch stream) on deviceStream.
|
||||||
|
// Again, the user streams haven't been touched since deviceStream waited on them
|
||||||
|
// so we can say they are subsumed by deviceStream.
|
||||||
struct ncclCudaStreamList* sl = tasks->streams->next;
|
struct ncclCudaStreamList* sl = tasks->streams->next;
|
||||||
tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
|
tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
|
||||||
while (sl != nullptr) {
|
while (sl != nullptr) {
|
||||||
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream), result, resume2);
|
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2);
|
||||||
resume2:
|
resume2:
|
||||||
sl = sl->next;
|
sl = sl->next;
|
||||||
}
|
}
|
||||||
|
@ -754,3 +754,15 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
|
|||||||
*nranks = nvbGpus;
|
*nranks = nvbGpus;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
|
||||||
|
int minPath = PATH_DIS;
|
||||||
|
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||||
|
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
|
||||||
|
for (int j=0; j<system->nodes[GPU].count; j++) {
|
||||||
|
if (i == j) continue;
|
||||||
|
minPath = std::min(minPath, paths[j].type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return minPath >= PATH_PIX ? 0 : 1;
|
||||||
|
}
|
||||||
|
@ -72,6 +72,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
|
|||||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
|
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
|
||||||
}
|
}
|
||||||
|
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
|
||||||
|
*bw = AMD_BW;
|
||||||
|
}
|
||||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
|
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||||
#define QPI_BW 6.0
|
#define QPI_BW 6.0
|
||||||
#define SKL_QPI_BW 9.0
|
#define SKL_QPI_BW 9.0
|
||||||
|
#define AMD_BW 16.0
|
||||||
#define ZPI_BW 6.0
|
#define ZPI_BW 6.0
|
||||||
#define YONGFENG_ZPI_BW 9.0
|
#define YONGFENG_ZPI_BW 9.0
|
||||||
#define P9_BW 32.0
|
#define P9_BW 32.0
|
||||||
|
@ -10,10 +10,16 @@
|
|||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
#include "comm.h"
|
#include "comm.h"
|
||||||
|
|
||||||
|
struct ncclBootstrapHandle {
|
||||||
|
uint64_t magic;
|
||||||
|
union ncclSocketAddress addr;
|
||||||
|
};
|
||||||
|
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
|
||||||
|
|
||||||
ncclResult_t bootstrapNetInit();
|
ncclResult_t bootstrapNetInit();
|
||||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
|
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
|
||||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
|
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
|
||||||
ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm);
|
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
|
||||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
||||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
||||||
|
@ -171,9 +171,12 @@ struct ncclComm {
|
|||||||
uint64_t* connectSend;
|
uint64_t* connectSend;
|
||||||
uint64_t* connectRecv;
|
uint64_t* connectRecv;
|
||||||
|
|
||||||
|
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||||
|
|
||||||
int rank; // my rank in the communicator
|
int rank; // my rank in the communicator
|
||||||
int nRanks; // number of GPUs in communicator
|
int nRanks; // number of GPUs in communicator
|
||||||
int cudaDev; // my cuda device index
|
int cudaDev; // my cuda device index
|
||||||
|
int compCap; // compute capability of the GPU
|
||||||
int64_t busId; // my PCI bus ID in int format
|
int64_t busId; // my PCI bus ID in int format
|
||||||
cpu_set_t cpuAffinity; // CPU affinity of the GPU
|
cpu_set_t cpuAffinity; // CPU affinity of the GPU
|
||||||
|
|
||||||
@ -208,7 +211,7 @@ struct ncclComm {
|
|||||||
|
|
||||||
// Buffer sizes
|
// Buffer sizes
|
||||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||||
int p2pNetChunkSize;
|
int p2pChunkSize;
|
||||||
|
|
||||||
// Algorithm/Protocols thresholds
|
// Algorithm/Protocols thresholds
|
||||||
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
@ -240,7 +243,6 @@ struct ncclComm {
|
|||||||
// Intra-process sync
|
// Intra-process sync
|
||||||
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
||||||
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
|
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
|
||||||
int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
|
|
||||||
int intraRank;
|
int intraRank;
|
||||||
int intraRanks;
|
int intraRanks;
|
||||||
uint32_t intraBarrierPhase;
|
uint32_t intraBarrierPhase;
|
||||||
|
@ -28,6 +28,7 @@ void ncclTopoFree(struct ncclTopoSystem* system);
|
|||||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||||
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
||||||
|
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
|
||||||
|
|
||||||
// Query topology
|
// Query topology
|
||||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
|
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
|
||||||
|
@ -9,6 +9,77 @@
|
|||||||
|
|
||||||
#include "nvtx3.hpp"
|
#include "nvtx3.hpp"
|
||||||
|
|
||||||
|
#if __cpp_constexpr >= 201304L && !defined(NVTX3_RELAXED_CONSTEXPR)
|
||||||
|
#define NVTX3_RELAXED_CONSTEXPR constexpr
|
||||||
|
#else
|
||||||
|
#define NVTX3_RELAXED_CONSTEXPR
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Define all NCCL-provided static schema IDs here (avoid duplicates).
|
||||||
|
#define NVTX_SID_CommInitRank 0
|
||||||
|
#define NVTX_SID_CommInitAll 1
|
||||||
|
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
|
||||||
|
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
|
||||||
|
#define NVTX_SID_AllGather 4
|
||||||
|
#define NVTX_SID_AllReduce 5
|
||||||
|
#define NVTX_SID_Broadcast 6
|
||||||
|
#define NVTX_SID_ReduceScatter 7
|
||||||
|
#define NVTX_SID_Reduce 8
|
||||||
|
#define NVTX_SID_Send 9
|
||||||
|
#define NVTX_SID_Recv 10
|
||||||
|
|
||||||
|
// Define static schema ID for the reduction operation.
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||||
|
|
||||||
|
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
|
||||||
|
|
||||||
struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
||||||
|
|
||||||
|
class payload_schema {
|
||||||
|
public:
|
||||||
|
NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
|
||||||
|
{
|
||||||
|
schema_attr.name = schemaName;
|
||||||
|
schema_attr.entries = entries;
|
||||||
|
schema_attr.numEntries = numEntries;
|
||||||
|
schema_attr.schemaId = schemaId;
|
||||||
|
nvtxPayloadSchemaRegister(nvtx3::domain::get<nccl_domain>(), &schema_attr);
|
||||||
|
}
|
||||||
|
|
||||||
|
payload_schema() = delete;
|
||||||
|
~payload_schema() = default;
|
||||||
|
payload_schema(payload_schema const&) = default;
|
||||||
|
payload_schema& operator=(payload_schema const&) = default;
|
||||||
|
payload_schema(payload_schema&&) = default;
|
||||||
|
payload_schema& operator=(payload_schema&&) = default;
|
||||||
|
|
||||||
|
private:
|
||||||
|
nvtxPayloadSchemaAttr_t schema_attr{
|
||||||
|
NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
|
||||||
|
NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES |
|
||||||
|
NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
|
||||||
|
NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE |
|
||||||
|
NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
|
||||||
|
nullptr,
|
||||||
|
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
|
||||||
|
NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
|
||||||
|
nullptr, 0, 0, 0};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create NVTX push/pop range with parameters
|
||||||
|
// @param name of the operation (see `NVTX_SID_*`)
|
||||||
|
// @param N schema name
|
||||||
|
// @param S schema (entries)
|
||||||
|
// @param P payload (struct)
|
||||||
|
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
|
||||||
|
static const payload_schema schema{S, std::extent<decltype(S)>::value, \
|
||||||
|
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
|
||||||
|
static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \
|
||||||
|
nvtxPayloadData_t nvtx3_bpl__[] = { \
|
||||||
|
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
|
||||||
|
::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
|
||||||
|
::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
|
||||||
|
|
||||||
|
extern void initNvtxRegisteredEnums();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -92,6 +92,7 @@
|
|||||||
/* clang-format on */
|
/* clang-format on */
|
||||||
|
|
||||||
#include <nvtx3/nvToolsExt.h>
|
#include <nvtx3/nvToolsExt.h>
|
||||||
|
#include <nvtx3/nvToolsExtPayload.h>
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -1732,6 +1733,22 @@ class event_attributes {
|
|||||||
attributes_.messageType = m.get_type();
|
attributes_.messageType = m.get_type();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Variadic constructor where the first argument is a binary payload.
|
||||||
|
*
|
||||||
|
* Sets the value of the `EventAttribute`s message based on `m` and forwards
|
||||||
|
* the remaining variadic parameter pack to the next constructor.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
template <typename... Args>
|
||||||
|
NVTX3_RELAXED_CONSTEXPR explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept
|
||||||
|
: event_attributes(args...)
|
||||||
|
{
|
||||||
|
attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY;
|
||||||
|
attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event.
|
||||||
|
attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl);
|
||||||
|
}
|
||||||
|
|
||||||
~event_attributes() = default;
|
~event_attributes() = default;
|
||||||
event_attributes(event_attributes const&) = default;
|
event_attributes(event_attributes const&) = default;
|
||||||
event_attributes& operator=(event_attributes const&) = default;
|
event_attributes& operator=(event_attributes const&) = default;
|
||||||
|
776
src/include/nvtx3/nvToolsExtPayload.h
Normal file
776
src/include/nvtx3/nvToolsExtPayload.h
Normal file
@ -0,0 +1,776 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "nvtx3/nvToolsExt.h"
|
||||||
|
|
||||||
|
#ifndef NVTOOLSEXT_PAYLOAD_H
|
||||||
|
#define NVTOOLSEXT_PAYLOAD_H
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief A compatibility ID value used in initialization to identify version
|
||||||
|
* differences.
|
||||||
|
*/
|
||||||
|
#define NVTX_EXT_COMPATID_PAYLOAD 0x0103
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief This module ID identifies the payload extension. It has to be unique
|
||||||
|
* among the extension modules.
|
||||||
|
*/
|
||||||
|
#define NVTX_EXT_MODULEID_PAYLOAD 2
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Additional values for the enum @ref nvtxPayloadType_t
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_TYPE_BINARY ((int32_t)0xDFBD0009)
|
||||||
|
|
||||||
|
|
||||||
|
/** ---------------------------------------------------------------------------
|
||||||
|
* Payload schema entry flags.
|
||||||
|
* ------------------------------------------------------------------------- */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Absolute pointer into a payload (entry) of the same event.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_POINTER (1 << 1)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Offset from base address of the payload.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Offset from the end of this payload entry.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The value is an array with fixed length, set with the field `arrayLength`.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The value is a zero-/null-terminated array.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief A single or multi-dimensional array of variable length.
|
||||||
|
*
|
||||||
|
* The field `arrayLength` contains the index of the schema entry that holds the
|
||||||
|
* length(s). If the other field points to a scalar entry then this will be the
|
||||||
|
* 1D array. If the other field points to a FIXED_SIZE array, then the number of
|
||||||
|
* dimensions is defined with the registration of the scheme. If the other field
|
||||||
|
* is ZERO_TERMINATED, the array the dimensions can be determined at runtime.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A tool may not support deep copy and just ignore this flag.
|
||||||
|
* See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 9)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The entry specifies the message in a deferred event. The entry type can be
|
||||||
|
* any string type. The flag is ignored for schemas that are not flagged with
|
||||||
|
* `NVTX_PAYLOAD_SCHEMA_FLAG_RANGE*` or `NVTX_PAYLOAD_SCHEMA_FLAG_MARK`.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @note The ‘array’ flags assume that the array is embedded. Otherwise,
|
||||||
|
* @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some
|
||||||
|
* combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this
|
||||||
|
* entry is enclosed. For instance, variable length embedded arrays are valid
|
||||||
|
* within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with
|
||||||
|
* @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for
|
||||||
|
* additional details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Helper macro to check if an entry represents an array. */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\
|
||||||
|
NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \
|
||||||
|
NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \
|
||||||
|
NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX)
|
||||||
|
|
||||||
|
/** ---------------------------------------------------------------------------
|
||||||
|
* Types of entries in a payload schema.
|
||||||
|
* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @note Several of the predefined types contain the size (in bits) in their
|
||||||
|
* names. For some data types the size (in bytes) is not fixed and may differ
|
||||||
|
* for different platforms/operating systems/compilers. To provide portability,
|
||||||
|
* an array of sizes (in bytes) for type 1 to 28 ( @ref
|
||||||
|
* NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE)
|
||||||
|
* is passed to the NVTX extension initialization function
|
||||||
|
* @ref InitializeInjectionNvtxExtension via the `extInfo` field of
|
||||||
|
* @ref nvtxExtModuleInfo_t.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic integer types.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR 1
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR 2
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SHORT 3
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_USHORT 4
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INT 5
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT 6
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_LONG 7
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_ULONG 8
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG 9
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG 10
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integer types with explicit size.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INT8 11
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT8 12
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INT16 13
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT16 14
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INT32 15
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT32 16
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INT64 17
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18
|
||||||
|
|
||||||
|
/**
|
||||||
|
* C floating point types
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Size type (`size_t`)
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Any address, e.g. `void*`. If the pointer type matters, use the flag @ref
|
||||||
|
* NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS 23
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Special character types.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR 24 /* wide character (since C90) */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 25 /* since C2x and C++20 */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 26
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 27
|
||||||
|
|
||||||
|
/**
|
||||||
|
* There is type size and alignment information for all previous types.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed.
|
||||||
|
* Typically a tool will display this as hex or binary.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32
|
||||||
|
|
||||||
|
/**
|
||||||
|
* These types do not have standardized equivalents. It is assumed that the
|
||||||
|
* number at the end corresponds to the bits used to store the value and that
|
||||||
|
* the alignment corresponds to standardized types of the same size.
|
||||||
|
* A tool may not support these types.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_INT128 33
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT128 34
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16 42
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32 43
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64 44
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128 45
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52
|
||||||
|
|
||||||
|
/**
|
||||||
|
* These types are normalized numbers stored in integers. UNORMs represent 0.0
|
||||||
|
* to 1.0 and SNORMs represent -1.0 to 1.0. The number after represents the
|
||||||
|
* number of integer bits. Alignment is take from equivalent types INT# matching
|
||||||
|
* to SNORM# and UINT# matching to UNORM#.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM8 61
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM8 62
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM16 63
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM16 64
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM32 65
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM32 66
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM64 67
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM64 68
|
||||||
|
|
||||||
|
/**
|
||||||
|
* String types.
|
||||||
|
*
|
||||||
|
* If `arrayOrUnionDetail` is greater than `0`, the entry is a fixed-size string
|
||||||
|
* with the provided length.
|
||||||
|
*
|
||||||
|
* `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is ignored for string types. It
|
||||||
|
* just specifies once more that the entry is a fixed-size string.
|
||||||
|
*
|
||||||
|
* Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` indicates a
|
||||||
|
* zero-terminated string. If `arrayOrUnionDetail` is greater than `0`, a zero-
|
||||||
|
* terminated array of fixed-size strings is assumed.
|
||||||
|
*
|
||||||
|
* Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies the
|
||||||
|
* entry index of the entry which contains the string length. It is not possible
|
||||||
|
* to describe a variable length array of strings.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @ref nvtxStringHandle_t returned by @ref nvtxDomainRegisterString
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Entry types to be used in deferred events. Data types are as defined by
|
||||||
|
* NVTXv3 core: category -> uint32_t, color -> uint32_t, color type -> int32_t.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_CATEGORY 90
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLORTYPE 91
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLOR 92
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This type marks the union selector member (entry index) in schemas used by
|
||||||
|
* a union with internal internal selector.
|
||||||
|
* See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Timestamp types occupy the range from 128 to 255
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP64 128 /* data type is uint64_t */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CPU timestamp sources.
|
||||||
|
* \todo All 64 bits?
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC 129
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC_NONVIRTUALIZED 130
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME 131
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME_COARSE 132
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC 133
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_RAW 134
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_COARSE 135
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_BOOTTIME 136
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_PROCESS_CPUTIME_ID 137
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_THREAD_CPUTIME_ID 138
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_QPC 160
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFT 161
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFTP 162
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIME 163
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_CLOCK 164
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIMESPEC_GET 165
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_STEADY_CLOCK 166
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_HIGH_RESOLUTION_CLOCK 167
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_SYSTEM_CLOCK 168
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_UTC_CLOCK 169
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_TAI_CLOCK 170
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_GPS_CLOCK 171
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_FILE_CLOCK 172
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief GPU timestamp sources.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_GLOBALTIMER 192
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK 193
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK64 194
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_CUPTI 195
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The timestamp was provided by the NVTX handler’s timestamp routine.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_TOOL_PROVIDED 224
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that
|
||||||
|
* the payload is a blob of memory which other payload entries may point into.
|
||||||
|
* A tool will not expose this payload directly.
|
||||||
|
*/
|
||||||
|
#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that
|
||||||
|
* the payload is a blob which can be shown with an arbitrary data viewer.
|
||||||
|
*/
|
||||||
|
#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023
|
||||||
|
|
||||||
|
/* Custom (static) schema IDs. */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24)
|
||||||
|
|
||||||
|
/* Dynamic schema IDs (generated by the tool) start here. */
|
||||||
|
#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 // 1 << 32
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Size and alignment information for predefined payload entry types.
|
||||||
|
*
|
||||||
|
* The struct contains the size and the alignment size in bytes. A respective
|
||||||
|
* array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX
|
||||||
|
* client/handler. The type (ID) is used as index into this array.
|
||||||
|
*/
|
||||||
|
typedef struct nvtxPayloadEntryTypeInfo_t
|
||||||
|
{
|
||||||
|
uint16_t size;
|
||||||
|
uint16_t align;
|
||||||
|
} nvtxPayloadEntryTypeInfo_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Entry in a schema.
|
||||||
|
*
|
||||||
|
* A payload schema consists of an array of payload schema entries. It is
|
||||||
|
* registered with @ref nvtxPayloadSchemaRegister. `flag` can be set to `0` for
|
||||||
|
* simple values, 'type' is the only "required" field. If not set explicitly,
|
||||||
|
* all other fields are zero-initialized, which means that the entry has no name
|
||||||
|
* and the offset is determined based on self-alignment rules.
|
||||||
|
*
|
||||||
|
* Example schema:
|
||||||
|
* nvtxPayloadSchemaEntry_t desc[] = {
|
||||||
|
* {0, NVTX_EXT_PAYLOAD_TYPE_UINT8, "one byte"},
|
||||||
|
* {0, NVTX_EXT_PAYLOAD_TYPE_INT32, "four bytes"}
|
||||||
|
* };
|
||||||
|
*/
|
||||||
|
typedef struct nvtxPayloadSchemaEntry_t
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* \brief Flags to augment the basic type.
|
||||||
|
*
|
||||||
|
* This field allows additional properties of the payload entry to be
|
||||||
|
* specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`.
|
||||||
|
*/
|
||||||
|
uint64_t flags;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Predefined payload schema entry type or ID of a registered payload
|
||||||
|
* schema.
|
||||||
|
*/
|
||||||
|
uint64_t type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Name of the payload entry. (Optional)
|
||||||
|
*
|
||||||
|
* Providing a name is useful to give a meaning to the associated value.
|
||||||
|
*/
|
||||||
|
const char* name;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Description of the payload entry. (Optional)
|
||||||
|
*/
|
||||||
|
const char* description;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief String or array length or union selector for union types.
|
||||||
|
*
|
||||||
|
* If @ref type is a C string type, this defines the length of the string.
|
||||||
|
*
|
||||||
|
* If @ref flags specify that the entry is an array, this field defines the
|
||||||
|
* length of the array. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* If @ref type implies that the entry is a union with schema type
|
||||||
|
* @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION (external selection of the union
|
||||||
|
* member), this field contains the index (starting with 0) to an entry of
|
||||||
|
* integer type in the same schema. The associated field contains the
|
||||||
|
* selected union member.
|
||||||
|
*
|
||||||
|
* @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not
|
||||||
|
* supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can
|
||||||
|
* be used instead.
|
||||||
|
*/
|
||||||
|
uint64_t arrayOrUnionDetail;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Offset in the binary payload data (in bytes).
|
||||||
|
*
|
||||||
|
* This field specifies the byte offset from the base address of the actual
|
||||||
|
* binary data (blob) to the data of this entry.
|
||||||
|
*
|
||||||
|
* This is an optional field, but it is recommended to specify this field to
|
||||||
|
* avoid issues in the automatic detection of the offset by a tool/handler.
|
||||||
|
*/
|
||||||
|
uint64_t offset;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Semantics are not yet defined.
|
||||||
|
*/
|
||||||
|
void* semantics;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reserved for future use. Do not use it!
|
||||||
|
*/
|
||||||
|
void* reserved;
|
||||||
|
} nvtxPayloadSchemaEntry_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Binary payload data, size and decoding information.
|
||||||
|
*
|
||||||
|
* An array of nvtxPayloadData_t is passed to the NVTX event attribute payload
|
||||||
|
* member. To attach a single payload the macro @ref NVTX_EXT_PAYLOAD_SET_ATTR
|
||||||
|
* can be used.
|
||||||
|
*/
|
||||||
|
typedef struct nvtxPayloadData_t
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* The schema ID, which defines the layout of the binary data.
|
||||||
|
*/
|
||||||
|
uint64_t schemaId;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Size of the binary payload (blob) in bytes.
|
||||||
|
*/
|
||||||
|
size_t size;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pointer to the binary payload data.
|
||||||
|
*/
|
||||||
|
const void* payload;
|
||||||
|
} nvtxPayloadData_t;
|
||||||
|
|
||||||
|
/* Helper macros for safe double-cast of pointer to uint64_t value */
|
||||||
|
#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE
|
||||||
|
# ifdef __cplusplus
|
||||||
|
# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \
|
||||||
|
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p))
|
||||||
|
# else
|
||||||
|
#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p)
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#define NVTX_PAYLOAD_CONCAT2(a,b) a##b
|
||||||
|
#define NVTX_PAYLOAD_CONCAT(a,b) NVTX_PAYLOAD_CONCAT2(a,b)
|
||||||
|
#define NVTX_DATA_VAR NVTX_PAYLOAD_CONCAT(nvtxDFDB,__LINE__)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Helper macro to attach a single payload to an NVTX event attribute.
|
||||||
|
*
|
||||||
|
* @note The NVTX push, start or mark operation must not be in the same or a
|
||||||
|
* nested scope.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_EVTATTR_SET(EVTATTR, SCHEMA_ID, PAYLOAD_ADDR, SIZE) \
|
||||||
|
nvtxPayloadData_t NVTX_DATA_VAR[] = {{SCHEMA_ID, SIZE, PAYLOAD_ADDR}}; \
|
||||||
|
(EVTATTR).payload.ullValue = \
|
||||||
|
NVTX_POINTER_AS_PAYLOAD_ULLVALUE(NVTX_DATA_VAR); \
|
||||||
|
(EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \
|
||||||
|
(EVTATTR).reserved0 = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Helper macro to attach multiple payloads to an NVTX event attribute.
|
||||||
|
*
|
||||||
|
* The payload data array (`nvtxPayloadData_t`) is passed as first argument to
|
||||||
|
* this macro.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(EVTATTR, PAYLOADS) \
|
||||||
|
(EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \
|
||||||
|
(EVTATTR).reserved0 = sizeof(PAYLOADS)/sizeof(nvtxPayloadData_t); \
|
||||||
|
(EVTATTR).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(PAYLOADS);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief The payload schema type.
|
||||||
|
*
|
||||||
|
* A schema can be either of these types.
|
||||||
|
*/
|
||||||
|
enum nvtxPayloadSchemaType
|
||||||
|
{
|
||||||
|
NVTX_PAYLOAD_SCHEMA_TYPE_INVALID = 0,
|
||||||
|
|
||||||
|
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC = 1,
|
||||||
|
NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC = 2,
|
||||||
|
|
||||||
|
NVTX_PAYLOAD_SCHEMA_TYPE_UNION = 3,
|
||||||
|
NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR = 4
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Flags for static and dynamic schemas.
|
||||||
|
*/
|
||||||
|
enum nvtxPayloadSchemaFlags
|
||||||
|
{
|
||||||
|
NVTX_PAYLOAD_SCHEMA_FLAG_NONE = 0,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This flag indicates that a schema and the corresponding payloads can
|
||||||
|
* contain fields which require a deep copy.
|
||||||
|
*/
|
||||||
|
NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY = (1 << 1),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This flag indicates that a schema and the corresponding payloads can
|
||||||
|
* be referenced by another payload of the same event.
|
||||||
|
*/
|
||||||
|
NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED = (1 << 2),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The schema describes a deferred event/marker. Such a schema requires one
|
||||||
|
* timestamp entry and one string entry with the flag
|
||||||
|
* `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be
|
||||||
|
* optionally specified with the respective entry types. The deferred event
|
||||||
|
* can contain a binary payload itself by using a custom schema ID as type
|
||||||
|
* its schema description. Multiple occurrences of the same event can be
|
||||||
|
* described by specifying an array timestamps.
|
||||||
|
*/
|
||||||
|
NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_EVENT = (1 << 3),
|
||||||
|
/**
|
||||||
|
* The schema describes a deferred event/marker. Such a schema requires
|
||||||
|
* one start timestamp, one end timestamp and one string entry with the flag
|
||||||
|
* `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be
|
||||||
|
* optionally specified with the respective entry types. The deferred range
|
||||||
|
* can contain a binary payload itself by using a custom schema ID as type
|
||||||
|
* its schema description.
|
||||||
|
*
|
||||||
|
* Timestamps can be provided in different ways:
|
||||||
|
* - A single range has two timestamp entries with the first (smaller entry
|
||||||
|
* index) being used as the start/push timestamp.
|
||||||
|
* - If the range schema contains one array of timestamps, the tool assumes
|
||||||
|
* that the array contains alternating start and end timestamps.
|
||||||
|
* - If two timestamp arrays are specified the first entry (with the
|
||||||
|
* smaller entry index) is assumed to contain the start timestamps. Both
|
||||||
|
* arrays have to be of the same size.
|
||||||
|
*/
|
||||||
|
NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_RANGE = (2 << 3)
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be
|
||||||
|
* specified via setting the field `fieldMask`.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7)
|
||||||
|
#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NVTX payload schema attributes.
|
||||||
|
*/
|
||||||
|
typedef struct nvtxPayloadSchemaAttr_t
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* \brief Mask of valid fields in this structure.
|
||||||
|
*
|
||||||
|
* The values from `enum nvtxPayloadSchemaAttributes` have to be used.
|
||||||
|
*/
|
||||||
|
uint64_t fieldMask;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Name of the payload schema. (Optional)
|
||||||
|
*/
|
||||||
|
const char* name;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD
|
||||||
|
*
|
||||||
|
* A value from `enum nvtxPayloadSchemaType` has to be used.
|
||||||
|
*/
|
||||||
|
uint64_t type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Payload schema flags. (Optional)
|
||||||
|
*
|
||||||
|
* Flags defined in `enum nvtxPayloadSchemaFlags` can be used to set
|
||||||
|
* additional properties of the schema.
|
||||||
|
*/
|
||||||
|
uint64_t flags;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Entries of a payload schema. (Mandatory) \anchor ENTRIES_FIELD
|
||||||
|
*
|
||||||
|
* This field is a pointer to an array of schema entries, each describing a
|
||||||
|
* field in a data structure, e.g. in a C struct or union.
|
||||||
|
*/
|
||||||
|
const nvtxPayloadSchemaEntry_t* entries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Number of entries in the payload schema. (Mandatory)
|
||||||
|
*
|
||||||
|
* Number of entries in the array of payload entries \ref ENTRIES_FIELD.
|
||||||
|
*/
|
||||||
|
size_t numEntries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief The binary payload size in bytes for static payload schemas.
|
||||||
|
*
|
||||||
|
* If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this
|
||||||
|
* value is ignored. If this field is not specified for a schema of type
|
||||||
|
* @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically
|
||||||
|
* determined by a tool.
|
||||||
|
*/
|
||||||
|
size_t payloadStaticSize;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief The byte alignment for packed structures.
|
||||||
|
*
|
||||||
|
* If not specified, this field defaults to `0`, which means that the fields
|
||||||
|
* in the data structure are not packed and natural alignment rules can be
|
||||||
|
* applied.
|
||||||
|
*/
|
||||||
|
size_t packAlign;
|
||||||
|
|
||||||
|
/* Static/custom schema ID must be
|
||||||
|
>= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
|
||||||
|
< NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
|
||||||
|
uint64_t schemaId;
|
||||||
|
} nvtxPayloadSchemaAttr_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Register a payload schema.
|
||||||
|
*
|
||||||
|
* @param domain NVTX domain handle.
|
||||||
|
* @param attr NVTX payload schema attributes.
|
||||||
|
*/
|
||||||
|
NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister(
|
||||||
|
nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Enumeration entry.
|
||||||
|
*
|
||||||
|
* Since the value of an enum entry might not be meaningful for the analysis,
|
||||||
|
* a tool can show the name of enum entry instead.
|
||||||
|
*
|
||||||
|
* @note EXPERIMENTAL
|
||||||
|
*/
|
||||||
|
typedef struct nvtxPayloadEnum_t
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Name of the enum value.
|
||||||
|
*/
|
||||||
|
const char* name;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Value of the enum entry.
|
||||||
|
*/
|
||||||
|
uint64_t value;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates that this entry sets a specific set of bits, which can be used
|
||||||
|
* to easily define bitsets.
|
||||||
|
*/
|
||||||
|
int8_t isFlag;
|
||||||
|
} nvtxPayloadEnum_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The values are used to set the field `fieldMask` and specify which fields in
|
||||||
|
* `nvtxPayloadEnumAttr_t` are set.
|
||||||
|
*/
|
||||||
|
#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1)
|
||||||
|
#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2)
|
||||||
|
#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3)
|
||||||
|
#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4)
|
||||||
|
#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NVTX payload enumeration type attributes.
|
||||||
|
*/
|
||||||
|
typedef struct nvtxPayloadEnumAttr_t {
|
||||||
|
/**
|
||||||
|
* Mask of valid fields in this struct.
|
||||||
|
* The values from `enum nvtxPayloadSchemaAttributes` have to be used.
|
||||||
|
*/
|
||||||
|
uint64_t fieldMask;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Name of the enum. (Optional)
|
||||||
|
*/
|
||||||
|
const char* name;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Entries of the enum. (Mandatory)
|
||||||
|
*/
|
||||||
|
const nvtxPayloadEnum_t* entries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of entries in the enum. (Mandatory)
|
||||||
|
*/
|
||||||
|
size_t numEntries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Size of enumeration type in bytes
|
||||||
|
*/
|
||||||
|
size_t sizeOfEnum;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Static/custom schema ID must be
|
||||||
|
* >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
|
||||||
|
* < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START
|
||||||
|
*/
|
||||||
|
uint64_t schemaId;
|
||||||
|
} nvtxPayloadEnumAttr_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Register an enumeration type with the payload extension.
|
||||||
|
*
|
||||||
|
* @param domain NVTX domain handle
|
||||||
|
* @param attr NVTX payload enumeration type attributes.
|
||||||
|
*/
|
||||||
|
NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain,
|
||||||
|
const nvtxPayloadEnumAttr_t* attr);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Callback Ids of API functions in the payload extension.
|
||||||
|
*
|
||||||
|
* The NVTX handler can use these values to register a handler function. When
|
||||||
|
* InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is
|
||||||
|
* executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be
|
||||||
|
* registered as follows:
|
||||||
|
* moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] =
|
||||||
|
* (intptr_t)handlenvtxPayloadRegisterSchema;
|
||||||
|
*/
|
||||||
|
typedef enum NvtxExtPayloadCallbackId
|
||||||
|
{
|
||||||
|
NVTX3EXT_CBID_nvtxPayloadSchemaRegister = 0,
|
||||||
|
NVTX3EXT_CBID_nvtxPayloadEnumRegister = 1,
|
||||||
|
NVTX3EXT_CBID_PAYLOAD_FN_NUM = 2
|
||||||
|
} NvtxExtPayloadCallbackId;
|
||||||
|
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#pragma GCC visibility push(internal)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot include directly */
|
||||||
|
#include "nvtxExtDetail/nvtxExtTypes.h"
|
||||||
|
#undef NVTX_EXT_TYPES_GUARD
|
||||||
|
|
||||||
|
#ifndef NVTX_NO_IMPL
|
||||||
|
#define NVTX_EXT_IMPL_PAYLOAD_GUARD /* Ensure other headers cannot included directly */
|
||||||
|
#include "nvtxExtDetail/nvtxExtPayloadTypeInfo.h"
|
||||||
|
#include "nvtxExtDetail/nvtxExtImplPayload_v1.h"
|
||||||
|
#undef NVTX_EXT_IMPL_PAYLOAD_GUARD
|
||||||
|
#endif /*NVTX_NO_IMPL*/
|
||||||
|
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#pragma GCC visibility pop
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
#endif /* NVTOOLSEXT_PAYLOAD_H */
|
93
src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
Normal file
93
src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NVTX_EXT_IMPL_GUARD
|
||||||
|
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef NVTX_EXT_IMPL_H
|
||||||
|
#define NVTX_EXT_IMPL_H
|
||||||
|
/* ---- Include required platform headers ---- */
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
|
||||||
|
#include <Windows.h>
|
||||||
|
|
||||||
|
#else
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#if defined(__ANDROID__)
|
||||||
|
#include <android/api-level.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) || defined(__CYGWIN__)
|
||||||
|
#include <sched.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <limits.h>
|
||||||
|
#include <dlfcn.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <wchar.h>
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* ---- Define macros used in this file ---- */
|
||||||
|
|
||||||
|
#ifdef NVTX_DEBUG_PRINT
|
||||||
|
#ifdef __ANDROID__
|
||||||
|
#include <android/log.h>
|
||||||
|
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
|
||||||
|
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
|
||||||
|
#else
|
||||||
|
#include <stdio.h>
|
||||||
|
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
|
||||||
|
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
|
||||||
|
#endif
|
||||||
|
#else /* !defined(NVTX_DEBUG_PRINT) */
|
||||||
|
#define NVTX_ERR(...)
|
||||||
|
#define NVTX_INFO(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
// #ifdef __GNUC__
|
||||||
|
// #pragma GCC visibility push(hidden)
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
#define NVTX_EXTENSION_FRESH 0
|
||||||
|
#define NVTX_EXTENSION_DISABLED 1
|
||||||
|
#define NVTX_EXTENSION_STARTING 2
|
||||||
|
#define NVTX_EXTENSION_LOADED 3
|
||||||
|
|
||||||
|
NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0;
|
||||||
|
|
||||||
|
#define NVTX_EXT_INIT_GUARD
|
||||||
|
#include "nvtxExtInit.h"
|
||||||
|
#undef NVTX_EXT_INIT_GUARD
|
||||||
|
|
||||||
|
// #ifdef __GNUC__
|
||||||
|
// #pragma GCC visibility pop
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} /* extern "C" */
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
#endif /* NVTX_EXT_IMPL_H */
|
85
src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
Normal file
85
src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
|
||||||
|
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define NVTX_EXT_IMPL_GUARD
|
||||||
|
#include "nvtxExtImpl.h"
|
||||||
|
#undef NVTX_EXT_IMPL_GUARD
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
|
||||||
|
NAME##_v##VERSION##_mem##COMPATID
|
||||||
|
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
|
||||||
|
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
|
||||||
|
#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
|
||||||
|
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Function slots for the binary payload extension. First entry is the module
|
||||||
|
* state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
|
||||||
|
*/
|
||||||
|
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
|
||||||
|
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1]
|
||||||
|
= {0};
|
||||||
|
|
||||||
|
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
|
||||||
|
{
|
||||||
|
nvtxExtModuleSegment_t segment = {
|
||||||
|
0, // unused (only one segment)
|
||||||
|
NVTX3EXT_CBID_PAYLOAD_FN_NUM,
|
||||||
|
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1
|
||||||
|
};
|
||||||
|
|
||||||
|
nvtxExtModuleInfo_t module = {
|
||||||
|
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
|
||||||
|
NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD,
|
||||||
|
1, &segment, // number of segments, segments
|
||||||
|
NULL, // no export function needed
|
||||||
|
// bake type sizes and alignment information into program binary
|
||||||
|
&nvtxExtPayloadTypeInfo
|
||||||
|
};
|
||||||
|
|
||||||
|
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||||
|
|
||||||
|
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
|
||||||
|
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
|
||||||
|
}
|
||||||
|
|
||||||
|
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
|
||||||
|
typedef ret_val ( * fn_name##_impl_fntype )signature; \
|
||||||
|
NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \
|
||||||
|
intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||||
|
if (slot != NVTX_EXTENSION_DISABLED) { \
|
||||||
|
if (slot) { \
|
||||||
|
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||||
|
} else { \
|
||||||
|
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
|
||||||
|
slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||||
|
if (slot != NVTX_EXTENSION_DISABLED && slot) { \
|
||||||
|
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
return ((ret_val)(intptr_t)-1); \
|
||||||
|
}
|
||||||
|
|
||||||
|
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr))
|
||||||
|
|
||||||
|
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr))
|
||||||
|
|
||||||
|
#undef NVTX_EXT_FN_IMPL
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} /* extern "C" */
|
||||||
|
#endif /* __cplusplus */
|
363
src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h
Normal file
363
src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h
Normal file
@ -0,0 +1,363 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NVTX_EXT_INIT_GUARD
|
||||||
|
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
/* ---- Platform-independent helper definitions and functions ---- */
|
||||||
|
|
||||||
|
/* Prefer macros over inline functions to reduce symbol resolution at link time */
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define NVTX_PATHCHAR wchar_t
|
||||||
|
#define NVTX_STR(x) L##x
|
||||||
|
#define NVTX_GETENV _wgetenv
|
||||||
|
#define NVTX_BUFSIZE MAX_PATH
|
||||||
|
#define NVTX_DLLHANDLE HMODULE
|
||||||
|
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
|
||||||
|
#define NVTX_DLLFUNC GetProcAddress
|
||||||
|
#define NVTX_DLLCLOSE FreeLibrary
|
||||||
|
#define NVTX_YIELD() SwitchToThread()
|
||||||
|
#define NVTX_MEMBAR() MemoryBarrier()
|
||||||
|
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
|
||||||
|
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
|
||||||
|
#define NVTX_ATOMIC_WRITE_PTR(address, value) InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value)
|
||||||
|
#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand)
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
#define NVTX_PATHCHAR char
|
||||||
|
#define NVTX_STR(x) x
|
||||||
|
#define NVTX_GETENV getenv
|
||||||
|
#define NVTX_BUFSIZE PATH_MAX
|
||||||
|
#define NVTX_DLLHANDLE void*
|
||||||
|
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
|
||||||
|
#define NVTX_DLLFUNC dlsym
|
||||||
|
#define NVTX_DLLCLOSE dlclose
|
||||||
|
#define NVTX_YIELD() sched_yield()
|
||||||
|
#define NVTX_MEMBAR() __sync_synchronize()
|
||||||
|
/* Ensure full memory barrier for atomics, to match Windows functions */
|
||||||
|
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
|
||||||
|
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
|
||||||
|
#define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
|
||||||
|
#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
|
||||||
|
#else
|
||||||
|
#error The library does not support your configuration!
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */
|
||||||
|
#if defined(_WIN32)
|
||||||
|
/* TODO */
|
||||||
|
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
|
||||||
|
#else
|
||||||
|
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Define this to 1 for platforms that support environment variables */
|
||||||
|
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
|
||||||
|
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
|
||||||
|
#define NVTX_SUPPORT_ENV_VARS 1
|
||||||
|
|
||||||
|
/* Define this to 1 for platforms that support dynamic/shared libraries */
|
||||||
|
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
|
||||||
|
|
||||||
|
/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked,
|
||||||
|
* and this will override any dynamic injection. Useful for platforms where dynamic
|
||||||
|
* injection is not available. Since weak symbols not explicitly marked extern are
|
||||||
|
* guaranteed to be initialized to zero if no definitions are found by the linker, the
|
||||||
|
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
|
||||||
|
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
|
||||||
|
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
|
||||||
|
/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
|
||||||
|
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which
|
||||||
|
* does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic
|
||||||
|
* injection library. */
|
||||||
|
__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
|
||||||
|
#else
|
||||||
|
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* This function tries to find or load an NVTX injection library and get the
|
||||||
|
* address of its InitializeInjectionExtension function. If such a function pointer
|
||||||
|
* is found, it is called, and passed the address of this NVTX instance's
|
||||||
|
* nvtxGetExportTable function, so the injection can attach to this instance.
|
||||||
|
* If the initialization fails for any reason, any dynamic library loaded will
|
||||||
|
* be freed, and all NVTX implementation functions will be set to no-ops. If
|
||||||
|
* initialization succeeds, NVTX functions not attached to the tool will be set
|
||||||
|
* to no-ops. This is implemented as one function instead of several small
|
||||||
|
* functions to minimize the number of weak symbols the linker must resolve.
|
||||||
|
* Order of search is:
|
||||||
|
* - Pre-injected library exporting InitializeInjectionNvtxExtension
|
||||||
|
* - Loadable library exporting InitializeInjectionNvtxExtension
|
||||||
|
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
|
||||||
|
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
|
||||||
|
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
|
||||||
|
*/
|
||||||
|
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
|
||||||
|
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
|
||||||
|
{
|
||||||
|
const char* const initFuncName = "InitializeInjectionNvtxExtension";
|
||||||
|
NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
|
||||||
|
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
|
||||||
|
|
||||||
|
if(out_init_fnptr){
|
||||||
|
*out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
|
||||||
|
/* Use POSIX global symbol chain to query for init function from any module */
|
||||||
|
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
|
||||||
|
/* Try discovering dynamic injection library to load */
|
||||||
|
if (!init_fnptr)
|
||||||
|
{
|
||||||
|
#if NVTX_SUPPORT_ENV_VARS
|
||||||
|
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
|
||||||
|
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
|
||||||
|
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
|
||||||
|
? NVTX_STR("NVTX_INJECTION32_PATH")
|
||||||
|
: NVTX_STR("NVTX_INJECTION64_PATH");
|
||||||
|
#endif /* NVTX_SUPPORT_ENV_VARS */
|
||||||
|
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
|
||||||
|
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
|
||||||
|
|
||||||
|
/* Refer to this variable explicitly in case all references to it are #if'ed out */
|
||||||
|
(void)injectionLibraryPathBuf;
|
||||||
|
|
||||||
|
#if NVTX_SUPPORT_ENV_VARS
|
||||||
|
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
|
||||||
|
* these functions are not called again before using the returned value. */
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#pragma warning( push )
|
||||||
|
#pragma warning( disable : 4996 )
|
||||||
|
#endif
|
||||||
|
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#pragma warning( pop )
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ANDROID__)
|
||||||
|
if (!injectionLibraryPath)
|
||||||
|
{
|
||||||
|
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
|
||||||
|
char cmdlineBuf[32];
|
||||||
|
char pkgName[PATH_MAX];
|
||||||
|
int count;
|
||||||
|
int pid;
|
||||||
|
FILE *fp;
|
||||||
|
size_t bytesRead;
|
||||||
|
size_t pos;
|
||||||
|
|
||||||
|
pid = (int)getpid();
|
||||||
|
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
|
||||||
|
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
|
||||||
|
{
|
||||||
|
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
|
||||||
|
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
fp = fopen(cmdlineBuf, "r");
|
||||||
|
if (!fp)
|
||||||
|
{
|
||||||
|
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
|
||||||
|
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
|
||||||
|
fclose(fp);
|
||||||
|
if (bytesRead == 0)
|
||||||
|
{
|
||||||
|
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
|
||||||
|
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
pkgName[bytesRead] = 0;
|
||||||
|
|
||||||
|
/* String can contain colon as a process separator. In this case the package name is before the colon. */
|
||||||
|
pos = 0;
|
||||||
|
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
|
||||||
|
{
|
||||||
|
++pos;
|
||||||
|
}
|
||||||
|
pkgName[pos] = 0;
|
||||||
|
|
||||||
|
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
|
||||||
|
if (count <= 0 || count >= NVTX_BUFSIZE)
|
||||||
|
{
|
||||||
|
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
|
||||||
|
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* On Android, verify path is accessible due to aggressive file access restrictions. */
|
||||||
|
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
|
||||||
|
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
|
||||||
|
if (injectionLibraryPathBuf[0] == '/')
|
||||||
|
{
|
||||||
|
#if (__ANDROID_API__ < 21)
|
||||||
|
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
|
||||||
|
#else
|
||||||
|
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
|
||||||
|
#endif
|
||||||
|
if (access_err != 0)
|
||||||
|
{
|
||||||
|
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
|
||||||
|
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
injectionLibraryPath = injectionLibraryPathBuf;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* At this point, injectionLibraryPath is specified if a dynamic
|
||||||
|
* injection library was specified by a tool. */
|
||||||
|
if (injectionLibraryPath)
|
||||||
|
{
|
||||||
|
/* Load the injection library */
|
||||||
|
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
|
||||||
|
if (!injectionLibraryHandle)
|
||||||
|
{
|
||||||
|
NVTX_ERR("Failed to load injection library\n");
|
||||||
|
return NVTX_ERR_INIT_LOAD_LIBRARY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Attempt to get the injection library's entry-point */
|
||||||
|
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
|
||||||
|
if (!init_fnptr)
|
||||||
|
{
|
||||||
|
NVTX_DLLCLOSE(injectionLibraryHandle);
|
||||||
|
NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName);
|
||||||
|
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
|
||||||
|
if (!init_fnptr)
|
||||||
|
{
|
||||||
|
/* Check weakly-defined function pointer. A statically-linked injection can define this as
|
||||||
|
* a normal symbol and it will take precedence over a dynamic injection. */
|
||||||
|
if (InitializeInjectionNvtxExtension_fnptr)
|
||||||
|
{
|
||||||
|
init_fnptr = InitializeInjectionNvtxExtension_fnptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(out_init_fnptr){
|
||||||
|
*out_init_fnptr = init_fnptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* At this point, if init_fnptr is not set, then no tool has specified
|
||||||
|
* an NVTX injection library -- return non-success result so all NVTX
|
||||||
|
* API functions will be set to no-ops. */
|
||||||
|
if (!init_fnptr)
|
||||||
|
{
|
||||||
|
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NVTX_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
|
||||||
|
nvtxExtModuleInfo_t* moduleInfo,
|
||||||
|
intptr_t* moduleState
|
||||||
|
)
|
||||||
|
{
|
||||||
|
intptr_t old;
|
||||||
|
|
||||||
|
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||||
|
|
||||||
|
if( *moduleState == NVTX_EXTENSION_LOADED) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
NVTX_ATOMIC_CAS_PTR(
|
||||||
|
old,
|
||||||
|
moduleState,
|
||||||
|
NVTX_EXTENSION_STARTING,
|
||||||
|
NVTX_EXTENSION_FRESH);
|
||||||
|
if (old == NVTX_EXTENSION_FRESH)
|
||||||
|
{
|
||||||
|
NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr);
|
||||||
|
int entryPointStatus = 0;
|
||||||
|
int forceAllToNoops = 0;
|
||||||
|
|
||||||
|
/* Load & initialize injection library -- it will assign the function pointers */
|
||||||
|
if(init_fnptr == 0){
|
||||||
|
int result = 0;
|
||||||
|
|
||||||
|
/* try to load vanilla NVTX first*/
|
||||||
|
nvtxInitialize(0);
|
||||||
|
|
||||||
|
result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
|
||||||
|
/*at this point init_fnptr will be either 0 or a real function*/
|
||||||
|
|
||||||
|
if(result == NVTX_SUCCESS) {
|
||||||
|
NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
NVTX_ERR("Failed to load injection library\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(init_fnptr != 0) {
|
||||||
|
/* Invoke injection library's initialization function. If it returns
|
||||||
|
* 0 (failure) and a dynamic injection was loaded, unload it. */
|
||||||
|
entryPointStatus = init_fnptr(moduleInfo);
|
||||||
|
if (entryPointStatus == 0) {
|
||||||
|
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Clean up any functions that are still uninitialized so that they are skipped.
|
||||||
|
* Set all to null if injection init function failed as well.
|
||||||
|
*/
|
||||||
|
forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
|
||||||
|
for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){
|
||||||
|
nvtxExtModuleSegment_t* segment = moduleInfo->segments+s;
|
||||||
|
for(size_t i = 0; i < segment->slotCount; ++i){
|
||||||
|
if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){
|
||||||
|
segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
NVTX_MEMBAR();
|
||||||
|
|
||||||
|
/* Signal that initialization has finished, so now the assigned function pointers will be used */
|
||||||
|
NVTX_ATOMIC_WRITE_PTR(
|
||||||
|
moduleState,
|
||||||
|
NVTX_EXTENSION_LOADED);
|
||||||
|
}
|
||||||
|
else /* Spin-wait until initialization has finished */
|
||||||
|
{
|
||||||
|
NVTX_MEMBAR();
|
||||||
|
while (*moduleState != NVTX_EXTENSION_LOADED)
|
||||||
|
{
|
||||||
|
NVTX_YIELD();
|
||||||
|
NVTX_MEMBAR();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif /* __cplusplus */
|
128
src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
Normal file
128
src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
|
||||||
|
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Helper array to get the alignment for each predefined C language type.
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef void* pointer_type;
|
||||||
|
|
||||||
|
#if __STDC_VERSION__ >= 201112L /* or CPP11 */
|
||||||
|
#include <stdalign.h>
|
||||||
|
#define nvtx_alignof(type) alignof(type)
|
||||||
|
#define nvtx_alignof2(type,tname) alignof(type)
|
||||||
|
#else /* __STDC_VERSION__ >= 201112L */
|
||||||
|
#ifndef __cplusplus
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
|
||||||
|
#define nvtx_alignof2(type,tname) nvtx_alignof(type)
|
||||||
|
|
||||||
|
#else /* __cplusplus */
|
||||||
|
|
||||||
|
#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
|
||||||
|
#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
|
||||||
|
#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
|
||||||
|
#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
|
||||||
|
|
||||||
|
MKTYPEDEF(char);
|
||||||
|
MKTYPEDEF2(unsigned char, uchar);
|
||||||
|
MKTYPEDEF(short);
|
||||||
|
MKTYPEDEF2(unsigned short, ushort);
|
||||||
|
MKTYPEDEF(int);
|
||||||
|
MKTYPEDEF2(unsigned int, uint);
|
||||||
|
MKTYPEDEF(long);
|
||||||
|
MKTYPEDEF2(unsigned long, ulong);
|
||||||
|
MKTYPEDEF2(long long, longlong);
|
||||||
|
MKTYPEDEF2(unsigned long long, ulonglong);
|
||||||
|
|
||||||
|
MKTYPEDEF(int8_t);
|
||||||
|
MKTYPEDEF(uint8_t);
|
||||||
|
MKTYPEDEF(int16_t);
|
||||||
|
MKTYPEDEF(uint16_t);
|
||||||
|
MKTYPEDEF(int32_t);
|
||||||
|
MKTYPEDEF(uint32_t);
|
||||||
|
MKTYPEDEF(int64_t);
|
||||||
|
MKTYPEDEF(uint64_t);
|
||||||
|
|
||||||
|
MKTYPEDEF(float);
|
||||||
|
MKTYPEDEF(double);
|
||||||
|
MKTYPEDEF2(long double, longdouble);
|
||||||
|
|
||||||
|
MKTYPEDEF(size_t);
|
||||||
|
MKTYPEDEF(pointer_type);
|
||||||
|
|
||||||
|
MKTYPEDEF(wchar_t);
|
||||||
|
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
|
||||||
|
{sizeof(char8_t), nvtx_alignof(char8_t)},
|
||||||
|
MKTYPEDEF(char8_t);
|
||||||
|
#endif
|
||||||
|
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
|
||||||
|
MKTYPEDEF(char16_t);
|
||||||
|
MKTYPEDEF(char32_t);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef MKTYPEDEF
|
||||||
|
#undef MKTYPEDEF2
|
||||||
|
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
#endif /* __STDC_VERSION__ >= 201112L */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
|
||||||
|
*/
|
||||||
|
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
|
||||||
|
{
|
||||||
|
/* The first entry contains this array's length and the size of each entry in this array. */
|
||||||
|
{NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
|
||||||
|
|
||||||
|
/*** C integer types ***/
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */ {sizeof(char), nvtx_alignof(char)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */ {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */ {sizeof(short), nvtx_alignof(short)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_INT */ {sizeof(int), nvtx_alignof(int)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT */ {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_LONG */ {sizeof(long), nvtx_alignof(long)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */ {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */ {sizeof(long long), nvtx_alignof2(long long, longlong)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)},
|
||||||
|
|
||||||
|
/*** Integer types with explicit size ***/
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */ {sizeof(int8_t), nvtx_alignof(int8_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */ {sizeof(uint8_t), nvtx_alignof(uint8_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */ {sizeof(int16_t), nvtx_alignof(int16_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */ {sizeof(int32_t), nvtx_alignof(int32_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */ {sizeof(int64_t), nvtx_alignof(int64_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)},
|
||||||
|
|
||||||
|
/*** C floating point types ***/
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */ {sizeof(float), nvtx_alignof(float)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */ {sizeof(double), nvtx_alignof(double)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},
|
||||||
|
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)},
|
||||||
|
|
||||||
|
/*** Special character types ***/
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
|
||||||
|
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
|
||||||
|
{sizeof(char8_t), nvtx_alignof(char8_t)},
|
||||||
|
#else
|
||||||
|
{0, 0},
|
||||||
|
#endif
|
||||||
|
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
|
||||||
|
#else
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0},
|
||||||
|
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0}
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
#undef nvtx_alignof
|
||||||
|
#undef nvtx_alignof2
|
44
src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h
Normal file
44
src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* This header defines types which are used by the internal implementation
|
||||||
|
* of NVTX and callback subscribers. API clients do not use these types,
|
||||||
|
* so they are defined here instead of in nvToolsExt.h to clarify they are
|
||||||
|
* not part of the NVTX client API. */
|
||||||
|
|
||||||
|
#ifndef NVTXEXTTYPES_H
|
||||||
|
#define NVTXEXTTYPES_H
|
||||||
|
|
||||||
|
#ifndef NVTX_EXT_TYPES_GUARD
|
||||||
|
#error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
|
||||||
|
|
||||||
|
typedef struct nvtxExtModuleSegment_t
|
||||||
|
{
|
||||||
|
size_t segmentId;
|
||||||
|
size_t slotCount;
|
||||||
|
intptr_t* functionSlots;
|
||||||
|
} nvtxExtModuleSegment_t;
|
||||||
|
|
||||||
|
typedef struct nvtxExtModuleInfo_t
|
||||||
|
{
|
||||||
|
uint16_t nvtxVer;
|
||||||
|
uint16_t structSize;
|
||||||
|
uint16_t moduleId;
|
||||||
|
uint16_t compatId;
|
||||||
|
size_t segmentsCount;
|
||||||
|
nvtxExtModuleSegment_t* segments;
|
||||||
|
NvtxExtGetExportFunction_t getExportFunction;
|
||||||
|
const void* extInfo;
|
||||||
|
} nvtxExtModuleInfo_t;
|
||||||
|
|
||||||
|
typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
|
||||||
|
|
||||||
|
#endif /* NVTXEXTTYPES_H */
|
@ -11,6 +11,7 @@
|
|||||||
#include "info.h"
|
#include "info.h"
|
||||||
#include "socket.h"
|
#include "socket.h"
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
|
#include "shm.h"
|
||||||
|
|
||||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||||
|
|
||||||
@ -106,6 +107,7 @@ struct ncclProxyOpsPool {
|
|||||||
|
|
||||||
struct ncclProxyOps {
|
struct ncclProxyOps {
|
||||||
ncclProxyOpsPool* pool;
|
ncclProxyOpsPool* pool;
|
||||||
|
ncclShmHandle_t handle;
|
||||||
int count;
|
int count;
|
||||||
int freeOp;
|
int freeOp;
|
||||||
int nextOps;
|
int nextOps;
|
||||||
@ -145,6 +147,7 @@ struct ncclProxyPool;
|
|||||||
struct ncclProxyProgressState {
|
struct ncclProxyProgressState {
|
||||||
// Used by main threads to send work to progress thread
|
// Used by main threads to send work to progress thread
|
||||||
struct ncclProxyOpsPool* opsPool;
|
struct ncclProxyOpsPool* opsPool;
|
||||||
|
ncclShmHandle_t handle;
|
||||||
char opsPoolShmSuffix[6];
|
char opsPoolShmSuffix[6];
|
||||||
|
|
||||||
pthread_t thread;
|
pthread_t thread;
|
||||||
@ -164,7 +167,6 @@ struct ncclProxyState {
|
|||||||
struct ncclSocket* listenSock;
|
struct ncclSocket* listenSock;
|
||||||
int stop;
|
int stop;
|
||||||
CUcontext cudaCtx;
|
CUcontext cudaCtx;
|
||||||
int safeAbortFlag;
|
|
||||||
|
|
||||||
// Used by main thread
|
// Used by main thread
|
||||||
union ncclSocketAddress* peerAddresses;
|
union ncclSocketAddress* peerAddresses;
|
||||||
@ -176,6 +178,15 @@ struct ncclProxyState {
|
|||||||
struct ncclProxyProgressState progressState;
|
struct ncclProxyProgressState progressState;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum proxyConnectState {
|
||||||
|
connUninitialized = 0,
|
||||||
|
connInitialized = 1,
|
||||||
|
connSharedInitialized = 2,
|
||||||
|
connSetupDone = 3,
|
||||||
|
connConnected = 4,
|
||||||
|
numConnStates = 5
|
||||||
|
};
|
||||||
|
|
||||||
struct ncclProxyConnection {
|
struct ncclProxyConnection {
|
||||||
int send, transport, shared;
|
int send, transport, shared;
|
||||||
int localRank;
|
int localRank;
|
||||||
@ -184,7 +195,7 @@ struct ncclProxyConnection {
|
|||||||
struct ncclProxyArgs *proxyAppend;
|
struct ncclProxyArgs *proxyAppend;
|
||||||
struct ncclProxyArgs **proxyAppendPtr;
|
struct ncclProxyArgs **proxyAppendPtr;
|
||||||
void* transportResources;
|
void* transportResources;
|
||||||
bool initFlag;
|
proxyConnectState state;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
|
||||||
ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create);
|
typedef void* ncclShmHandle_t;
|
||||||
ncclResult_t ncclShmUnlink(const char* shmname);
|
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
|
||||||
ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize);
|
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
|
||||||
|
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
||||||
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
||||||
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
|
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
|
||||||
|
#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
|
||||||
|
|
||||||
/* Common socket address storage structure for IPv4/IPv6 */
|
/* Common socket address storage structure for IPv4/IPv6 */
|
||||||
union ncclSocketAddress {
|
union ncclSocketAddress {
|
||||||
@ -30,32 +31,59 @@ union ncclSocketAddress {
|
|||||||
};
|
};
|
||||||
|
|
||||||
enum ncclSocketState {
|
enum ncclSocketState {
|
||||||
ncclSocketConnecting = 0,
|
ncclSocketStateNone = 0,
|
||||||
ncclSocketConnected = 1,
|
ncclSocketStateInitialized = 1,
|
||||||
ncclSocketError = 2,
|
ncclSocketStateAccepting = 2,
|
||||||
ncclSocketStateNum = 3
|
ncclSocketStateAccepted = 3,
|
||||||
} ;
|
ncclSocketStateConnecting = 4,
|
||||||
|
ncclSocketStateConnectPolling = 5,
|
||||||
|
ncclSocketStateConnected = 6,
|
||||||
|
ncclSocketStateReady = 7,
|
||||||
|
ncclSocketStateClosed = 8,
|
||||||
|
ncclSocketStateError = 9,
|
||||||
|
ncclSocketStateNum = 10
|
||||||
|
};
|
||||||
|
|
||||||
|
enum ncclSocketType {
|
||||||
|
ncclSocketTypeUnknown = 0,
|
||||||
|
ncclSocketTypeBootstrap = 1,
|
||||||
|
ncclSocketTypeProxy = 2,
|
||||||
|
ncclSocketTypeNetSocket = 3,
|
||||||
|
ncclSocketTypeNetIb = 4
|
||||||
|
};
|
||||||
|
|
||||||
struct ncclSocket {
|
struct ncclSocket {
|
||||||
int fd;
|
int fd;
|
||||||
|
int acceptFd;
|
||||||
|
int timedOutRetries;
|
||||||
|
int refusedRetries;
|
||||||
union ncclSocketAddress addr;
|
union ncclSocketAddress addr;
|
||||||
volatile uint32_t* abortFlag;
|
volatile uint32_t* abortFlag;
|
||||||
int asyncFlag;
|
int asyncFlag;
|
||||||
enum ncclSocketState state;
|
enum ncclSocketState state;
|
||||||
|
int salen;
|
||||||
|
uint64_t magic;
|
||||||
|
enum ncclSocketType type;
|
||||||
};
|
};
|
||||||
|
|
||||||
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
||||||
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
||||||
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||||
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||||
|
|
||||||
|
// Initialize a socket
|
||||||
|
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
|
||||||
// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
|
// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
|
||||||
ncclResult_t ncclSocketListen(struct ncclSocket* sock);
|
ncclResult_t ncclSocketListen(struct ncclSocket* sock);
|
||||||
|
ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
|
||||||
// Connect to sock->addr. sock->fd is set after a successful call.
|
// Connect to sock->addr. sock->fd is set after a successful call.
|
||||||
ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
|
ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
|
||||||
// Return socket connection state.
|
// Return socket connection state.
|
||||||
ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state);
|
ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
|
||||||
// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
|
// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
|
||||||
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket);
|
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
|
||||||
|
ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
|
||||||
|
ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
|
||||||
|
|
||||||
#define NCCL_SOCKET_SEND 0
|
#define NCCL_SOCKET_SEND 0
|
||||||
#define NCCL_SOCKET_RECV 1
|
#define NCCL_SOCKET_RECV 1
|
||||||
@ -65,6 +93,5 @@ ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size
|
|||||||
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
|
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
|
||||||
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
|
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
|
||||||
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
|
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
|
||||||
/* initialize a socket. */
|
ncclResult_t ncclSocketClose(struct ncclSocket* sock);
|
||||||
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -98,16 +98,19 @@ ncclResult_t ncclStrongStreamLaunchKernel(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
|
// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
|
||||||
|
// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
|
||||||
|
// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
|
||||||
|
// implementation to induce few graph dependencies.
|
||||||
ncclResult_t ncclStrongStreamWaitStream(
|
ncclResult_t ncclStrongStreamWaitStream(
|
||||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
|
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
|
||||||
);
|
);
|
||||||
// `b` must be capturing within `graph`.
|
// `b` must be capturing within `graph`.
|
||||||
ncclResult_t ncclStrongStreamWaitStream(
|
ncclResult_t ncclStrongStreamWaitStream(
|
||||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
|
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
|
||||||
);
|
);
|
||||||
// `a` must be capturing within `graph`.
|
// `a` must be capturing within `graph`.
|
||||||
ncclResult_t ncclStrongStreamWaitStream(
|
ncclResult_t ncclStrongStreamWaitStream(
|
||||||
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
|
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
|
||||||
);
|
);
|
||||||
|
|
||||||
// Synchrnoization does not need the strong stream to be acquired.
|
// Synchrnoization does not need the strong stream to be acquired.
|
||||||
|
@ -27,6 +27,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
|||||||
uint64_t getHash(const char* string, int n);
|
uint64_t getHash(const char* string, int n);
|
||||||
uint64_t getHostHash();
|
uint64_t getHostHash();
|
||||||
uint64_t getPidHash();
|
uint64_t getPidHash();
|
||||||
|
ncclResult_t getRandomData(void* buffer, size_t bytes);
|
||||||
|
|
||||||
struct netIf {
|
struct netIf {
|
||||||
char prefix[64];
|
char prefix[64];
|
||||||
@ -48,6 +49,19 @@ inline uint64_t clockNano() {
|
|||||||
return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
|
return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
|
||||||
|
* return -1 */
|
||||||
|
inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
|
if (bytes > 0) {
|
||||||
|
const size_t one = 1UL;
|
||||||
|
FILE* fp = fopen("/dev/urandom", "r");
|
||||||
|
if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError;
|
||||||
|
if (fp) fclose(fp);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<typename Int>
|
template<typename Int>
|
||||||
|
642
src/init.cc
642
src/init.cc
File diff suppressed because it is too large
Load Diff
26
src/init_nvtx.cc
Normal file
26
src/init_nvtx.cc
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
#include "nccl.h"
|
||||||
|
#include "nvtx.h"
|
||||||
|
|
||||||
|
static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
|
||||||
|
{"Sum", ncclSum},
|
||||||
|
{"Product", ncclProd},
|
||||||
|
{"Max", ncclMax},
|
||||||
|
{"Min", ncclMin},
|
||||||
|
{"Avg", ncclAvg}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Must be called before the first call to any reduction operation.
|
||||||
|
void initNvtxRegisteredEnums() {
|
||||||
|
// Register schemas and strings
|
||||||
|
constexpr const nvtxPayloadEnumAttr_t eAttr {
|
||||||
|
.fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES |
|
||||||
|
NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID,
|
||||||
|
.name = NULL,
|
||||||
|
.entries = NvtxEnumRedSchema,
|
||||||
|
.numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
|
||||||
|
.sizeOfEnum = sizeof(ncclRedOp_t),
|
||||||
|
.schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP
|
||||||
|
};
|
||||||
|
|
||||||
|
nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);
|
||||||
|
}
|
@ -87,7 +87,7 @@ static void initOnceFunc() {
|
|||||||
|
|
||||||
cudaLib = dlopen(path, RTLD_LAZY);
|
cudaLib = dlopen(path, RTLD_LAZY);
|
||||||
if (cudaLib == NULL) {
|
if (cudaLib == NULL) {
|
||||||
WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", ncclCudaPath, ncclCudaPath);
|
WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror());
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,79 +15,152 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
// Change functions behavior to match other SYS functions
|
struct shmHandleInternal {
|
||||||
static int shm_allocate(int fd, const int shmSize) {
|
int fd;
|
||||||
int err = posix_fallocate(fd, 0, shmSize);
|
char* shmPath;
|
||||||
if (err) { errno = err; return -1; }
|
char* shmPtr;
|
||||||
return 0;
|
void* devShmPtr;
|
||||||
}
|
size_t shmSize;
|
||||||
static int shm_map(int fd, const int shmSize, void** ptr) {
|
size_t realShmSize;
|
||||||
*ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
int* refcount;
|
||||||
return (*ptr == MAP_FAILED) ? -1 : 0;
|
};
|
||||||
|
|
||||||
|
static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmSize, char* hptr, void* dptr, bool create, struct shmHandleInternal* handle) {
|
||||||
|
handle->fd = fd;
|
||||||
|
handle->shmPtr = hptr;
|
||||||
|
handle->devShmPtr = dptr;
|
||||||
|
handle->shmSize = shmSize;
|
||||||
|
handle->realShmSize = realShmSize;
|
||||||
|
handle->refcount = (int*)(hptr + shmSize);
|
||||||
|
if (create) {
|
||||||
|
int slen = strlen(shmPath);
|
||||||
|
handle->shmPath = (char*)malloc(slen + 1);
|
||||||
|
memcpy(handle->shmPath, shmPath, slen + 1);
|
||||||
|
if (hptr) memset(hptr, 0, shmSize);
|
||||||
|
} else {
|
||||||
|
handle->shmPath = NULL;
|
||||||
|
}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) {
|
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
|
||||||
|
int fd = -1;
|
||||||
|
char* hptr = NULL;
|
||||||
|
void* dptr = NULL;
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
|
struct shmHandleInternal* tmphandle;
|
||||||
|
bool create = refcount > 0 ? true : false;
|
||||||
|
const size_t refSize = sizeof(int); /* extra sizeof(int) bytes for reference count */
|
||||||
|
const size_t realShmSize = shmSize + refSize;
|
||||||
|
|
||||||
|
*handle = *shmPtr = NULL; /* assume shmPtr and handle always set correctly by users. */
|
||||||
|
EQCHECKGOTO(tmphandle = (struct shmHandleInternal*)malloc(sizeof(struct shmHandleInternal)), NULL, ret, fail);
|
||||||
if (create) {
|
if (create) {
|
||||||
|
/* refcount > 0 means the caller tries to allocate a shared memory. This shared memory segment will have
|
||||||
|
* refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it
|
||||||
|
* goes down to 0, unlink should be called in order to delete shared memory file. */
|
||||||
if (shmPath[0] == '\0') {
|
if (shmPath[0] == '\0') {
|
||||||
sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
|
sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
|
||||||
*fd = mkstemp(shmPath);
|
fd = mkstemp(shmPath);
|
||||||
} else {
|
} else {
|
||||||
SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
|
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
|
||||||
}
|
}
|
||||||
if (ftruncate(*fd, shmSize) != 0) {
|
|
||||||
WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
|
if (ftruncate(fd, realShmSize) != 0) {
|
||||||
return ncclSystemError;
|
WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
|
||||||
|
ret = ncclSystemError;
|
||||||
|
goto fail;
|
||||||
}
|
}
|
||||||
INFO(NCCL_ALLOC, "Allocated %d bytes of shared memory in %s\n", shmSize, shmPath);
|
INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
|
||||||
} else {
|
} else {
|
||||||
SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
|
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
|
||||||
}
|
}
|
||||||
*ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
|
|
||||||
if (*ptr == NULL) {
|
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||||
WARN("Could not map %s\n", shmPath);
|
if (hptr == MAP_FAILED) {
|
||||||
return ncclSystemError;
|
WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
|
||||||
|
ret = ncclSystemError;
|
||||||
|
goto fail;
|
||||||
}
|
}
|
||||||
close(*fd);
|
|
||||||
*fd = -1;
|
|
||||||
if (create) memset(*ptr, 0, shmSize);
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) {
|
if (create) {
|
||||||
int fd = -1;
|
*(int*)(hptr + shmSize) = refcount;
|
||||||
void* ptr = MAP_FAILED;
|
} else {
|
||||||
ncclResult_t res = ncclSuccess;
|
int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED);
|
||||||
|
if (remref == 0) {
|
||||||
|
/* the last peer has completed attachment, it should unlink the shm mem file. */
|
||||||
|
if (unlink(shmPath) != 0) {
|
||||||
|
WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (refcount != -1) {
|
||||||
|
WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
|
|
||||||
if (devShmPtr) {
|
if (devShmPtr) {
|
||||||
CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError);
|
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
|
||||||
CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
|
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
|
||||||
}
|
}
|
||||||
|
|
||||||
*shmPtr = ptr;
|
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
|
||||||
return ncclSuccess;
|
exit:
|
||||||
sysError:
|
*shmPtr = hptr;
|
||||||
WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
|
if (devShmPtr) *devShmPtr = dptr;
|
||||||
cudaError:
|
*handle = (ncclShmHandle_t)tmphandle;
|
||||||
if (fd != -1) close(fd);
|
return ret;
|
||||||
if (create) shm_unlink(shmPath);
|
fail:
|
||||||
if (ptr != MAP_FAILED) munmap(ptr, shmSize);
|
WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize);
|
||||||
*shmPtr = NULL;
|
if (tmphandle) {
|
||||||
return res;
|
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
|
||||||
|
ncclShmClose((ncclShmHandle_t)tmphandle);
|
||||||
|
tmphandle = NULL;
|
||||||
|
}
|
||||||
|
hptr = NULL;
|
||||||
|
dptr = NULL;
|
||||||
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclShmUnlink(const char* shmPath) {
|
ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
|
||||||
if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink");
|
ncclResult_t ret = ncclSuccess;
|
||||||
return ncclSuccess;
|
struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
|
||||||
|
if (tmphandle) {
|
||||||
|
if (tmphandle->fd >= 0) {
|
||||||
|
close(tmphandle->fd);
|
||||||
|
if (tmphandle->shmPath != NULL && *tmphandle->refcount > 0) {
|
||||||
|
if (unlink(tmphandle->shmPath) != 0) {
|
||||||
|
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
|
||||||
|
ret = ncclSystemError;
|
||||||
|
}
|
||||||
|
free(tmphandle->shmPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tmphandle->shmPtr) {
|
||||||
|
if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr));
|
||||||
|
if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) {
|
||||||
|
WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno));
|
||||||
|
ret = ncclSystemError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(tmphandle);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) {
|
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
|
||||||
if (shmPtr) {
|
ncclResult_t ret = ncclSuccess;
|
||||||
if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr));
|
struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
|
||||||
if (munmap(shmPtr, shmSize) != 0) {
|
if (tmphandle) {
|
||||||
WARN("munmap of shared memory failed");
|
if (tmphandle->shmPath != NULL) {
|
||||||
return ncclSystemError;
|
if (unlink(tmphandle->shmPath) != 0) {
|
||||||
|
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
|
||||||
|
ret = ncclSystemError;
|
||||||
|
}
|
||||||
|
free(tmphandle->shmPath);
|
||||||
|
tmphandle->shmPath = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,52 @@
|
|||||||
#include <ifaddrs.h>
|
#include <ifaddrs.h>
|
||||||
#include <net/if.h>
|
#include <net/if.h>
|
||||||
|
|
||||||
|
static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
|
||||||
|
int bytes = 0;
|
||||||
|
*closed = 0;
|
||||||
|
char* data = (char*)ptr;
|
||||||
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
|
do {
|
||||||
|
if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
||||||
|
if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
|
||||||
|
if (op == NCCL_SOCKET_RECV && bytes == 0) {
|
||||||
|
*closed = 1;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
if (bytes == -1) {
|
||||||
|
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
|
||||||
|
WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||||
|
return ncclRemoteError;
|
||||||
|
} else {
|
||||||
|
bytes = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(*offset) += bytes;
|
||||||
|
if (sock->abortFlag && *sock->abortFlag != 0) {
|
||||||
|
INFO(NCCL_NET, "socketProgressOpt: abort called");
|
||||||
|
return ncclInternalError;
|
||||||
|
}
|
||||||
|
} while (bytes > 0 && (*offset) < size);
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
|
||||||
|
int closed;
|
||||||
|
NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
|
||||||
|
if (closed) {
|
||||||
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
|
WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
||||||
|
return ncclRemoteError;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
|
||||||
|
while (*offset < size)
|
||||||
|
NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
|
/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
|
||||||
*
|
*
|
||||||
* Output: "IPv4/IPv6 address<port>"
|
* Output: "IPv4/IPv6 address<port>"
|
||||||
@ -194,7 +240,7 @@ int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAd
|
|||||||
return found;
|
return found;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
|
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
|
||||||
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
|
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
|
||||||
WARN("Net : string is null");
|
WARN("Net : string is null");
|
||||||
return ncclInvalidArgument;
|
return ncclInvalidArgument;
|
||||||
@ -296,7 +342,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
|
|||||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
|
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
|
||||||
// Try to find interface that is in the same subnet as the IP in comm id
|
// Try to find interface that is in the same subnet as the IP in comm id
|
||||||
union ncclSocketAddress idAddr;
|
union ncclSocketAddress idAddr;
|
||||||
ncclGetSocketAddrFromString(&idAddr, commId);
|
ncclSocketGetAddrFromString(&idAddr, commId);
|
||||||
nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
|
nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -310,39 +356,31 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
|
|||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
||||||
/* IPv4/IPv6 support */
|
if (sock == NULL) {
|
||||||
int family = sock->addr.sa.sa_family;
|
WARN("ncclSocketListen: pass NULL socket");
|
||||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
return ncclInvalidArgument;
|
||||||
int flags;
|
}
|
||||||
|
if (sock->fd == -1) {
|
||||||
/* Create socket and bind it to a port */
|
WARN("ncclSocketListen: file descriptor is -1");
|
||||||
int fd = socket(family, SOCK_STREAM, 0);
|
return ncclInvalidArgument;
|
||||||
if (fd == -1) {
|
|
||||||
WARN("Net : Socket creation failed : %s", strerror(errno));
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (socketToPort(&sock->addr)) {
|
if (socketToPort(&sock->addr)) {
|
||||||
// Port is forced by env. Make sure we get the port.
|
// Port is forced by env. Make sure we get the port.
|
||||||
int opt = 1;
|
int opt = 1;
|
||||||
#if defined(SO_REUSEPORT)
|
#if defined(SO_REUSEPORT)
|
||||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
|
SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
|
||||||
#else
|
#else
|
||||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
|
SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The socket is set non-blocking for OS level, but asyncFlag is used to control
|
|
||||||
* blocking and non-blocking behavior in user level. */
|
|
||||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
|
||||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
|
||||||
|
|
||||||
// addr port should be 0 (Any port)
|
// addr port should be 0 (Any port)
|
||||||
SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
|
SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");
|
||||||
|
|
||||||
/* Get the assigned Port */
|
/* Get the assigned Port */
|
||||||
socklen_t size = salen;
|
socklen_t size = sock->salen;
|
||||||
SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname");
|
SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");
|
||||||
|
|
||||||
#ifdef ENABLE_TRACE
|
#ifdef ENABLE_TRACE
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
@ -352,220 +390,431 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
|||||||
/* Put the socket in listen mode
|
/* Put the socket in listen mode
|
||||||
* NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
|
* NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
|
||||||
*/
|
*/
|
||||||
SYSCHECK(listen(fd, 16384), "listen");
|
SYSCHECK(listen(sock->fd, 16384), "listen");
|
||||||
sock->fd = fd;
|
sock->state = ncclSocketStateReady;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
|
ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) {
|
||||||
|
if (sock == NULL) {
|
||||||
|
WARN("ncclSocketGetAddr: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
if (sock->state != ncclSocketStateReady) return ncclInternalError;
|
||||||
|
memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
|
||||||
|
socklen_t socklen = sizeof(union ncclSocketAddress);
|
||||||
|
sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
|
||||||
|
if (sock->fd != -1) {
|
||||||
|
sock->state = ncclSocketStateAccepted;
|
||||||
|
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||||
|
WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
|
||||||
|
uint64_t magic;
|
||||||
|
enum ncclSocketType type;
|
||||||
|
int received = 0;
|
||||||
|
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
|
||||||
|
if (received == 0) return ncclSuccess;
|
||||||
|
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
|
||||||
|
if (magic != sock->magic) {
|
||||||
|
WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
|
||||||
|
close(sock->fd);
|
||||||
|
sock->fd = -1;
|
||||||
|
// Ignore spurious connection and accept again
|
||||||
|
sock->state = ncclSocketStateAccepting;
|
||||||
|
return ncclSuccess;
|
||||||
|
} else {
|
||||||
|
received = 0;
|
||||||
|
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
|
||||||
|
if (type != sock->type) {
|
||||||
|
WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
|
||||||
|
sock->state = ncclSocketStateError;
|
||||||
|
close(sock->fd);
|
||||||
|
sock->fd = -1;
|
||||||
|
return ncclInternalError;
|
||||||
|
} else {
|
||||||
|
sock->state = ncclSocketStateReady;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
|
||||||
|
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
||||||
|
int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
|
||||||
|
|
||||||
|
if (ret == 0) {
|
||||||
|
sock->state = ncclSocketStateConnected;
|
||||||
|
return ncclSuccess;
|
||||||
|
} else if (errno == EINPROGRESS) {
|
||||||
|
sock->state = ncclSocketStateConnectPolling;
|
||||||
|
return ncclSuccess;
|
||||||
|
} else if (errno == ECONNREFUSED) {
|
||||||
|
if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
|
||||||
|
sock->state = ncclSocketStateError;
|
||||||
|
WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries);
|
||||||
|
return ncclRemoteError;
|
||||||
|
}
|
||||||
|
usleep(SLEEP_INT);
|
||||||
|
if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||||
|
return ncclSuccess;
|
||||||
|
} else if (errno == ETIMEDOUT) {
|
||||||
|
if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
|
||||||
|
sock->state = ncclSocketStateError;
|
||||||
|
WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries);
|
||||||
|
return ncclRemoteError;
|
||||||
|
}
|
||||||
|
usleep(SLEEP_INT);
|
||||||
|
return ncclSuccess;
|
||||||
|
} else {
|
||||||
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
|
sock->state = ncclSocketStateError;
|
||||||
|
WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
|
||||||
struct pollfd pfd;
|
struct pollfd pfd;
|
||||||
int timeout = 1, ret;
|
int timeout = 1, ret;
|
||||||
socklen_t rlen = sizeof(int);
|
socklen_t rlen = sizeof(int);
|
||||||
|
|
||||||
memset(&pfd, 0, sizeof(struct pollfd));
|
memset(&pfd, 0, sizeof(struct pollfd));
|
||||||
pfd.fd = fd;
|
pfd.fd = sock->fd;
|
||||||
pfd.events = POLLOUT;
|
pfd.events = POLLOUT;
|
||||||
SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
|
SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
|
||||||
if (ret == 0) {
|
if (ret == 0) return ncclSuccess;
|
||||||
ret = EINPROGRESS;
|
|
||||||
} else {
|
|
||||||
/* check socket status */
|
/* check socket status */
|
||||||
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
|
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
|
||||||
SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
|
SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
|
||||||
}
|
|
||||||
|
|
||||||
if (ret == EINPROGRESS || ret == ECONNREFUSED)
|
if (ret == 0) {
|
||||||
*state = ncclSocketConnecting;
|
sock->state = ncclSocketStateConnected;
|
||||||
else if (ret == 0)
|
} else if (ret == ECONNREFUSED) {
|
||||||
*state = ncclSocketConnected;
|
if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
|
||||||
else
|
sock->state = ncclSocketStateError;
|
||||||
*state = ncclSocketError;
|
WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
|
||||||
|
return ncclRemoteError;
|
||||||
|
}
|
||||||
|
if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||||
|
usleep(SLEEP_INT);
|
||||||
|
sock->state = ncclSocketStateConnecting;
|
||||||
|
} else if (ret == ETIMEDOUT) {
|
||||||
|
if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
|
||||||
|
sock->state = ncclSocketStateError;
|
||||||
|
WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries);
|
||||||
|
return ncclRemoteError;
|
||||||
|
}
|
||||||
|
usleep(SLEEP_INT);
|
||||||
|
sock->state = ncclSocketStateConnecting;
|
||||||
|
} else if (ret != EINPROGRESS) {
|
||||||
|
sock->state = ncclSocketStateError;
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) {
|
ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
|
||||||
NCCLCHECK(getFdState(sock->fd, state));
|
if (sock == NULL) {
|
||||||
sock->state = *state;
|
WARN("ncclSocketPollConnect: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
NCCLCHECK(socketPollConnect(sock));
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
|
||||||
|
int sent = 0;
|
||||||
|
NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
|
||||||
|
if (sent == 0) return ncclSuccess;
|
||||||
|
NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
|
||||||
|
sent = 0;
|
||||||
|
NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
|
||||||
|
sock->state = ncclSocketStateReady;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ncclResult_t socketProgressState(struct ncclSocket* sock) {
|
||||||
|
if (sock->state == ncclSocketStateAccepting) {
|
||||||
|
NCCLCHECK(socketTryAccept(sock));
|
||||||
|
}
|
||||||
|
if (sock->state == ncclSocketStateAccepted) {
|
||||||
|
NCCLCHECK(socketFinalizeAccept(sock));
|
||||||
|
}
|
||||||
|
if (sock->state == ncclSocketStateConnecting) {
|
||||||
|
NCCLCHECK(socketStartConnect(sock));
|
||||||
|
}
|
||||||
|
if (sock->state == ncclSocketStateConnectPolling) {
|
||||||
|
NCCLCHECK(socketPollConnect(sock));
|
||||||
|
}
|
||||||
|
if (sock->state == ncclSocketStateConnected) {
|
||||||
|
NCCLCHECK(socketFinalizeConnect(sock));
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
|
||||||
|
if (sock == NULL) {
|
||||||
|
*running = 0;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) {
|
||||||
|
WARN("ncclSocketReady: unexpected socket state %d", sock->state);
|
||||||
|
return ncclRemoteError;
|
||||||
|
}
|
||||||
|
*running = (sock->state == ncclSocketStateReady) ? 1 : 0;
|
||||||
|
if (*running == 0) {
|
||||||
|
NCCLCHECK(socketProgressState(sock));
|
||||||
|
*running = (sock->state == ncclSocketStateReady) ? 1 : 0;
|
||||||
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
|
ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
|
||||||
|
#ifdef ENABLE_TRACE
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
/* IPv4/IPv6 support */
|
#endif
|
||||||
int family = sock->addr.sa.sa_family;
|
const int one = 1;
|
||||||
if (family != AF_INET && family != AF_INET6) {
|
|
||||||
WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
|
if (sock == NULL) {
|
||||||
ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
|
WARN("ncclSocketConnect: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
if (sock->fd == -1) {
|
||||||
|
WARN("ncclSocketConnect: file descriptor is -1");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sock->state != ncclSocketStateInitialized) {
|
||||||
|
WARN("ncclSocketConnect: wrong socket state %d", sock->state);
|
||||||
|
if (sock->state == ncclSocketStateError) return ncclRemoteError;
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
|
||||||
int flags;
|
|
||||||
|
|
||||||
/* Connect to a hostname / port */
|
|
||||||
int fd = socket(family, SOCK_STREAM, 0);
|
|
||||||
if (fd == -1) {
|
|
||||||
WARN("Net : Socket creation failed : %s", strerror(errno));
|
|
||||||
return ncclSystemError;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int one = 1;
|
|
||||||
SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
|
||||||
|
|
||||||
/* The socket is set non-blocking for OS level, but asyncFlag is used to control
|
|
||||||
* blocking and non-blocking behavior in user level. */
|
|
||||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
|
||||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
|
||||||
|
|
||||||
/* const int bufsize = 128*1024;
|
|
||||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
|
|
||||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
|
|
||||||
|
|
||||||
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
|
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
|
||||||
|
|
||||||
int ret;
|
SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
||||||
int timedout_retries = 0;
|
|
||||||
int refused_retries = 0;
|
sock->state = ncclSocketStateConnecting;
|
||||||
retry:
|
do {
|
||||||
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
NCCLCHECK(socketProgressState(sock));
|
||||||
ret = connect(fd, &sock->addr.sa, salen);
|
} while (sock->asyncFlag == 0 &&
|
||||||
|
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||||
|
(sock->state == ncclSocketStateConnecting ||
|
||||||
|
sock->state == ncclSocketStateConnectPolling ||
|
||||||
|
sock->state == ncclSocketStateConnected));
|
||||||
|
|
||||||
if (!sock->asyncFlag) {
|
|
||||||
/* blocking socket, need retry if connect fails. */
|
|
||||||
if (errno == EINPROGRESS || errno == EAGAIN || errno == EALREADY ||
|
|
||||||
(errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
|
||||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
|
||||||
/* check abortFlag as long as we have chance to retry. */
|
|
||||||
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||||
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
|
||||||
usleep(SLEEP_INT);
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
|
switch (sock->state) {
|
||||||
* However, it can return EISCONN instead of success which indicates connection is built up in
|
case ncclSocketStateConnecting:
|
||||||
* background already. No need to call connect() again. */
|
case ncclSocketStateConnectPolling:
|
||||||
if (ret == 0 || errno == EISCONN) {
|
case ncclSocketStateConnected:
|
||||||
sock->fd = fd;
|
case ncclSocketStateReady:
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
case ncclSocketStateError:
|
||||||
} else {
|
|
||||||
sock->fd = fd;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
|
||||||
return ncclRemoteError;
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
|
|
||||||
socklen_t socklen = sizeof(union ncclSocketAddress);
|
|
||||||
struct pollfd pollfd;
|
|
||||||
int tmpFd = sock->fd = -1;
|
|
||||||
int pollret;
|
|
||||||
|
|
||||||
pollfd.fd = listenSocket->fd;
|
|
||||||
pollfd.events = POLLIN;
|
|
||||||
retry:
|
|
||||||
if ((pollret = poll(&pollfd, 1, listenSocket->asyncFlag ? 0 : 100)) < 0) {
|
|
||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
} else {
|
default:
|
||||||
tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
|
WARN("ncclSocketConnect: wrong socket state %d", sock->state);
|
||||||
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!listenSocket->asyncFlag) {
|
|
||||||
/* blocking socket, if tmpFd is still -1, we need to retry */
|
|
||||||
if (tmpFd == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
|
|
||||||
if (listenSocket->abortFlag && *listenSocket->abortFlag != 0) return ncclInternalError;
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
EQCHECK(tmpFd, -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
sock->fd = tmpFd;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) {
|
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
|
||||||
if (sock == NULL)
|
ncclResult_t ret = ncclSuccess;
|
||||||
return ncclSuccess;
|
|
||||||
|
|
||||||
|
if (listenSock == NULL || sock == NULL) {
|
||||||
|
WARN("ncclSocketAccept: pass NULL socket");
|
||||||
|
ret = ncclInvalidArgument;
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
if (listenSock->state != ncclSocketStateReady) {
|
||||||
|
WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
|
||||||
|
if (listenSock->state == ncclSocketStateError)
|
||||||
|
ret = ncclSystemError;
|
||||||
|
else
|
||||||
|
ret = ncclInternalError;
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sock->acceptFd == -1) {
|
||||||
|
memcpy(sock, listenSock, sizeof(struct ncclSocket));
|
||||||
|
sock->acceptFd = listenSock->fd;
|
||||||
|
sock->state = ncclSocketStateAccepting;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
|
||||||
|
} while (sock->asyncFlag == 0 &&
|
||||||
|
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||||
|
(sock->state == ncclSocketStateAccepting ||
|
||||||
|
sock->state == ncclSocketStateAccepted));
|
||||||
|
|
||||||
|
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||||
|
|
||||||
|
switch (sock->state) {
|
||||||
|
case ncclSocketStateAccepting:
|
||||||
|
case ncclSocketStateAccepted:
|
||||||
|
case ncclSocketStateReady:
|
||||||
|
ret = ncclSuccess;
|
||||||
|
break;
|
||||||
|
case ncclSocketStateError:
|
||||||
|
ret = ncclSystemError;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
WARN("ncclSocketAccept: wrong socket state %d", sock->state);
|
||||||
|
ret = ncclInternalError;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
exit:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
|
|
||||||
|
if (sock == NULL) goto exit;
|
||||||
|
sock->timedOutRetries = 0;
|
||||||
|
sock->refusedRetries = 0;
|
||||||
|
sock->abortFlag = abortFlag;
|
||||||
|
sock->asyncFlag = asyncFlag;
|
||||||
|
sock->state = ncclSocketStateInitialized;
|
||||||
|
sock->magic = magic;
|
||||||
|
sock->type = type;
|
||||||
sock->fd = -1;
|
sock->fd = -1;
|
||||||
|
sock->acceptFd = -1;
|
||||||
|
|
||||||
if (addr) {
|
if (addr) {
|
||||||
|
/* IPv4/IPv6 support */
|
||||||
|
int family;
|
||||||
memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
|
memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
|
||||||
|
family = sock->addr.sa.sa_family;
|
||||||
|
if (family != AF_INET && family != AF_INET6) {
|
||||||
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
|
WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
|
||||||
|
ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
|
||||||
|
ret = ncclInternalError;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
||||||
|
|
||||||
|
/* Connect to a hostname / port */
|
||||||
|
sock->fd = socket(family, SOCK_STREAM, 0);
|
||||||
|
if (sock->fd == -1) {
|
||||||
|
WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno));
|
||||||
|
ret = ncclSystemError;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
|
memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
|
||||||
}
|
}
|
||||||
sock->abortFlag = abortFlag;
|
|
||||||
sock->asyncFlag = asyncFlag;
|
|
||||||
sock->state = ncclSocketStateNum;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
|
/* Set socket as non-blocking if async or if we need to be able to abort */
|
||||||
int bytes = 0;
|
if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
|
||||||
*closed = 0;
|
int flags;
|
||||||
char* data = (char*)ptr;
|
EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
|
||||||
do {
|
|
||||||
if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
|
||||||
if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
|
|
||||||
if (op == NCCL_SOCKET_RECV && bytes == 0) {
|
|
||||||
*closed = 1;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
}
|
||||||
if (bytes == -1) {
|
|
||||||
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
|
exit:
|
||||||
WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
return ret;
|
||||||
return ncclRemoteError;
|
fail:
|
||||||
} else {
|
goto exit;
|
||||||
bytes = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(*offset) += bytes;
|
|
||||||
if (sock->abortFlag && *sock->abortFlag != 0) {
|
|
||||||
INFO(NCCL_NET, "Socket progress: abort called");
|
|
||||||
return ncclInternalError;
|
|
||||||
}
|
|
||||||
} while (bytes > 0 && (*offset) < size);
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
|
ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
|
||||||
int closed;
|
if (sock == NULL) {
|
||||||
NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
|
WARN("ncclSocketProgress: pass NULL socket");
|
||||||
if (closed) {
|
return ncclInvalidArgument;
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
|
||||||
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
|
||||||
return ncclRemoteError;
|
|
||||||
}
|
}
|
||||||
|
NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
|
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
|
||||||
while (*offset < size)
|
if (sock == NULL) {
|
||||||
NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset));
|
WARN("ncclSocketWait: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
NCCLCHECK(socketWait(op, sock, ptr, size, offset));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
|
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
|
if (sock == NULL) {
|
||||||
|
WARN("ncclSocketSend: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
if (sock->state != ncclSocketStateReady) {
|
||||||
|
WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
|
||||||
|
return ncclInternalError;
|
||||||
|
}
|
||||||
|
NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
|
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
|
if (sock == NULL) {
|
||||||
|
WARN("ncclSocketRecv: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
if (sock->state != ncclSocketStateReady) {
|
||||||
|
WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
|
||||||
|
return ncclInternalError;
|
||||||
|
}
|
||||||
|
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Receive or detect connection closed
|
// Receive or detect connection closed
|
||||||
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
|
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
if (sock == NULL) {
|
||||||
|
WARN("ncclSocketTryRecv: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
*closed = 0;
|
*closed = 0;
|
||||||
while (offset < size) {
|
while (offset < size) {
|
||||||
NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
|
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
|
||||||
if (*closed) return ncclSuccess;
|
if (*closed) return ncclSuccess;
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
|
||||||
|
if (sock != NULL) {
|
||||||
|
if (sock->fd >= 0) close(sock->fd);
|
||||||
|
sock->state = ncclSocketStateClosed;
|
||||||
|
sock->fd = -1;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) {
|
||||||
|
if (sock == NULL) {
|
||||||
|
WARN("ncclSocketGetFd: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
if (fd) *fd = sock->fd;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) {
|
||||||
|
if (sock == NULL) {
|
||||||
|
WARN("ncclSocketGetFd: pass NULL socket");
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
sock->fd = fd;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
@ -305,7 +305,8 @@ static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bN
|
|||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclStrongStreamWaitStream(
|
ncclResult_t ncclStrongStreamWaitStream(
|
||||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
|
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b,
|
||||||
|
bool b_subsumes_a
|
||||||
) {
|
) {
|
||||||
#if CUDART_VERSION >= 11030
|
#if CUDART_VERSION >= 11030
|
||||||
if (graph.graph == nullptr) {
|
if (graph.graph == nullptr) {
|
||||||
@ -319,6 +320,7 @@ ncclResult_t ncclStrongStreamWaitStream(
|
|||||||
NCCLCHECK(checkGraphId(ag, graph.graphId));
|
NCCLCHECK(checkGraphId(ag, graph.graphId));
|
||||||
struct ncclStrongStreamGraph* bg = b->graphHead;
|
struct ncclStrongStreamGraph* bg = b->graphHead;
|
||||||
NCCLCHECK(checkGraphId(bg, graph.graphId));
|
NCCLCHECK(checkGraphId(bg, graph.graphId));
|
||||||
|
if (b_subsumes_a) ag->tipCount = 0;
|
||||||
mergeTips(ag, bg->tipNodes, bg->tipCount);
|
mergeTips(ag, bg->tipNodes, bg->tipCount);
|
||||||
}
|
}
|
||||||
a->serialEventNeedsRecord = true;
|
a->serialEventNeedsRecord = true;
|
||||||
@ -330,7 +332,8 @@ ncclResult_t ncclStrongStreamWaitStream(
|
|||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclStrongStreamWaitStream(
|
ncclResult_t ncclStrongStreamWaitStream(
|
||||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
|
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b,
|
||||||
|
bool b_subsumes_a
|
||||||
) {
|
) {
|
||||||
#if CUDART_VERSION >= 11030
|
#if CUDART_VERSION >= 11030
|
||||||
if (graph.graph == nullptr) {
|
if (graph.graph == nullptr) {
|
||||||
@ -351,6 +354,7 @@ ncclResult_t ncclStrongStreamWaitStream(
|
|||||||
}
|
}
|
||||||
struct ncclStrongStreamGraph* ag = a->graphHead;
|
struct ncclStrongStreamGraph* ag = a->graphHead;
|
||||||
NCCLCHECK(checkGraphId(ag, graph.graphId));
|
NCCLCHECK(checkGraphId(ag, graph.graphId));
|
||||||
|
if (b_subsumes_a) ag->tipCount = 0;
|
||||||
mergeTips(ag, bNodes, bCount);
|
mergeTips(ag, bNodes, bCount);
|
||||||
}
|
}
|
||||||
a->serialEventNeedsRecord = true;
|
a->serialEventNeedsRecord = true;
|
||||||
@ -362,7 +366,8 @@ ncclResult_t ncclStrongStreamWaitStream(
|
|||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclStrongStreamWaitStream(
|
ncclResult_t ncclStrongStreamWaitStream(
|
||||||
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
|
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b,
|
||||||
|
bool b_subsumes_a
|
||||||
) {
|
) {
|
||||||
#if CUDART_VERSION >= 11030
|
#if CUDART_VERSION >= 11030
|
||||||
if (graph.graph == nullptr) {
|
if (graph.graph == nullptr) {
|
||||||
@ -374,7 +379,9 @@ ncclResult_t ncclStrongStreamWaitStream(
|
|||||||
} else {
|
} else {
|
||||||
struct ncclStrongStreamGraph* bg = b->graphHead;
|
struct ncclStrongStreamGraph* bg = b->graphHead;
|
||||||
NCCLCHECK(checkGraphId(bg, graph.graphId));
|
NCCLCHECK(checkGraphId(bg, graph.graphId));
|
||||||
CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, cudaStreamAddCaptureDependencies));
|
CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount,
|
||||||
|
b_subsumes_a ? cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies
|
||||||
|
));
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
|
CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
|
||||||
|
@ -123,7 +123,7 @@ const char* pncclGetErrorString(ncclResult_t result);
|
|||||||
* comm is currently unused and can be set to NULL
|
* comm is currently unused and can be set to NULL
|
||||||
*/
|
*/
|
||||||
const char* ncclGetLastError(ncclComm_t comm);
|
const char* ncclGetLastError(ncclComm_t comm);
|
||||||
const char* pncclGetError(ncclComm_t comm);
|
const char* pncclGetLastError(ncclComm_t comm);
|
||||||
|
|
||||||
/* Checks whether the comm has encountered any asynchronous errors */
|
/* Checks whether the comm has encountered any asynchronous errors */
|
||||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||||
|
175
src/proxy.cc
175
src/proxy.cc
@ -437,8 +437,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
|||||||
|
|
||||||
int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS;
|
int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS;
|
||||||
|
|
||||||
// If nNodes > 1 and we're using Simple, reduce the stepSize to increase shared buffer utilization
|
if (op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pChunkSize;
|
||||||
if (info->comm->nNodes > 1 && op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pNetChunkSize;
|
|
||||||
info->chunkSize = stepSize;
|
info->chunkSize = stepSize;
|
||||||
op->root = info->root;
|
op->root = info->root;
|
||||||
|
|
||||||
@ -532,6 +531,8 @@ static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressS
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
|
||||||
|
|
||||||
static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
|
static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
|
||||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||||
if (state->opsPool == NULL) return ncclInternalError;
|
if (state->opsPool == NULL) return ncclInternalError;
|
||||||
@ -570,9 +571,16 @@ process_nextops:
|
|||||||
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
|
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
|
||||||
for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
|
for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
|
||||||
|
|
||||||
|
uint64_t lastOpCount = 0;
|
||||||
|
int lastPeer = -1;
|
||||||
|
int count = 0;
|
||||||
for (int opIndex = state->nextOps; opIndex != -1;) {
|
for (int opIndex = state->nextOps; opIndex != -1;) {
|
||||||
struct ncclProxyOp* peerOp = pool->ops+opIndex;
|
struct ncclProxyOp* peerOp = pool->ops+opIndex;
|
||||||
int peer = opIndex / MAX_OPS_PER_PEER;
|
int peer = opIndex / MAX_OPS_PER_PEER;
|
||||||
|
if ((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer)) count++;
|
||||||
|
if (count == ncclParamProxyAppendBatchSize()+1) break;
|
||||||
|
lastOpCount = peerOp->opCount;
|
||||||
|
lastPeer = peer;
|
||||||
if (peerOp->connection == NULL) return ncclInternalError;
|
if (peerOp->connection == NULL) return ncclInternalError;
|
||||||
if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
|
if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
|
||||||
NCCLCHECK(ProxyAppend(state, peerOp));
|
NCCLCHECK(ProxyAppend(state, peerOp));
|
||||||
@ -676,7 +684,7 @@ void* ncclProxyProgress(void *comm_) {
|
|||||||
|
|
||||||
int lastIdle = 0;
|
int lastIdle = 0;
|
||||||
struct ncclProxyArgs profArgs; // Only used for profiling purposes
|
struct ncclProxyArgs profArgs; // Only used for profiling purposes
|
||||||
while (state->stop == 0 && *comm->abortFlag == 0) {
|
while ((state->stop == false || (state->stop == true && state->active)) && *comm->abortFlag == 0) {
|
||||||
int idle = 1;
|
int idle = 1;
|
||||||
ncclResult_t ret = progressOps(comm, state, state->active, &idle);
|
ncclResult_t ret = progressOps(comm, state, state->active, &idle);
|
||||||
if (ret != ncclSuccess) {
|
if (ret != ncclSuccess) {
|
||||||
@ -686,9 +694,9 @@ void* ncclProxyProgress(void *comm_) {
|
|||||||
}
|
}
|
||||||
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
|
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
|
||||||
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
|
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
|
||||||
if (idle) {
|
|
||||||
int added = 0;
|
int added = 0;
|
||||||
TIME_START(3);
|
TIME_START(3);
|
||||||
|
if (state->stop == false)
|
||||||
ret = ncclProxyGetPostedOps(comm, &added);
|
ret = ncclProxyGetPostedOps(comm, &added);
|
||||||
if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
|
if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
|
||||||
if (ret != ncclSuccess) {
|
if (ret != ncclSuccess) {
|
||||||
@ -698,7 +706,6 @@ void* ncclProxyProgress(void *comm_) {
|
|||||||
if (added == 0) {
|
if (added == 0) {
|
||||||
sched_yield(); // No request progressed. Let others run.
|
sched_yield(); // No request progressed. Let others run.
|
||||||
}
|
}
|
||||||
}
|
|
||||||
lastIdle = idle;
|
lastIdle = idle;
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -814,7 +821,7 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
|
|||||||
int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
|
int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
|
||||||
for (int i=0; i<max; i++) {
|
for (int i=0; i<max; i++) {
|
||||||
ncclProxyConnection *connection = pool->pools[b]+i;
|
ncclProxyConnection *connection = pool->pools[b]+i;
|
||||||
if (connection->initFlag == true) {
|
if (connection->state != connUninitialized) {
|
||||||
NCCLCHECK(proxyFree(connection, comm));
|
NCCLCHECK(proxyFree(connection, comm));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -827,6 +834,10 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
|
|||||||
#include "transport.h"
|
#include "transport.h"
|
||||||
|
|
||||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
|
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
|
||||||
|
struct ncclSocket* sock;
|
||||||
|
int ready;
|
||||||
|
int type = ncclProxyMsgInit;
|
||||||
|
|
||||||
// Keep one connection per mlocal rank
|
// Keep one connection per mlocal rank
|
||||||
proxyConn->connection = NULL;
|
proxyConn->connection = NULL;
|
||||||
proxyConn->rank = rank;
|
proxyConn->rank = rank;
|
||||||
@ -834,17 +845,18 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
|||||||
NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
|
NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
|
||||||
NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
|
NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
|
||||||
NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
|
NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
|
||||||
for (int r=0; r<comm->localRanks; r++) {
|
for (int i = 0; i < comm->localRanks; ++i) {
|
||||||
NCCLCHECK(ncclSocketInit(&comm->proxyState.peerSocks[r], NULL, comm->abortFlag, 0));
|
NCCLCHECK(ncclSocketSetFd(-1, &comm->proxyState.peerSocks[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
|
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
|
||||||
struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank;
|
sock = comm->proxyState.peerSocks + proxyConn->localRank;
|
||||||
if (sock->fd == -1) {
|
NCCLCHECK(ncclSocketReady(sock, &ready));
|
||||||
memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress));
|
if (!ready) {
|
||||||
|
NCCLCHECK(ncclSocketInit(sock, comm->proxyState.peerAddresses+rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
|
||||||
NCCLCHECK(ncclSocketConnect(sock));
|
NCCLCHECK(ncclSocketConnect(sock));
|
||||||
}
|
}
|
||||||
int type = ncclProxyMsgInit;
|
|
||||||
NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
|
NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
|
||||||
NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
|
NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
|
||||||
NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
|
NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
|
||||||
@ -857,7 +869,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
|||||||
NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
|
NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
|
||||||
struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
|
struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
|
||||||
if (proxyOps->pool == NULL) {
|
if (proxyOps->pool == NULL) {
|
||||||
NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0));
|
NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
|
||||||
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
|
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -868,11 +880,12 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
|||||||
|
|
||||||
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
|
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
|
||||||
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
|
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
|
||||||
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
|
struct ncclSocket* sock;
|
||||||
struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank;
|
ncclResult_t ret = ncclSuccess;
|
||||||
if (sock->fd == -1) return ncclInternalError;
|
|
||||||
ncclResult_t ret;
|
|
||||||
|
|
||||||
|
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
|
||||||
|
sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank;
|
||||||
|
if (sock == NULL) return ncclInternalError;
|
||||||
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
|
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
|
||||||
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
|
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
|
||||||
NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
|
NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
|
||||||
@ -882,7 +895,6 @@ ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void*
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
error:
|
error:
|
||||||
WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
|
WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
|
||||||
sock->fd = -1;
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -892,16 +904,15 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
|
|||||||
int size = sizeof(struct ncclProxyOpsPool);
|
int size = sizeof(struct ncclProxyOpsPool);
|
||||||
struct ncclProxyOpsPool* pool = NULL;
|
struct ncclProxyOpsPool* pool = NULL;
|
||||||
|
|
||||||
char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
|
|
||||||
shmPath[0] = '\0';
|
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1));
|
|
||||||
|
|
||||||
// Init pool
|
|
||||||
pool->nextOps = -1;
|
|
||||||
|
|
||||||
// The service thread may be launched already but localRanks may not be set yet.
|
// The service thread may be launched already but localRanks may not be set yet.
|
||||||
while (comm->localRanks == 0) sched_yield();
|
while (comm->localRanks == 0) sched_yield();
|
||||||
|
|
||||||
|
char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
|
||||||
|
shmPath[0] = '\0';
|
||||||
|
NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, comm->localRanks + 1, &state->handle));
|
||||||
|
// Init pool
|
||||||
|
pool->nextOps = -1;
|
||||||
|
|
||||||
for (int r=0; r<comm->localRanks; r++) {
|
for (int r=0; r<comm->localRanks; r++) {
|
||||||
pool->freeOps[r] = r*MAX_OPS_PER_PEER;
|
pool->freeOps[r] = r*MAX_OPS_PER_PEER;
|
||||||
for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
|
for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
|
||||||
@ -928,7 +939,7 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
|
|||||||
|
|
||||||
static void proxyOpsFree(struct ncclComm* comm) {
|
static void proxyOpsFree(struct ncclComm* comm) {
|
||||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||||
if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) {
|
if (ncclShmClose(state->handle) != ncclSuccess) {
|
||||||
WARN("[Service thread] shm close failed");
|
WARN("[Service thread] shm close failed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -937,10 +948,8 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
|
|||||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||||
if (state->opsPool == NULL) return ncclSuccess;
|
if (state->opsPool == NULL) return ncclSuccess;
|
||||||
|
|
||||||
char shmPath[] = "/dev/shm/nccl-XXXXXX";
|
if (ncclShmUnlink(state->handle) != ncclSuccess) {
|
||||||
memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1);
|
WARN("[Service thread] proxy ops shm unlink failed");
|
||||||
if (ncclShmUnlink(shmPath) != ncclSuccess) {
|
|
||||||
WARN("[Service thread] shm unlink failed");
|
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -965,7 +974,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
|||||||
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
|
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
|
||||||
}
|
}
|
||||||
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
|
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
|
||||||
__atomic_store_n(&connection->initFlag, true, __ATOMIC_RELEASE);
|
__atomic_store_n(&connection->state, connInitialized, __ATOMIC_RELEASE);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -980,6 +989,7 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
|
|||||||
int nChannels;
|
int nChannels;
|
||||||
NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
|
NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
|
||||||
if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
|
if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
|
||||||
|
__atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -991,14 +1001,29 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
|
|||||||
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
|
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
|
||||||
} else return ncclInternalError;
|
} else return ncclInternalError;
|
||||||
if (done) {
|
if (done) {
|
||||||
if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
|
if (op->type == ncclProxyMsgSetup)
|
||||||
if (op->reqBuff) free(op->reqBuff);
|
__atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
|
||||||
if (op->respBuff) free(op->respBuff);
|
else if (op->type == ncclProxyMsgConnect)
|
||||||
|
__atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
|
||||||
|
/* if setup or connect is done, we should not return any error at this point since
|
||||||
|
* ncclSocketSend might already send the respBuff to the requester. If we still choose
|
||||||
|
* to abort and close the connection, it can cause segfault if the requester is using
|
||||||
|
* the respBuff. */
|
||||||
|
if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
|
||||||
|
if (op->reqBuff) {
|
||||||
|
free(op->reqBuff);
|
||||||
op->reqBuff = NULL;
|
op->reqBuff = NULL;
|
||||||
|
}
|
||||||
|
if (op->respBuff) {
|
||||||
|
free(op->respBuff);
|
||||||
op->respBuff = NULL;
|
op->respBuff = NULL;
|
||||||
|
}
|
||||||
op->type = 0;
|
op->type = 0;
|
||||||
(*asyncOpCount)--;
|
(*asyncOpCount)--;
|
||||||
|
} else if (*comm->abortFlag != 0) {
|
||||||
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1042,36 +1067,52 @@ void* ncclProxyService(void* _args) {
|
|||||||
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
|
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
|
||||||
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
|
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
|
||||||
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
|
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
|
||||||
ncclSocketInit(&peers[s].sock, NULL, comm->abortFlag, 0);
|
|
||||||
pollfds[s].fd = -1;
|
pollfds[s].fd = -1;
|
||||||
pollfds[s].events = POLLHUP|POLLIN;
|
pollfds[s].events = POLLHUP|POLLIN;
|
||||||
}
|
}
|
||||||
pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
|
if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
|
||||||
|
WARN("[Proxy Service] Get listenSock fd fails\n");
|
||||||
|
return NULL;
|
||||||
|
};
|
||||||
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
|
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
|
||||||
|
|
||||||
int maxnpeers = 0;
|
int maxnpeers = 0;
|
||||||
int npeers = 0;
|
int npeers = 0;
|
||||||
int stop = 0;
|
int stop = 0;
|
||||||
int asyncOpCount = 0;
|
int asyncOpCount = 0;
|
||||||
while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) {
|
while (stop == 0 || (stop == 1 && npeers > 0)) {
|
||||||
|
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
|
||||||
|
* connections. Need to wait until all other related comms call abort and safely exit
|
||||||
|
* together, or we could face segmentation fault. */
|
||||||
|
if (*comm->abortFlag != 0) stop = 1;
|
||||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||||
if (poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500) < 0) {
|
int ret;
|
||||||
WARN("[Proxy Service] Poll failed: %s\n", strerror(errno));
|
do {
|
||||||
|
ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500);
|
||||||
|
} while (ret < 0 && errno == EINTR);
|
||||||
|
if (ret < 0) {
|
||||||
|
WARN("[Proxy Service] Poll failed: %s", strerror(errno));
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
|
if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
|
||||||
int s = 0;
|
int s = 0;
|
||||||
while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++;
|
while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++;
|
||||||
if (s == NCCL_MAX_LOCAL_RANKS) {
|
if (s == NCCL_MAX_LOCAL_RANKS) {
|
||||||
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
|
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (maxnpeers < s+1) maxnpeers = s+1;
|
if (maxnpeers < s+1) maxnpeers = s+1;
|
||||||
struct ncclSocket* sock = &peers[s].sock;
|
if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
|
||||||
if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) {
|
WARN("[Service thread] Initialize peers[%d].sock fails\n", s);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
|
||||||
WARN("[Service thread] Accept failed %s", strerror(errno));
|
WARN("[Service thread] Accept failed %s", strerror(errno));
|
||||||
} else {
|
} else {
|
||||||
pollfds[s].fd = sock->fd;
|
if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
|
||||||
|
WARN("[Service thread] Get peers[%d].sock fd fails\n", s);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
npeers++;
|
npeers++;
|
||||||
peers[s].localRank = -1;
|
peers[s].localRank = -1;
|
||||||
}
|
}
|
||||||
@ -1083,10 +1124,12 @@ void* ncclProxyService(void* _args) {
|
|||||||
int closeConn = 0;
|
int closeConn = 0;
|
||||||
int type = 0;
|
int type = 0;
|
||||||
ncclResult_t res = ncclSuccess;
|
ncclResult_t res = ncclSuccess;
|
||||||
|
|
||||||
|
if (pollfds[s].fd == -1) continue;
|
||||||
if (op->type != 0) {
|
if (op->type != 0) {
|
||||||
res = proxyProgressAsync(op, comm, &asyncOpCount);
|
res = proxyProgressAsync(op, comm, &asyncOpCount);
|
||||||
type = op->type;
|
type = op->type;
|
||||||
if (res != ncclSuccess) op->type = 0;
|
if (res != ncclSuccess) closeConn = 1;
|
||||||
} else if (pollfds[s].revents & POLLIN) {
|
} else if (pollfds[s].revents & POLLIN) {
|
||||||
int closed;
|
int closed;
|
||||||
if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
|
if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
|
||||||
@ -1120,26 +1163,31 @@ void* ncclProxyService(void* _args) {
|
|||||||
closeConn = 1;
|
closeConn = 1;
|
||||||
}
|
}
|
||||||
if (closeConn) {
|
if (closeConn) {
|
||||||
close(sock->fd);
|
ncclSocketClose(sock);
|
||||||
sock->fd = pollfds[s].fd = -1;
|
if (op->reqBuff) {
|
||||||
|
free(op->reqBuff);
|
||||||
|
op->reqBuff = NULL;
|
||||||
|
}
|
||||||
|
if (op->respBuff) {
|
||||||
|
free(op->respBuff);
|
||||||
|
op->respBuff = NULL;
|
||||||
|
}
|
||||||
|
op->type = 0;
|
||||||
|
pollfds[s].fd = -1;
|
||||||
npeers--;
|
npeers--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* wait until main thread flush all NCCL operations. */
|
|
||||||
while (*comm->abortFlag != 0 && __atomic_load_n(&comm->proxyState.safeAbortFlag, __ATOMIC_ACQUIRE) == 0)
|
|
||||||
usleep(1000);
|
|
||||||
|
|
||||||
// Wait for all operations to complete and stop progress thread before freeing any resource
|
// Wait for all operations to complete and stop progress thread before freeing any resource
|
||||||
if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
|
if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
|
||||||
WARN("[Proxy Service] proxyDestroy failed");
|
WARN("[Proxy Service] proxyDestroy failed");
|
||||||
}
|
}
|
||||||
for (int s=0; s<maxnpeers; s++) {
|
for (int s=0; s<maxnpeers; s++) {
|
||||||
if (peers[s].sock.fd != -1) close(peers[s].sock.fd);
|
ncclSocketClose(&peers[s].sock);
|
||||||
}
|
}
|
||||||
ncclProxyFreeConnections(&connectionPool, comm);
|
ncclProxyFreeConnections(&connectionPool, comm);
|
||||||
close(comm->proxyState.listenSock->fd);
|
ncclSocketClose(comm->proxyState.listenSock);
|
||||||
free(comm->proxyState.listenSock);
|
|
||||||
proxyOpsFree(comm);
|
proxyOpsFree(comm);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@ -1164,32 +1212,29 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
|
|||||||
if (state->peerAddresses) {
|
if (state->peerAddresses) {
|
||||||
if (*comm->abortFlag == 0) {
|
if (*comm->abortFlag == 0) {
|
||||||
struct ncclSocket sock;
|
struct ncclSocket sock;
|
||||||
sock.abortFlag = NULL;
|
|
||||||
sock.asyncFlag = 0;
|
|
||||||
memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
|
|
||||||
NCCLCHECK(ncclSocketConnect(&sock));
|
|
||||||
int type = ncclProxyMsgStop;
|
int type = ncclProxyMsgStop;
|
||||||
|
NCCLCHECK(ncclSocketInit(&sock, comm->proxyState.peerAddresses + comm->rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
|
||||||
|
NCCLCHECK(ncclSocketConnect(&sock));
|
||||||
NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
|
NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
|
||||||
close(sock.fd);
|
NCCLCHECK(ncclSocketClose(&sock));
|
||||||
} else {
|
|
||||||
/* when abortFlag is set, all socket related communications are no longer reliable. We need to
|
|
||||||
* set a flag to let proxy thread exit. */
|
|
||||||
__atomic_store_n(&state->safeAbortFlag, 1, __ATOMIC_RELEASE);
|
|
||||||
}
|
}
|
||||||
free(state->peerAddresses);
|
free(state->peerAddresses);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (state->peerSocks) {
|
if (state->peerSocks) {
|
||||||
for (int i=0; i<comm->localRanks; i++) {
|
for (int i=0; i<comm->localRanks; i++) {
|
||||||
if (state->peerSocks[i].fd != -1) {
|
int fd;
|
||||||
|
NCCLCHECK(ncclSocketGetFd(state->peerSocks + i, &fd));
|
||||||
|
if (fd >= 0) {
|
||||||
if (state->proxyOps[i].pool) {
|
if (state->proxyOps[i].pool) {
|
||||||
NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool)));
|
NCCLCHECK(ncclShmClose(state->proxyOps[i].handle));
|
||||||
}
|
}
|
||||||
if (state->sharedDevMems[i]) {
|
if (state->sharedDevMems[i]) {
|
||||||
CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
|
CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
|
||||||
}
|
}
|
||||||
int type = ncclProxyMsgClose;
|
int type = ncclProxyMsgClose;
|
||||||
if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int)));
|
if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks + i, &type, sizeof(int)));
|
||||||
close(state->peerSocks[i].fd);
|
NCCLCHECK(ncclSocketClose(state->peerSocks + i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(state->peerSocks);
|
free(state->peerSocks);
|
||||||
|
@ -67,12 +67,11 @@ void dumpData(struct ncclConnect* data, int ndata) {
|
|||||||
|
|
||||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
||||||
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||||
|
|
||||||
cudaStream_t transportSetupStream;
|
|
||||||
CUDACHECK(cudaStreamCreateWithFlags(&transportSetupStream, cudaStreamNonBlocking));
|
|
||||||
|
|
||||||
struct ncclConnect data[2*MAXCHANNELS];
|
struct ncclConnect data[2*MAXCHANNELS];
|
||||||
|
|
||||||
|
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
|
||||||
for (int i=1; i<comm->nRanks; i++) {
|
for (int i=1; i<comm->nRanks; i++) {
|
||||||
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
|
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
|
||||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||||
@ -86,7 +85,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
TIME_START(0);
|
TIME_START(0);
|
||||||
for (int c=0; c<MAXCHANNELS; c++) {
|
for (int c=0; c<MAXCHANNELS; c++) {
|
||||||
if (recvMask & (1UL<<c)) {
|
if (recvMask & (1UL<<c)) {
|
||||||
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
|
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
|
||||||
if (type > highestType) highestType = type;
|
if (type > highestType) highestType = type;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -95,7 +94,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
struct ncclConnect* sendData = recvData+recvChannels;
|
struct ncclConnect* sendData = recvData+recvChannels;
|
||||||
for (int c=0; c<MAXCHANNELS; c++) {
|
for (int c=0; c<MAXCHANNELS; c++) {
|
||||||
if (sendMask & (1UL<<c)) {
|
if (sendMask & (1UL<<c)) {
|
||||||
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
|
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
|
||||||
if (type > highestType) highestType = type;
|
if (type > highestType) highestType = type;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -104,16 +103,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
TIME_START(2);
|
TIME_START(2);
|
||||||
if (sendPeer == recvPeer) {
|
if (sendPeer == recvPeer) {
|
||||||
if (recvChannels+sendChannels) {
|
if (recvChannels+sendChannels) {
|
||||||
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
|
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
|
||||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
|
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
|
||||||
sendData = data;
|
sendData = data;
|
||||||
recvData = data+sendChannels;
|
recvData = data+sendChannels;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
|
if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
|
||||||
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
|
if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
|
||||||
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
|
if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
|
||||||
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
|
if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
|
||||||
}
|
}
|
||||||
TIME_STOP(2);
|
TIME_STOP(2);
|
||||||
|
|
||||||
@ -121,9 +120,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
for (int c=0; c<MAXCHANNELS; c++) {
|
for (int c=0; c<MAXCHANNELS; c++) {
|
||||||
if (sendMask & (1UL<<c)) {
|
if (sendMask & (1UL<<c)) {
|
||||||
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
||||||
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
|
||||||
conn->connected = 1;
|
conn->connected = 1;
|
||||||
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream));
|
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TIME_STOP(3);
|
TIME_STOP(3);
|
||||||
@ -131,19 +130,23 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
|||||||
for (int c=0; c<MAXCHANNELS; c++) {
|
for (int c=0; c<MAXCHANNELS; c++) {
|
||||||
if (recvMask & (1UL<<c)) {
|
if (recvMask & (1UL<<c)) {
|
||||||
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
||||||
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
|
||||||
conn->connected = 1;
|
conn->connected = 1;
|
||||||
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream));
|
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TIME_STOP(4);
|
TIME_STOP(4);
|
||||||
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
|
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
|
||||||
}
|
}
|
||||||
CUDACHECK(cudaStreamSynchronize(transportSetupStream));
|
|
||||||
CUDACHECK(cudaStreamDestroy(transportSetupStream));
|
|
||||||
if (highestTransportType != NULL) *highestTransportType = highestType;
|
if (highestTransportType != NULL) *highestTransportType = highestType;
|
||||||
TIME_PRINT("P2P Setup/Connect");
|
TIME_PRINT("P2P Setup/Connect");
|
||||||
return ncclSuccess;
|
exit:
|
||||||
|
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
|
||||||
|
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
|
||||||
|
return ret;
|
||||||
|
fail:
|
||||||
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern struct ncclTransport collNetTransport;
|
extern struct ncclTransport collNetTransport;
|
||||||
|
@ -63,6 +63,7 @@ struct connectMapMem{
|
|||||||
char shmPath[PATH_MAX];
|
char shmPath[PATH_MAX];
|
||||||
cudaIpcMemHandle_t ipc;
|
cudaIpcMemHandle_t ipc;
|
||||||
};
|
};
|
||||||
|
ncclShmHandle_t handle;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct connectMap {
|
struct connectMap {
|
||||||
@ -224,13 +225,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t netMapShm(struct connectMapMem* mem) {
|
static ncclResult_t netMapShm(struct connectMapMem* mem) {
|
||||||
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0));
|
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->handle));
|
||||||
NCCLCHECK(ncclShmUnlink(mem->shmPath));
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
static ncclResult_t netCreateShm(struct connectMapMem* mem) {
|
static ncclResult_t netCreateShm(struct connectMapMem* mem) {
|
||||||
mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
|
mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
|
||||||
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1));
|
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->handle));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -339,7 +339,7 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
|
|||||||
struct connectMap* map = (struct connectMap*)(send->transportResources);
|
struct connectMap* map = (struct connectMap*)(send->transportResources);
|
||||||
if (map) {
|
if (map) {
|
||||||
if (map->sameProcess == 0) {
|
if (map->sameProcess == 0) {
|
||||||
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
|
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].handle));
|
||||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||||
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||||
}
|
}
|
||||||
@ -372,7 +372,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
|
|||||||
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
|
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
|
||||||
state->refcount++;
|
state->refcount++;
|
||||||
if (state->size == 0) {
|
if (state->size == 0) {
|
||||||
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pNetChunkSize;
|
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (size) *size = state->size;
|
if (size) *size = state->size;
|
||||||
@ -399,7 +399,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
|
|||||||
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
|
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
|
||||||
// Use different pools for different channels and also separate send/recv.
|
// Use different pools for different channels and also separate send/recv.
|
||||||
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
|
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
|
||||||
*offset = comm->p2pNetChunkSize * globalSlot;
|
*offset = comm->p2pChunkSize * globalSlot;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -523,6 +523,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
|||||||
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
|
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
|
||||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (resources->netSendComm == NULL) {
|
if (resources->netSendComm == NULL) {
|
||||||
*done = 0;
|
*done = 0;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -657,6 +658,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
|||||||
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
|
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
|
||||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (resources->netRecvComm == NULL) {
|
if (resources->netRecvComm == NULL) {
|
||||||
*done = 0;
|
*done = 0;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -746,10 +748,12 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
|||||||
|
|
||||||
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||||
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
|
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
|
||||||
if (resources == NULL) { // NVB Preconnect
|
if (connection->state == connSharedInitialized) { // NVB Preconnect
|
||||||
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
|
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (connection->state == connConnected) {
|
||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||||
if (resources->buffers[p]) {
|
if (resources->buffers[p]) {
|
||||||
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
|
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
|
||||||
@ -759,7 +763,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
|||||||
if (resources->map.sameProcess) {
|
if (resources->map.sameProcess) {
|
||||||
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
||||||
} else {
|
} else {
|
||||||
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size));
|
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].handle));
|
||||||
}
|
}
|
||||||
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
||||||
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
|
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
|
||||||
@ -775,16 +779,20 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
|||||||
} else {
|
} else {
|
||||||
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
|
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
|
||||||
}
|
}
|
||||||
free(resources);
|
}
|
||||||
|
|
||||||
|
if (connection->state == connSetupDone) free(resources);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||||
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
|
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
|
||||||
if (resources == NULL) { // NVB Preconnect
|
if (connection->state == connSharedInitialized) { // NVB Preconnect
|
||||||
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
|
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (connection->state == connConnected) {
|
||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||||
if (resources->buffers[p]) {
|
if (resources->buffers[p]) {
|
||||||
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
|
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
|
||||||
@ -806,7 +814,9 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
|||||||
} else {
|
} else {
|
||||||
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
|
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
|
||||||
}
|
}
|
||||||
free(resources);
|
}
|
||||||
|
|
||||||
|
if (connection->state == connSetupDone) free(resources);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,6 +57,7 @@ struct alignas(64) ncclIbDev {
|
|||||||
int realPort;
|
int realPort;
|
||||||
int maxQp;
|
int maxQp;
|
||||||
struct ncclIbMrCache mrCache;
|
struct ncclIbMrCache mrCache;
|
||||||
|
int ar; // ADAPTIVE_ROUTING
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAX_IB_PORT 15
|
#define MAX_IB_PORT 15
|
||||||
@ -80,6 +81,7 @@ NCCL_PARAM(IbSl, "IB_SL", 0);
|
|||||||
NCCL_PARAM(IbTc, "IB_TC", 0);
|
NCCL_PARAM(IbTc, "IB_TC", 0);
|
||||||
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
|
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
|
||||||
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
|
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
|
||||||
|
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
|
||||||
|
|
||||||
pthread_t ncclIbAsyncThread;
|
pthread_t ncclIbAsyncThread;
|
||||||
static void* ncclIbAsyncThreadMain(void* args) {
|
static void* ncclIbAsyncThreadMain(void* args) {
|
||||||
@ -221,6 +223,11 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
|||||||
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
|
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
|
||||||
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
|
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
|
||||||
|
|
||||||
|
// Enable ADAPTIVE_ROUTING by default on IB networks
|
||||||
|
// But allow it to be overloaded by an env parameter
|
||||||
|
ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
|
||||||
|
if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
|
||||||
|
|
||||||
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
|
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
|
||||||
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
|
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
|
||||||
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
|
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
|
||||||
@ -298,11 +305,6 @@ failure:
|
|||||||
return ncclSystemError;
|
return ncclSystemError;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
|
|
||||||
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define NCCL_NET_IB_MAX_RECVS 8
|
#define NCCL_NET_IB_MAX_RECVS 8
|
||||||
|
|
||||||
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
|
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
|
||||||
@ -364,6 +366,7 @@ struct ncclIbCommStage {
|
|||||||
|
|
||||||
struct ncclIbHandle {
|
struct ncclIbHandle {
|
||||||
union ncclSocketAddress connectAddr; // Filled by the target
|
union ncclSocketAddress connectAddr; // Filled by the target
|
||||||
|
uint64_t magic; // random number to help debugging
|
||||||
struct ncclIbCommStage stage; // Used by the other side when connecting
|
struct ncclIbCommStage stage; // Used by the other side when connecting
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -376,7 +379,7 @@ struct ncclIbRequest {
|
|||||||
struct ncclIbVerbs* verbs;
|
struct ncclIbVerbs* verbs;
|
||||||
int type;
|
int type;
|
||||||
int events;
|
int events;
|
||||||
union ncclSocketAddress *addr;
|
struct ncclSocket* sock;
|
||||||
int nreqs;
|
int nreqs;
|
||||||
union {
|
union {
|
||||||
struct {
|
struct {
|
||||||
@ -427,6 +430,7 @@ struct ncclIbSendComm {
|
|||||||
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
|
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
|
||||||
int nqps;
|
int nqps;
|
||||||
struct ibv_mr* fifoMr;
|
struct ibv_mr* fifoMr;
|
||||||
|
int ar;
|
||||||
};
|
};
|
||||||
// The SendFifo needs to be 32-byte aligned and each element needs
|
// The SendFifo needs to be 32-byte aligned and each element needs
|
||||||
// to be a 32-byte multiple, so that an entry does not get split and
|
// to be a 32-byte multiple, so that an entry does not get split and
|
||||||
@ -571,19 +575,19 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
|
|||||||
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
|
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
|
||||||
memset(handle, 0, sizeof(struct ncclIbHandle));
|
memset(handle, 0, sizeof(struct ncclIbHandle));
|
||||||
comm->dev = dev;
|
comm->dev = dev;
|
||||||
comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
|
handle->magic = NCCL_SOCKET_MAGIC;
|
||||||
NCCLCHECK(GetSocketAddr(&comm->sock.addr));
|
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
|
||||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||||
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
|
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
|
||||||
*listenComm = comm;
|
*listenComm = comm;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
|
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
|
||||||
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
|
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
|
||||||
enum ncclSocketState conState;
|
|
||||||
struct ncclIbCommStage* stage = &handle->stage;
|
struct ncclIbCommStage* stage = &handle->stage;
|
||||||
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
|
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
|
||||||
|
int ready;
|
||||||
*sendComm = NULL;
|
*sendComm = NULL;
|
||||||
|
|
||||||
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
|
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
|
||||||
@ -594,20 +598,15 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
|
NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
|
||||||
NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1));
|
NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
|
||||||
stage->comm = comm;
|
stage->comm = comm;
|
||||||
stage->state = ncclIbCommStateConnect;
|
stage->state = ncclIbCommStateConnect;
|
||||||
NCCLCHECK(ncclSocketConnect(&comm->sock));
|
NCCLCHECK(ncclSocketConnect(&comm->sock));
|
||||||
|
|
||||||
ib_connect_check:
|
ib_connect_check:
|
||||||
/* since ncclSocketConnect is async, we must check if connection is complete */
|
/* since ncclSocketConnect is async, we must check if connection is complete */
|
||||||
NCCLCHECK(ncclGetSocketState(&comm->sock, &conState));
|
NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
|
||||||
if (conState == ncclSocketConnecting) {
|
if (!ready) return ncclSuccess;
|
||||||
/* expect user to call again */
|
|
||||||
return ncclSuccess;
|
|
||||||
} else if (conState == ncclSocketError) {
|
|
||||||
return ncclRemoteError;
|
|
||||||
}
|
|
||||||
|
|
||||||
// IB Setup
|
// IB Setup
|
||||||
struct ibv_context* ctx;
|
struct ibv_context* ctx;
|
||||||
@ -619,6 +618,7 @@ ib_connect_check:
|
|||||||
for (int q=0; q<comm->nqps; q++) {
|
for (int q=0; q<comm->nqps; q++) {
|
||||||
NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
|
NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
|
||||||
}
|
}
|
||||||
|
comm->ar = ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING
|
||||||
|
|
||||||
// Send my QP Info to receiver through the socket. Hope this won't block.
|
// Send my QP Info to receiver through the socket. Hope this won't block.
|
||||||
struct ibv_port_attr portAttr;
|
struct ibv_port_attr portAttr;
|
||||||
@ -670,9 +670,10 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
|||||||
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
|
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
|
||||||
struct ncclIbCommStage* stage = &lComm->stage;
|
struct ncclIbCommStage* stage = &lComm->stage;
|
||||||
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
|
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
|
||||||
|
int ready;
|
||||||
*recvComm = NULL;
|
*recvComm = NULL;
|
||||||
|
|
||||||
if (stage->state == ncclIbCommStateAccept) goto ib_accept;
|
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
|
||||||
if (stage->state == ncclIbCommStateRecv) goto ib_recv;
|
if (stage->state == ncclIbCommStateRecv) goto ib_recv;
|
||||||
if (stage->state == ncclIbCommStateSend) goto ib_send;
|
if (stage->state == ncclIbCommStateSend) goto ib_send;
|
||||||
if (stage->state != ncclIbCommStateStart) {
|
if (stage->state != ncclIbCommStateStart) {
|
||||||
@ -683,12 +684,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
|||||||
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
|
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
|
||||||
stage->comm = rComm;
|
stage->comm = rComm;
|
||||||
stage->state = ncclIbCommStateAccept;
|
stage->state = ncclIbCommStateAccept;
|
||||||
NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, lComm->sock.abortFlag, 1));
|
NCCLCHECK(ncclSocketInit(&rComm->sock));
|
||||||
|
|
||||||
ib_accept:
|
|
||||||
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
|
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
|
||||||
if (rComm->sock.fd == -1)
|
|
||||||
return ncclSuccess;
|
ib_accept_check:
|
||||||
|
NCCLCHECK(ncclSocketReady(&rComm->sock, &ready));
|
||||||
|
if (!ready) return ncclSuccess;
|
||||||
|
|
||||||
struct ncclIbQpInfo remQpInfo;
|
struct ncclIbQpInfo remQpInfo;
|
||||||
stage->state = ncclIbCommStateRecv;
|
stage->state = ncclIbCommStateRecv;
|
||||||
@ -791,7 +792,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
|
|||||||
if (r->type == NCCL_NET_IB_REQ_UNUSED) {
|
if (r->type == NCCL_NET_IB_REQ_UNUSED) {
|
||||||
r->verbs = verbs;
|
r->verbs = verbs;
|
||||||
r->events = 1;
|
r->events = 1;
|
||||||
r->addr = NULL;
|
r->sock = NULL;
|
||||||
*req = r;
|
*req = r;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -966,8 +967,8 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
|
struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
|
||||||
if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) {
|
if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
|
||||||
// When using adaptive routing, send the bulk of the data first as an
|
// When using ADAPTIVE_ROUTING, send the bulk of the data first as an
|
||||||
// RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
|
// RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
|
||||||
// completion.
|
// completion.
|
||||||
lastWr++;
|
lastWr++;
|
||||||
@ -1033,28 +1034,31 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
|
|||||||
|
|
||||||
// Sanity checks to catch user collective call count/size mismatches
|
// Sanity checks to catch user collective call count/size mismatches
|
||||||
if (size > slots[r].size) {
|
if (size > slots[r].size) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN + 1];
|
||||||
|
union ncclSocketAddress addr;
|
||||||
|
ncclSocketGetAddr(&comm->sock, &addr);
|
||||||
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
|
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
|
||||||
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size);
|
r, nreqs, tag, ncclSocketToString(&addr, line), size, slots[r].size);
|
||||||
return ncclInvalidUsage;
|
return ncclInvalidUsage;
|
||||||
} // plus any potential programming errors
|
} // plus any potential programming errors
|
||||||
else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
|
else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN + 1];
|
||||||
|
union ncclSocketAddress addr;
|
||||||
|
ncclSocketGetAddr(&comm->sock, &addr);
|
||||||
WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
|
WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
|
||||||
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
|
r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
struct ncclIbRequest* req;
|
struct ncclIbRequest* req;
|
||||||
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
||||||
req->type = NCCL_NET_IB_REQ_SEND;
|
req->type = NCCL_NET_IB_REQ_SEND;
|
||||||
req->addr = &comm->sock.addr;
|
req->sock = &comm->sock;
|
||||||
req->verbs = &comm->verbs;
|
req->verbs = &comm->verbs;
|
||||||
req->nreqs = nreqs;
|
req->nreqs = nreqs;
|
||||||
req->send.size = size;
|
req->send.size = size;
|
||||||
req->send.data = data;
|
req->send.data = data;
|
||||||
req->send.lkey = mr->lkey;
|
req->send.lkey = mr->lkey;
|
||||||
req->send.offset = 0;
|
req->send.offset = 0;
|
||||||
req->addr = &comm->sock.addr;
|
|
||||||
req->events = comm->nqps;
|
req->events = comm->nqps;
|
||||||
*request = reqs[r] = req;
|
*request = reqs[r] = req;
|
||||||
|
|
||||||
@ -1147,7 +1151,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
|
|||||||
struct ncclIbRequest* req;
|
struct ncclIbRequest* req;
|
||||||
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
||||||
req->type = NCCL_NET_IB_REQ_RECV;
|
req->type = NCCL_NET_IB_REQ_RECV;
|
||||||
req->addr = &comm->sock.addr;
|
req->sock = &comm->sock;
|
||||||
req->nreqs = n;
|
req->nreqs = n;
|
||||||
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
|
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
|
||||||
|
|
||||||
@ -1186,7 +1190,7 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
|
|||||||
struct ncclIbRequest* req;
|
struct ncclIbRequest* req;
|
||||||
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
||||||
req->type = NCCL_NET_IB_REQ_FLUSH;
|
req->type = NCCL_NET_IB_REQ_FLUSH;
|
||||||
req->addr = &comm->sock.addr;
|
req->sock = &comm->sock;
|
||||||
struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
|
struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
|
||||||
|
|
||||||
struct ibv_send_wr wr;
|
struct ibv_send_wr wr;
|
||||||
@ -1234,8 +1238,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
|||||||
struct ibv_wc *wc = wcs+w;
|
struct ibv_wc *wc = wcs+w;
|
||||||
if (wc->status != IBV_WC_SUCCESS) {
|
if (wc->status != IBV_WC_SUCCESS) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
|
union ncclSocketAddress addr;
|
||||||
|
ncclSocketGetAddr(r->sock, &addr);
|
||||||
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
|
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
|
||||||
ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
|
ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
|
||||||
return ncclRemoteError;
|
return ncclRemoteError;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1267,7 +1273,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
|||||||
ncclResult_t ncclIbCloseSend(void* sendComm) {
|
ncclResult_t ncclIbCloseSend(void* sendComm) {
|
||||||
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
|
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
|
||||||
if (comm) {
|
if (comm) {
|
||||||
close(comm->sock.fd);
|
NCCLCHECK(ncclSocketClose(&comm->sock));
|
||||||
for (int q=0; q<comm->nqps; q++)
|
for (int q=0; q<comm->nqps; q++)
|
||||||
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
|
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
|
||||||
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
|
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
|
||||||
@ -1281,7 +1287,7 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
|
|||||||
ncclResult_t ncclIbCloseRecv(void* recvComm) {
|
ncclResult_t ncclIbCloseRecv(void* recvComm) {
|
||||||
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
|
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
|
||||||
if (comm) {
|
if (comm) {
|
||||||
close(comm->sock.fd);
|
NCCLCHECK(ncclSocketClose(&comm->sock));
|
||||||
for (int q=0; q<comm->nqps; q++)
|
for (int q=0; q<comm->nqps; q++)
|
||||||
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
|
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
|
||||||
if (comm->gpuFlush.enabled) {
|
if (comm->gpuFlush.enabled) {
|
||||||
@ -1298,7 +1304,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
|
|||||||
ncclResult_t ncclIbCloseListen(void* listenComm) {
|
ncclResult_t ncclIbCloseListen(void* listenComm) {
|
||||||
struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
|
struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
|
||||||
if (comm) {
|
if (comm) {
|
||||||
close(comm->sock.fd);
|
NCCLCHECK(ncclSocketClose(&comm->sock));
|
||||||
free(comm);
|
free(comm);
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
@ -18,16 +18,16 @@
|
|||||||
|
|
||||||
/* Init functions */
|
/* Init functions */
|
||||||
static int ncclNetIfs = -1;
|
static int ncclNetIfs = -1;
|
||||||
struct ncclSocketDev {
|
struct ncclNetSocketDev {
|
||||||
union ncclSocketAddress addr;
|
union ncclSocketAddress addr;
|
||||||
char devName[MAX_IF_NAME_SIZE];
|
char devName[MAX_IF_NAME_SIZE];
|
||||||
char* pciPath;
|
char* pciPath;
|
||||||
};
|
};
|
||||||
static struct ncclSocketDev ncclSocketDevs[MAX_IFS];
|
static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS];
|
||||||
|
|
||||||
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
|
pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
|
static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) {
|
||||||
char devicePath[PATH_MAX];
|
char devicePath[PATH_MAX];
|
||||||
snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
|
snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
|
||||||
// May return NULL if the file doesn't exist.
|
// May return NULL if the file doesn't exist.
|
||||||
@ -35,9 +35,9 @@ static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
|
ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
|
||||||
if (ncclNetIfs == -1) {
|
if (ncclNetIfs == -1) {
|
||||||
pthread_mutex_lock(&ncclSocketLock);
|
pthread_mutex_lock(&ncclNetSocketLock);
|
||||||
if (ncclNetIfs == -1) {
|
if (ncclNetIfs == -1) {
|
||||||
char names[MAX_IF_NAME_SIZE*MAX_IFS];
|
char names[MAX_IF_NAME_SIZE*MAX_IFS];
|
||||||
union ncclSocketAddress addrs[MAX_IFS];
|
union ncclSocketAddress addrs[MAX_IFS];
|
||||||
@ -52,9 +52,9 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
|
|||||||
line[0] = '\0';
|
line[0] = '\0';
|
||||||
addrline[SOCKET_NAME_MAXLEN] = '\0';
|
addrline[SOCKET_NAME_MAXLEN] = '\0';
|
||||||
for (int i=0; i<ncclNetIfs; i++) {
|
for (int i=0; i<ncclNetIfs; i++) {
|
||||||
strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
|
strcpy(ncclNetSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
|
||||||
memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
|
memcpy(&ncclNetSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
|
||||||
NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
|
NCCLCHECK(ncclNetSocketGetPciPath(ncclNetSocketDevs[i].devName, &ncclNetSocketDevs[i].pciPath));
|
||||||
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
|
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
|
||||||
ncclSocketToString(&addrs[i], addrline));
|
ncclSocketToString(&addrs[i], addrline));
|
||||||
}
|
}
|
||||||
@ -62,17 +62,17 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
|
|||||||
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
|
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pthread_mutex_unlock(&ncclSocketLock);
|
pthread_mutex_unlock(&ncclNetSocketLock);
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketDevices(int* ndev) {
|
ncclResult_t ncclNetSocketDevices(int* ndev) {
|
||||||
*ndev = ncclNetIfs;
|
*ndev = ncclNetIfs;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
|
static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
|
||||||
*speed = 0;
|
*speed = 0;
|
||||||
char speedPath[PATH_MAX];
|
char speedPath[PATH_MAX];
|
||||||
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
|
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
|
||||||
@ -91,12 +91,12 @@ static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
|
ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
|
||||||
props->name = ncclSocketDevs[dev].devName;
|
props->name = ncclNetSocketDevs[dev].devName;
|
||||||
props->pciPath = ncclSocketDevs[dev].pciPath;
|
props->pciPath = ncclNetSocketDevs[dev].pciPath;
|
||||||
props->guid = dev;
|
props->guid = dev;
|
||||||
props->ptrSupport = NCCL_PTR_HOST;
|
props->ptrSupport = NCCL_PTR_HOST;
|
||||||
NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
|
NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
|
||||||
props->latency = 0; // Not set
|
props->latency = 0; // Not set
|
||||||
props->port = 0;
|
props->port = 0;
|
||||||
props->maxComms = 65536;
|
props->maxComms = 65536;
|
||||||
@ -104,12 +104,6 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
|
|
||||||
if (dev >= ncclNetIfs) return ncclInternalError;
|
|
||||||
memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Communication functions */
|
/* Communication functions */
|
||||||
|
|
||||||
#define MAX_SOCKETS 64
|
#define MAX_SOCKETS 64
|
||||||
@ -120,29 +114,30 @@ ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
|
|||||||
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
|
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
|
||||||
NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
|
NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
|
||||||
|
|
||||||
enum ncclSocketCommState {
|
enum ncclNetSocketCommState {
|
||||||
ncclSocketCommStateStart = 0,
|
ncclNetSocketCommStateStart = 0,
|
||||||
ncclSocketCommStateConnect = 1,
|
ncclNetSocketCommStateConnect = 1,
|
||||||
ncclSocketCommStateAccept = 3,
|
ncclNetSocketCommStateAccept = 3,
|
||||||
ncclSocketCommStateSend = 4,
|
ncclNetSocketCommStateSend = 4,
|
||||||
ncclSocketCommStateRecv = 5,
|
ncclNetSocketCommStateRecv = 5,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketCommStage {
|
struct ncclNetSocketCommStage {
|
||||||
enum ncclSocketCommState state;
|
enum ncclNetSocketCommState state;
|
||||||
uint8_t iteration;
|
uint8_t iteration;
|
||||||
struct ncclSocket* sock;
|
struct ncclSocket* sock;
|
||||||
struct ncclSocketComm* comm;
|
struct ncclNetSocketComm* comm;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketHandle {
|
struct ncclNetSocketHandle {
|
||||||
union ncclSocketAddress connectAddr;
|
union ncclSocketAddress connectAddr;
|
||||||
|
uint64_t magic; // random number to help debugging
|
||||||
int nSocks;
|
int nSocks;
|
||||||
int nThreads;
|
int nThreads;
|
||||||
struct ncclSocketCommStage stage;
|
struct ncclNetSocketCommStage stage;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketTask {
|
struct ncclNetSocketTask {
|
||||||
int op;
|
int op;
|
||||||
void* data;
|
void* data;
|
||||||
int size;
|
int size;
|
||||||
@ -152,41 +147,41 @@ struct ncclSocketTask {
|
|||||||
ncclResult_t result;
|
ncclResult_t result;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketRequest {
|
struct ncclNetSocketRequest {
|
||||||
int op;
|
int op;
|
||||||
void* data;
|
void* data;
|
||||||
int size;
|
int size;
|
||||||
struct ncclSocket* ctrlSock;
|
struct ncclSocket* ctrlSock;
|
||||||
int offset;
|
int offset;
|
||||||
int used;
|
int used;
|
||||||
struct ncclSocketComm* comm;
|
struct ncclNetSocketComm* comm;
|
||||||
struct ncclSocketTask* tasks[MAX_SOCKETS];
|
struct ncclNetSocketTask* tasks[MAX_SOCKETS];
|
||||||
int nSubs;
|
int nSubs;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketTaskQueue {
|
struct ncclNetSocketTaskQueue {
|
||||||
int next;
|
int next;
|
||||||
int len;
|
int len;
|
||||||
struct ncclSocketTask* tasks;
|
struct ncclNetSocketTask* tasks;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketThreadResources {
|
struct ncclNetSocketThreadResources {
|
||||||
struct ncclSocketTaskQueue threadTaskQueue;
|
struct ncclNetSocketTaskQueue threadTaskQueue;
|
||||||
int stop;
|
int stop;
|
||||||
struct ncclSocketComm* comm;
|
struct ncclNetSocketComm* comm;
|
||||||
pthread_mutex_t threadLock;
|
pthread_mutex_t threadLock;
|
||||||
pthread_cond_t threadCond;
|
pthread_cond_t threadCond;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketListenComm {
|
struct ncclNetSocketListenComm {
|
||||||
struct ncclSocket sock;
|
struct ncclSocket sock;
|
||||||
struct ncclSocketCommStage stage;
|
struct ncclNetSocketCommStage stage;
|
||||||
int nSocks;
|
int nSocks;
|
||||||
int nThreads;
|
int nThreads;
|
||||||
int dev;
|
int dev;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ncclSocketComm {
|
struct ncclNetSocketComm {
|
||||||
struct ncclSocket ctrlSock;
|
struct ncclSocket ctrlSock;
|
||||||
struct ncclSocket socks[MAX_SOCKETS];
|
struct ncclSocket socks[MAX_SOCKETS];
|
||||||
int dev;
|
int dev;
|
||||||
@ -194,15 +189,15 @@ struct ncclSocketComm {
|
|||||||
int nSocks;
|
int nSocks;
|
||||||
int nThreads;
|
int nThreads;
|
||||||
int nextSock;
|
int nextSock;
|
||||||
struct ncclSocketRequest requests[MAX_REQUESTS];
|
struct ncclNetSocketRequest requests[MAX_REQUESTS];
|
||||||
pthread_t helperThread[MAX_THREADS];
|
pthread_t helperThread[MAX_THREADS];
|
||||||
struct ncclSocketThreadResources threadResources[MAX_THREADS];
|
struct ncclNetSocketThreadResources threadResources[MAX_THREADS];
|
||||||
};
|
};
|
||||||
|
|
||||||
void* persistentSocketThread(void *args_) {
|
void* persistentSocketThread(void *args_) {
|
||||||
struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
|
struct ncclNetSocketThreadResources* resource = (struct ncclNetSocketThreadResources*)args_;
|
||||||
struct ncclSocketComm* comm = resource->comm;
|
struct ncclNetSocketComm* comm = resource->comm;
|
||||||
struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
|
struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
|
||||||
int nSocksPerThread = comm->nSocks / comm->nThreads;
|
int nSocksPerThread = comm->nSocks / comm->nThreads;
|
||||||
while (1) {
|
while (1) {
|
||||||
int idle = 1;
|
int idle = 1;
|
||||||
@ -212,7 +207,7 @@ void* persistentSocketThread(void *args_) {
|
|||||||
do {
|
do {
|
||||||
repeat = 0;
|
repeat = 0;
|
||||||
for (int j=0; j<nSocksPerThread; j++) {
|
for (int j=0; j<nSocksPerThread; j++) {
|
||||||
struct ncclSocketTask* r = myQueue->tasks+i+j;
|
struct ncclNetSocketTask* r = myQueue->tasks+i+j;
|
||||||
if (r != NULL && r->used == 1 && r->offset < r->size) {
|
if (r != NULL && r->used == 1 && r->offset < r->size) {
|
||||||
r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
|
r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
|
||||||
if (r->result != ncclSuccess) {
|
if (r->result != ncclSuccess) {
|
||||||
@ -236,7 +231,7 @@ void* persistentSocketThread(void *args_) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
||||||
int nSocksPerThread = ncclParamSocketNsocksPerThread();
|
int nSocksPerThread = ncclParamSocketNsocksPerThread();
|
||||||
int nThreads = ncclParamSocketNthreads();
|
int nThreads = ncclParamSocketNthreads();
|
||||||
if (nThreads > MAX_THREADS) {
|
if (nThreads > MAX_THREADS) {
|
||||||
@ -247,7 +242,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
|||||||
// Auto-detection
|
// Auto-detection
|
||||||
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
|
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
|
||||||
char vendorPath[PATH_MAX];
|
char vendorPath[PATH_MAX];
|
||||||
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName);
|
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
|
||||||
char* rPath = realpath(vendorPath, NULL);
|
char* rPath = realpath(vendorPath, NULL);
|
||||||
int fd = open(rPath, O_RDONLY);
|
int fd = open(rPath, O_RDONLY);
|
||||||
free(rPath);
|
free(rPath);
|
||||||
@ -285,36 +280,20 @@ end:
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
|
ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||||
NCCLCHECK(ncclCalloc(comm, 1));
|
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
|
||||||
(*comm)->sock.fd = -1;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
|
|
||||||
NCCLCHECK(ncclCalloc(comm, 1));
|
|
||||||
(*comm)->ctrlSock.fd = -1;
|
|
||||||
for (int i=0; i < MAX_SOCKETS; i++) {
|
|
||||||
(*comm)->socks[i].fd = -1;
|
|
||||||
}
|
|
||||||
(*comm)->nextSock = 0;
|
|
||||||
return ncclSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
|
||||||
if (dev < 0) { // data transfer socket is based on specified dev
|
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
|
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
|
||||||
memset(handle, 0, sizeof(struct ncclSocketHandle));
|
memset(handle, 0, sizeof(struct ncclNetSocketHandle));
|
||||||
static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
|
static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
|
||||||
struct ncclSocketListenComm* comm;
|
struct ncclNetSocketListenComm* comm;
|
||||||
NCCLCHECK(ncclSocketNewListenComm(&comm));
|
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||||
NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
|
handle->magic = NCCL_SOCKET_MAGIC;
|
||||||
comm->sock.asyncFlag = 1;
|
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
|
||||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||||
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
|
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
|
||||||
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
|
NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
|
||||||
handle->nSocks = comm->nSocks;
|
handle->nSocks = comm->nSocks;
|
||||||
handle->nThreads = comm->nThreads;
|
handle->nThreads = comm->nThreads;
|
||||||
comm->dev = dev;
|
comm->dev = dev;
|
||||||
@ -322,46 +301,41 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
|
ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
|
||||||
if (dev < 0) { // data transfer socket is based on specified dev
|
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ncclSocketState conState;
|
int ready;
|
||||||
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
|
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
|
||||||
struct ncclSocketCommStage* stage = &handle->stage;
|
struct ncclNetSocketCommStage* stage = &handle->stage;
|
||||||
struct ncclSocketComm* comm = stage->comm;
|
struct ncclNetSocketComm* comm = stage->comm;
|
||||||
uint8_t i = stage->iteration;
|
uint8_t i = stage->iteration;
|
||||||
struct ncclSocket* sock = stage->sock;
|
struct ncclSocket* sock = stage->sock;
|
||||||
*sendComm = NULL;
|
*sendComm = NULL;
|
||||||
|
|
||||||
if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check;
|
if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check;
|
||||||
if (stage->state == ncclSocketCommStateSend) goto socket_send;
|
if (stage->state == ncclNetSocketCommStateSend) goto socket_send;
|
||||||
|
|
||||||
NCCLCHECK(ncclSocketNewComm(&comm));
|
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||||
stage->comm = comm;
|
stage->comm = comm;
|
||||||
comm->nSocks = handle->nSocks;
|
comm->nSocks = handle->nSocks;
|
||||||
comm->nThreads = handle->nThreads;
|
comm->nThreads = handle->nThreads;
|
||||||
comm->dev = dev;
|
comm->dev = dev;
|
||||||
CUDACHECK(cudaGetDevice(&comm->cudaDev));
|
CUDACHECK(cudaGetDevice(&comm->cudaDev));
|
||||||
for (; i<comm->nSocks+1; i++) {
|
for (; i<comm->nSocks+1; i++) {
|
||||||
sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i;
|
sock = (i == comm->nSocks) ? &comm->ctrlSock : comm->socks+i;
|
||||||
NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1));
|
NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
|
||||||
|
|
||||||
stage->sock = sock;
|
stage->sock = sock;
|
||||||
stage->state = ncclSocketCommStateConnect;
|
stage->state = ncclNetSocketCommStateConnect;
|
||||||
stage->iteration = i;
|
stage->iteration = i;
|
||||||
NCCLCHECK(ncclSocketConnect(sock));
|
NCCLCHECK(ncclSocketConnect(sock));
|
||||||
|
|
||||||
socket_connect_check:
|
socket_connect_check:
|
||||||
NCCLCHECK(ncclGetSocketState(sock, &conState));
|
NCCLCHECK(ncclSocketReady(sock, &ready));
|
||||||
if (conState == ncclSocketConnecting) {
|
if (! ready) return ncclSuccess;
|
||||||
/* expect user to call again */
|
stage->state = ncclNetSocketCommStateSend;
|
||||||
return ncclSuccess;
|
|
||||||
} else if (conState == ncclSocketError) {
|
|
||||||
return ncclRemoteError;
|
|
||||||
}
|
|
||||||
stage->state = ncclSocketCommStateSend;
|
|
||||||
|
|
||||||
socket_send:
|
socket_send:
|
||||||
int done = 0;
|
int done = 0;
|
||||||
@ -372,59 +346,63 @@ socket_send:
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
|
ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm) {
|
||||||
struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
|
struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm;
|
||||||
struct ncclSocketCommStage* stage = &lComm->stage;
|
struct ncclNetSocketCommStage* stage = &lComm->stage;
|
||||||
struct ncclSocketComm* rComm = stage->comm;
|
struct ncclNetSocketComm* rComm = stage->comm;
|
||||||
uint8_t i = stage->iteration;
|
uint8_t i = stage->iteration;
|
||||||
struct ncclSocket* sock = stage->sock;
|
struct ncclSocket* sock = stage->sock;
|
||||||
|
int ready;
|
||||||
|
|
||||||
*recvComm = NULL;
|
*recvComm = NULL;
|
||||||
if (stage->state == ncclSocketCommStateAccept) goto socket_accept;
|
if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check;
|
||||||
if (stage->state == ncclSocketCommStateRecv) goto socket_recv;
|
if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv;
|
||||||
|
|
||||||
NCCLCHECK(ncclSocketNewComm(&rComm));
|
NCCLCHECK(ncclCalloc(&rComm, 1));
|
||||||
stage->comm = rComm;
|
stage->comm = rComm;
|
||||||
rComm->nSocks = lComm->nSocks;
|
rComm->nSocks = lComm->nSocks;
|
||||||
rComm->nThreads = lComm->nThreads;
|
rComm->nThreads = lComm->nThreads;
|
||||||
rComm->dev = lComm->dev;
|
rComm->dev = lComm->dev;
|
||||||
CUDACHECK(cudaGetDevice(&rComm->cudaDev));
|
CUDACHECK(cudaGetDevice(&rComm->cudaDev));
|
||||||
lComm->sock.asyncFlag = 1;
|
|
||||||
for (; i<rComm->nSocks+1; i++) {
|
for (; i<rComm->nSocks+1; i++) {
|
||||||
uint8_t sendSockIdx;
|
uint8_t sendSockIdx;
|
||||||
ncclCalloc(&sock, 1);
|
|
||||||
NCCLCHECK(ncclSocketInit(sock, NULL, lComm->sock.abortFlag, 1));
|
|
||||||
stage->sock = sock;
|
|
||||||
stage->state = ncclSocketCommStateAccept;
|
|
||||||
stage->iteration = i;
|
|
||||||
socket_accept:
|
|
||||||
NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
|
|
||||||
if (sock->fd == -1) return ncclSuccess;
|
|
||||||
|
|
||||||
stage->state = ncclSocketCommStateRecv;
|
NCCLCHECK(ncclCalloc(&sock, 1));
|
||||||
|
NCCLCHECK(ncclSocketInit(sock));
|
||||||
|
stage->sock = sock;
|
||||||
|
stage->state = ncclNetSocketCommStateAccept;
|
||||||
|
stage->iteration = i;
|
||||||
|
NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
|
||||||
|
|
||||||
|
socket_accept_check:
|
||||||
|
NCCLCHECK(ncclSocketReady(sock, &ready));
|
||||||
|
if (!ready) return ncclSuccess;
|
||||||
|
|
||||||
|
stage->state = ncclNetSocketCommStateRecv;
|
||||||
socket_recv:
|
socket_recv:
|
||||||
int done = 0;
|
int done = 0;
|
||||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
|
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
|
||||||
if (done == 0) return ncclSuccess;
|
if (done == 0) return ncclSuccess;
|
||||||
|
|
||||||
if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
|
if (sendSockIdx == rComm->nSocks)
|
||||||
else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
|
memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
|
||||||
|
else
|
||||||
|
memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
|
||||||
free(sock);
|
free(sock);
|
||||||
}
|
}
|
||||||
*recvComm = rComm;
|
*recvComm = rComm;
|
||||||
|
|
||||||
/* reset lComm state */
|
/* reset lComm state */
|
||||||
stage->state = ncclSocketCommStateStart;
|
stage->state = ncclNetSocketCommStateStart;
|
||||||
stage->iteration = 0;
|
stage->iteration = 0;
|
||||||
stage->sock = NULL;
|
stage->sock = NULL;
|
||||||
stage->comm = NULL;
|
stage->comm = NULL;
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) {
|
ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketRequest** req) {
|
||||||
for (int i=0; i<MAX_REQUESTS; i++) {
|
for (int i=0; i<MAX_REQUESTS; i++) {
|
||||||
struct ncclSocketRequest* r = comm->requests+i;
|
struct ncclNetSocketRequest* r = comm->requests+i;
|
||||||
if (r->used == 0) {
|
if (r->used == 0) {
|
||||||
r->op = op;
|
r->op = op;
|
||||||
r->data = data;
|
r->data = data;
|
||||||
@ -441,10 +419,10 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
|
|||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
|
ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) {
|
||||||
int tid = comm->nextSock % comm->nThreads;
|
int tid = comm->nextSock % comm->nThreads;
|
||||||
struct ncclSocketThreadResources* res = comm->threadResources+tid;
|
struct ncclNetSocketThreadResources* res = comm->threadResources+tid;
|
||||||
struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
|
struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue;
|
||||||
// create helper threads and prepare per-thread task queue
|
// create helper threads and prepare per-thread task queue
|
||||||
if (queue->tasks == NULL) {
|
if (queue->tasks == NULL) {
|
||||||
// each request can be divided up to nSocks tasks, and
|
// each request can be divided up to nSocks tasks, and
|
||||||
@ -459,12 +437,12 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
|
|||||||
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
|
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
|
||||||
ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
|
ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
|
||||||
}
|
}
|
||||||
struct ncclSocketTask* r = queue->tasks+queue->next;
|
struct ncclNetSocketTask* r = queue->tasks+queue->next;
|
||||||
if (r->used == 0) {
|
if (r->used == 0) {
|
||||||
r->op = op;
|
r->op = op;
|
||||||
r->data = data;
|
r->data = data;
|
||||||
r->size = size;
|
r->size = size;
|
||||||
r->sock = comm->socks+comm->nextSock;
|
r->sock = comm->socks + comm->nextSock;
|
||||||
r->offset = 0;
|
r->offset = 0;
|
||||||
r->result = ncclSuccess;
|
r->result = ncclSuccess;
|
||||||
comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
|
comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
|
||||||
@ -480,9 +458,9 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
|
|||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
|
||||||
*done = 0;
|
*done = 0;
|
||||||
struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
|
struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request;
|
||||||
if (r == NULL) {
|
if (r == NULL) {
|
||||||
WARN("NET/Socket : test called with NULL request");
|
WARN("NET/Socket : test called with NULL request");
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
@ -500,9 +478,11 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
// Check size is less or equal to the size provided by the user
|
// Check size is less or equal to the size provided by the user
|
||||||
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
||||||
char line[SOCKET_NAME_MAXLEN+1];
|
char line[SOCKET_NAME_MAXLEN+1];
|
||||||
|
union ncclSocketAddress addr;
|
||||||
|
ncclSocketGetAddr(r->ctrlSock, &addr);
|
||||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
|
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
|
||||||
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
|
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
|
||||||
ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
|
ncclSocketToString(&addr, line), data, r->size);
|
||||||
return ncclInvalidUsage;
|
return ncclInvalidUsage;
|
||||||
}
|
}
|
||||||
r->size = data;
|
r->size = data;
|
||||||
@ -515,7 +495,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
|
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
|
||||||
while (chunkOffset < r->size) {
|
while (chunkOffset < r->size) {
|
||||||
int chunkSize = std::min(taskSize, r->size-chunkOffset);
|
int chunkSize = std::min(taskSize, r->size-chunkOffset);
|
||||||
NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
|
NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
|
||||||
chunkOffset += chunkSize;
|
chunkOffset += chunkSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -525,7 +505,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
if (r->nSubs > 0) {
|
if (r->nSubs > 0) {
|
||||||
int nCompleted = 0;
|
int nCompleted = 0;
|
||||||
for (int i=0; i<r->nSubs; i++) {
|
for (int i=0; i<r->nSubs; i++) {
|
||||||
struct ncclSocketTask* sub = r->tasks[i];
|
struct ncclNetSocketTask* sub = r->tasks[i];
|
||||||
if (sub->result != ncclSuccess) return sub->result;
|
if (sub->result != ncclSuccess) return sub->result;
|
||||||
if (sub->offset == sub->size) nCompleted++;
|
if (sub->offset == sub->size) nCompleted++;
|
||||||
}
|
}
|
||||||
@ -534,7 +514,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
*done = 1;
|
*done = 1;
|
||||||
r->used = 0;
|
r->used = 0;
|
||||||
for (int i=0; i<r->nSubs; i++) {
|
for (int i=0; i<r->nSubs; i++) {
|
||||||
struct ncclSocketTask* sub = r->tasks[i];
|
struct ncclNetSocketTask* sub = r->tasks[i];
|
||||||
sub->used = 0;
|
sub->used = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -552,43 +532,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||||
return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
|
return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
|
||||||
}
|
}
|
||||||
ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
|
ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
|
||||||
|
|
||||||
ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
||||||
struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
|
struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
|
||||||
NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
|
NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
||||||
struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
|
struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
|
||||||
if (n != 1) return ncclInternalError;
|
if (n != 1) return ncclInternalError;
|
||||||
NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request));
|
NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request));
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
||||||
// We don't support CUDA pointers, so we don't need a flush operation
|
// We don't support CUDA pointers, so we don't need a flush operation
|
||||||
return ncclInternalError;
|
return ncclInternalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
|
ncclResult_t ncclNetSocketCloseListen(void* opaqueComm) {
|
||||||
struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
|
struct ncclNetSocketListenComm* comm = (struct ncclNetSocketListenComm*)opaqueComm;
|
||||||
if (comm) {
|
if (comm) {
|
||||||
if (comm->sock.fd != -1) close(comm->sock.fd);
|
int ready;
|
||||||
|
NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
|
||||||
|
if (ready) NCCLCHECK(ncclSocketClose(&comm->sock));
|
||||||
free(comm);
|
free(comm);
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclResult_t ncclSocketClose(void* opaqueComm) {
|
ncclResult_t ncclNetSocketClose(void* opaqueComm) {
|
||||||
struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
|
struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)opaqueComm;
|
||||||
if (comm) {
|
if (comm) {
|
||||||
for (int i=0; i<comm->nThreads; i++) {
|
for (int i=0; i<comm->nThreads; i++) {
|
||||||
struct ncclSocketThreadResources* res = comm->threadResources+i;
|
struct ncclNetSocketThreadResources* res = comm->threadResources+i;
|
||||||
if (comm->helperThread[i]) {
|
if (comm->helperThread[i]) {
|
||||||
pthread_mutex_lock(&res->threadLock);
|
pthread_mutex_lock(&res->threadLock);
|
||||||
res->stop = 1;
|
res->stop = 1;
|
||||||
@ -598,9 +580,12 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
|
|||||||
}
|
}
|
||||||
free(res->threadTaskQueue.tasks);
|
free(res->threadTaskQueue.tasks);
|
||||||
}
|
}
|
||||||
if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd);
|
int ready;
|
||||||
|
NCCLCHECK(ncclSocketReady(&comm->ctrlSock, &ready));
|
||||||
|
if (ready) NCCLCHECK(ncclSocketClose(&comm->ctrlSock));
|
||||||
for (int i=0; i<comm->nSocks; i++) {
|
for (int i=0; i<comm->nSocks; i++) {
|
||||||
if (comm->socks[i].fd != -1) close(comm->socks[i].fd);
|
NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready));
|
||||||
|
if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i]));
|
||||||
}
|
}
|
||||||
free(comm);
|
free(comm);
|
||||||
}
|
}
|
||||||
@ -609,20 +594,20 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
|
|||||||
|
|
||||||
ncclNet_t ncclNetSocket = {
|
ncclNet_t ncclNetSocket = {
|
||||||
"Socket",
|
"Socket",
|
||||||
ncclSocketInit,
|
ncclNetSocketInit,
|
||||||
ncclSocketDevices,
|
ncclNetSocketDevices,
|
||||||
ncclSocketGetProperties,
|
ncclNetSocketGetProperties,
|
||||||
ncclSocketListen,
|
ncclNetSocketListen,
|
||||||
ncclSocketConnect,
|
ncclNetSocketConnect,
|
||||||
ncclSocketAccept,
|
ncclNetSocketAccept,
|
||||||
ncclSocketRegMr,
|
ncclNetSocketRegMr,
|
||||||
NULL, // No DMA-BUF support
|
NULL, // No DMA-BUF support
|
||||||
ncclSocketDeregMr,
|
ncclNetSocketDeregMr,
|
||||||
ncclSocketIsend,
|
ncclNetSocketIsend,
|
||||||
ncclSocketIrecv,
|
ncclNetSocketIrecv,
|
||||||
ncclSocketIflush,
|
ncclNetSocketIflush,
|
||||||
ncclSocketTest,
|
ncclNetSocketTest,
|
||||||
ncclSocketClose,
|
ncclNetSocketClose,
|
||||||
ncclSocketClose,
|
ncclNetSocketClose,
|
||||||
ncclSocketCloseListen
|
ncclNetSocketCloseListen
|
||||||
};
|
};
|
||||||
|
@ -34,6 +34,7 @@ struct p2pProxyInfo {
|
|||||||
struct p2pShm* devShm;
|
struct p2pShm* devShm;
|
||||||
char shmName[7];
|
char shmName[7];
|
||||||
int shmSize;
|
int shmSize;
|
||||||
|
ncclShmHandle_t handle;
|
||||||
|
|
||||||
// Intermediate step for sender
|
// Intermediate step for sender
|
||||||
struct ncclRecvMem* ceRecvMem;
|
struct ncclRecvMem* ceRecvMem;
|
||||||
@ -63,6 +64,7 @@ struct p2pRecvResources {
|
|||||||
struct p2pShm* shm;
|
struct p2pShm* shm;
|
||||||
struct p2pShm* devShm;
|
struct p2pShm* devShm;
|
||||||
int shmSize;
|
int shmSize;
|
||||||
|
ncclShmHandle_t handle;
|
||||||
};
|
};
|
||||||
|
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
@ -351,9 +353,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
|||||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||||
resources->shmSize = info->shmSize;
|
resources->shmSize = info->shmSize;
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
|
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
|
||||||
// Remove the file to ensure proper clean-up
|
|
||||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
|
||||||
|
|
||||||
recv->conn.tail = &resources->devShm->recvMem.tail;
|
recv->conn.tail = &resources->devShm->recvMem.tail;
|
||||||
recv->conn.head = &resources->devShm->sendMem.head;
|
recv->conn.head = &resources->devShm->sendMem.head;
|
||||||
@ -396,7 +396,7 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
|
|||||||
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
|
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
|
||||||
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
|
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
|
||||||
if (useMemcpy) {
|
if (useMemcpy) {
|
||||||
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
|
NCCLCHECK(ncclShmClose(resources->handle));
|
||||||
}
|
}
|
||||||
free(resources);
|
free(resources);
|
||||||
}
|
}
|
||||||
@ -414,7 +414,7 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
|
|||||||
char shmPath[PATH_MAX];
|
char shmPath[PATH_MAX];
|
||||||
shmPath[0] = '\0';
|
shmPath[0] = '\0';
|
||||||
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
|
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
|
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
|
||||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
|
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
|
||||||
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
|
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
|
||||||
|
|
||||||
@ -477,7 +477,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
|
|||||||
if (useMemcpy) {
|
if (useMemcpy) {
|
||||||
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
|
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
|
||||||
if (proxyInfo) {
|
if (proxyInfo) {
|
||||||
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
|
NCCLCHECK(ncclShmClose(proxyInfo->handle));
|
||||||
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
|
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
|
||||||
CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
|
CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
|
||||||
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
|
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
|
||||||
|
@ -17,18 +17,22 @@ struct shmSendResources {
|
|||||||
int remShmSize;
|
int remShmSize;
|
||||||
struct ncclRecvMem* remHostMem;
|
struct ncclRecvMem* remHostMem;
|
||||||
struct ncclRecvMem* devRemHostMem;
|
struct ncclRecvMem* devRemHostMem;
|
||||||
|
ncclShmHandle_t remHandle;
|
||||||
int shmSize;
|
int shmSize;
|
||||||
struct ncclSendMem* hostMem;
|
struct ncclSendMem* hostMem;
|
||||||
struct ncclSendMem* devHostMem;
|
struct ncclSendMem* devHostMem;
|
||||||
|
ncclShmHandle_t hostHandle;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct shmRecvResources {
|
struct shmRecvResources {
|
||||||
int remShmSize;
|
int remShmSize;
|
||||||
struct ncclSendMem* remHostMem;
|
struct ncclSendMem* remHostMem;
|
||||||
struct ncclSendMem* devRemHostMem;
|
struct ncclSendMem* devRemHostMem;
|
||||||
|
ncclShmHandle_t remHandle;
|
||||||
int shmSize;
|
int shmSize;
|
||||||
struct ncclRecvMem* hostMem;
|
struct ncclRecvMem* hostMem;
|
||||||
struct ncclRecvMem* devHostMem;
|
struct ncclRecvMem* devHostMem;
|
||||||
|
ncclShmHandle_t hostHandle;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define SHM_SEND_SIDE 1
|
#define SHM_SEND_SIDE 1
|
||||||
@ -84,7 +88,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
|
|||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
|
||||||
}
|
}
|
||||||
info->shmSize = resources->shmSize = shmSize;
|
info->shmSize = resources->shmSize = shmSize;
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
|
||||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||||
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
||||||
|
|
||||||
@ -107,7 +111,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
|
|||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
|
||||||
}
|
}
|
||||||
info->shmSize = resources->shmSize = shmSize;
|
info->shmSize = resources->shmSize = shmSize;
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
|
||||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||||
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
||||||
|
|
||||||
@ -137,9 +141,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
|||||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||||
resources->remShmSize = info->shmSize;
|
resources->remShmSize = info->shmSize;
|
||||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
|
||||||
// Remove the file to ensure proper clean-up
|
|
||||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
|
||||||
|
|
||||||
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||||
@ -172,8 +174,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
|||||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||||
resources->remShmSize = info->shmSize;
|
resources->remShmSize = info->shmSize;
|
||||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||||
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
|
||||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
|
||||||
|
|
||||||
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||||
@ -196,8 +197,8 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
|||||||
static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||||
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
|
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
|
||||||
if (resources) {
|
if (resources) {
|
||||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
NCCLCHECK(ncclShmClose(resources->hostHandle));
|
||||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
NCCLCHECK(ncclShmClose(resources->remHandle));
|
||||||
free(resources);
|
free(resources);
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -206,8 +207,8 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
|||||||
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||||
if (resources) {
|
if (resources) {
|
||||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
NCCLCHECK(ncclShmClose(resources->hostHandle));
|
||||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
NCCLCHECK(ncclShmClose(resources->remHandle));
|
||||||
free(resources);
|
free(resources);
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user