Add support for IB SHARP to NVLS (NVLink SHARP algorithm).
Add NVLS+Tree algorithm.
Add support for memory management using cuMem* functions.
Use all NICs for Send/Receive operations on systems with more than
one NIC per GPU (#804).
Add ncclCommSplit primitive, with a resource-sharing option in the config (see the usage sketch after this list).
Fix alltoallv hang (#788).
Increase number of channels on H100 when we're not limited by NVLink.
Improve error reporting in case of IB failure, printing the local and
remote IDs (#779).
Add build option to allow compilation against RDMA includes instead
of dynamically loading IB verbs symbols (#802).
Fix context creation for progress thread (#803).
NET/IB: add option to use multiple QPs in round-robin mode.
Fix tree performance issue when NVB is disabled on HCM topologies.
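A minimal usage sketch of the new ncclCommSplit path with resource sharing enabled, assuming the public nccl.h declarations (ncclCommSplit, ncclConfig_t with splitShare, NCCL_CONFIG_INITIALIZER, NCCL_SPLIT_NOCOLOR) that accompany this feature; error handling is omitted:

#include <nccl.h>

// Split 'parent' by color/key. Ranks that pass NCCL_SPLIT_NOCOLOR as the color
// join no new communicator and get *child == NULL.
ncclResult_t splitComm(ncclComm_t parent, int color, int key, ncclComm_t* child) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.splitShare = 1;   // let the child reuse the parent's proxy and buffers (see bootstrapSplit below)
  return ncclCommSplit(parent, color, key, child, &config);
}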
Sylvain Jeaugey, 2023-04-03 05:32:07 -07:00
commit d97a32fac8 (parent 9b7d5edbfc)
64 changed files with 4758 additions and 3131 deletions

View File

@ -12,6 +12,7 @@ DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 1
NVTX ?= 1
RDMA_CORE ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
@ -106,3 +107,7 @@ endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif
ifneq ($(RDMA_CORE), 0)
CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
endif
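A small illustrative sketch (hypothetical helper, not part of the patch) of what the RDMA_CORE switch selects: with NCCL_BUILD_RDMA_CORE defined, IB verbs calls can be compiled directly against the rdma-core headers; otherwise the verbs symbols are loaded dynamically at runtime (the ibvsymbols.cc/ibvwrap.cc path added below):

#include <stdio.h>
#ifdef NCCL_BUILD_RDMA_CORE
#include <infiniband/verbs.h>   // headers shipped with rdma-core
#endif

// Hypothetical helper, for illustration only: report which IB path was built in.
static void reportIbBuildMode(void) {
#ifdef NCCL_BUILD_RDMA_CORE
  printf("IB verbs: linked directly against rdma-core\n");
#else
  printf("IB verbs: symbols resolved at runtime via dlopen\n");
#endif
}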

View File

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 17
NCCL_MINOR := 18
NCCL_PATCH := 1
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@ -10,7 +10,7 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
misc/ipcsocket.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \

View File

@ -305,6 +305,74 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
return ncclSuccess;
}
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
int prev, next;
ncclSocketAddress listenAddr, tmpAddr;
struct ncclSocket* proxySocket;
struct bootstrapState* state;
NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail);
state->rank = rank;
state->nranks = nranks;
state->abortFlag = comm->abortFlag;
comm->bootstrap = state;
comm->magic = state->magic = handle->magic;
prev = parentRanks[(rank-1+nranks)%nranks];
next = parentRanks[(rank+1)%nranks];
// Setup my sockets for the allgather ring and other p2p connections
NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
// Create socket for other ranks to contact me
NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail);
// Get addr from next rank
NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail);
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
// AllGather all listen handlers
NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail);
memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail);
if (parent->config.splitShare) {
/* map local rank to top parent local rank. */
for (int i = 0; i < nranks; ++i) {
comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
}
comm->proxyState = parent->sharedRes->proxyState;
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
} else {
// Create the service proxy
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail);
NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail);
NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
}
INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
exit:
return ret;
fail:
goto exit;
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
struct bootstrapState* state = (struct bootstrapState*)commState;
char* data = (char*)allData;
@ -336,7 +404,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
@ -397,7 +465,7 @@ ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank,
}
}
else {
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/ranks[rank], bcastData, size));
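// Note: the tag is now the global rank (ranks[rank]); the receive only matches if it
// uses the same tag the root side passes to bootstrapSend for this rank.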
}
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);

View File

@ -17,30 +17,120 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
channel->id = channelId;
channel->workFifoSent = 0;
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
struct ncclSharedResources* sharedRes = comm->sharedRes;
// The extra on nRanks+1 is for collnet root (i.e. network)
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devPeers);
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
for (int r=0; r < nPeers; ++r) {
for (int b=0; b < NCCL_MAX_CONNS; b++) {
channel->peers[r].send[b].comm = comm;
channel->peers[r].recv[b].comm = comm;
if (channel->peers == NULL) {
// The extra on nRanks+1 is for collnet root (i.e. network)
// Allocate everything related to sharedRes with ncclCalloc as this can be
// shared between communicators hence should not be tied to comm.
if (sharedRes->peers[channelId] == NULL) {
NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks));
}
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
for (int r = 0; r < nRanks; r++) {
channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r];
ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
}
}
if (channel->devPeers == NULL) {
if (sharedRes->devPeers[channelId] == NULL) {
NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
}
/* channel->devPeers is not shared, so just free it when calling commFree() */
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devPeers);
for (int r = 0; r < nRanks; r++) {
uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
}
}
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
struct ncclChannel* channel = &comm->channels[channelId];
struct ncclSharedResources* sharedRes = comm->sharedRes;
if (channel->nvlsPeers != NULL)
return ncclSuccess;
if (channel->id == -1)
NCCLCHECK(initChannel(comm, channelId));
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
if (share) {
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
for (int r = 0; r < comm->localRanks; ++r) {
int tr = comm->topParentLocalRanks[r];
uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
}
} else {
NCCLCHECK(ncclCalloc(&channel->nvlsPeers, comm->localRanks));
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, comm->localRanks, sharedRes->deviceStream.cudaStream));
for (int r = 0; r < comm->localRanks; ++r) {
uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount);
}
}
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
return ncclSuccess;
}
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
struct ncclChannel* channel = &comm->channels[channelId];
struct ncclSharedResources* sharedRes = comm->sharedRes;
uintptr_t addr;
if (channel->collnetPeers != NULL)
return ncclSuccess;
if (channel->id == -1)
NCCLCHECK(initChannel(comm, channelId));
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
if (share) {
channel->collnetPeers = parent->channels[channelId].collnetPeers;
channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers;
addr = (uintptr_t)parent->channels[channelId].collnetDevPeers;
channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers;
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount);
} else {
NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1));
NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream));
addr = (uintptr_t)channel->collnetDevPeers;
channel->peers[comm->nRanks] = channel->collnetPeers;
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount);
}
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
int nPeers = nRanks + collnetNRanks + nvlsNRanks;
/* channel peers are only valid when the async init thread completes commAlloc() and
* the channel is initialized with initChannel(); if either is not done, this channel
* should never be freed. */
@ -48,18 +138,23 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
// Free transport proxy resources
// Note: free all send resources first due to CollNet arrangement
for (int r=0; r<nRanks+1; r++) {
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
for (int r = 0; r < nPeers; r++) {
struct ncclChannelPeer* peer = channel->peers[r];
if (peer) {
if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
if (r == nRanks) {
free(channel->collnetPeers);
ncclCudaFree(channel->collnetDevPeers);
} else if (r == nPeers - 1) {
free(channel->nvlsPeers);
ncclCudaFree(channel->nvlsDevPeers);
}
}
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
}
return ncclSuccess;
}
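A standalone sketch (not NCCL code) of the reference-counting scheme that initChannel, initNvlsChannel, initCollnetChannel and freeChannel implement above: peer arrays hang off sharedRes so that communicators created via ncclCommSplit with splitShare can reuse them, each user takes a reference, and only the last one to release actually frees the memory. Simplified here to a single flat array:

#include <cstdlib>
#include <cassert>

struct Peer { int refCount; /* connection state elided */ };

// First caller allocates the shared array; every caller takes one reference per peer.
static Peer* acquirePeers(Peer** shared, int nPeers) {
  if (*shared == nullptr) *shared = (Peer*)calloc(nPeers, sizeof(Peer));
  for (int r = 0; r < nPeers; r++) (*shared)[r].refCount++;
  return *shared;
}

// Drop one reference per peer; the caller that brings them all to zero frees the array.
static void releasePeers(Peer** shared, int nPeers) {
  bool last = true;
  for (int r = 0; r < nPeers; r++)
    if (--(*shared)[r].refCount != 0) last = false;
  if (last) { free(*shared); *shared = nullptr; }
}

int main() {
  Peer* peers = nullptr;
  acquirePeers(&peers, 4);   // parent communicator
  acquirePeers(&peers, 4);   // split child sharing the parent's resources
  releasePeers(&peers, 4);   // child destroyed; parent still holds references
  assert(peers != nullptr);
  releasePeers(&peers, 4);   // parent destroyed; array is freed
  assert(peers == nullptr);
  return 0;
}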

View File

@ -55,7 +55,7 @@ namespace {
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
prims.directSend(chunkOffset, offset, nelem);
} else {
prims.directCopySend(chunkOffset, offset, offset, nelem);
prims.directCopySend(chunkOffset, offset, nelem);
}
// k-2 steps: copy to next GPU
@ -63,7 +63,7 @@ namespace {
rankDest = ringRanks[nranks-j];
offset = chunkOffset + rankDest * size;
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
// Make final copy from buffer to dest.
@ -118,19 +118,19 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
if (tid < tidEndGather) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Bcast through MC
// Bcast through NVLS
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);

View File

@ -76,14 +76,14 @@ namespace {
chunk = ringIx + 0;
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = modRanks(ringIx + nranks-j);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
// Make final copy from buffer to dest.
@ -146,7 +146,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directSendFromOutput(offset, offset, nelem);
prims.directSendFromOutput(offset, nelem);
}
}
else if (tree->down[0] == -1) {
@ -160,7 +160,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
}
}
@ -203,7 +203,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true);
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
}
}
else if (tid < nthreadsSplit) {
@ -235,7 +235,8 @@ namespace {
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@ -247,7 +248,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
}
}
@ -299,9 +300,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
int group = (2*Proto::MaxGroupWidth) | (1<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
@ -312,16 +313,16 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
}
} else if (tid >= tidStartReduce && direct->out != -1) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
if (hasDn) {
// Reduce, send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
if (args->regUsed) {
prims.directRecvReduceSend(offset, offset, nelem);
prims.directRecvReduceSend(offset, nelem);
} else {
prims.recvReduceSend(offset, nelem);
}
@ -329,7 +330,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
} else {
// Directly send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@ -338,29 +340,30 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
} else if (tid < tidStartBcast && hasUp) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
int group = (1*Proto::MaxGroupWidth) | (0<<16);
if (hasDn) {
// Recv from network, broadcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
}
} else {
// Recv from network (no post thread needed)
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@ -383,23 +386,27 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const int reduceWarps = nranks <= 6 ? 6 : 4;
const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
const bool hasOut = nvls->out != -1;
const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasOut ? 2 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int nThreadsScatter = copyWarps*WARP_SIZE;
const int nThreadsGather = (copyWarps-1)*WARP_SIZE;
const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
const int nThreadsReduce = reduceWarps*WARP_SIZE;
const int nThreadsBcast = (bcastWarps)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
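// Worked example of this warp split, assuming NCCL_MAX_NTHREADS/WARP_SIZE == 20 warps and hasOut:
// reduceWarps = 3, bcastWarps = 2, scatterWarps = (20-3-2+1)/2 = 8, gatherWarps = (20-3-2)/2 = 7,
// giving thread ranges [0,256) scatter, [256,480) gather, [480,576) reduce, [576,640) bcast.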
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
@ -407,19 +414,136 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
}
} else if (tid < tidEndGather) {
// Gather
int group = (2*Proto::MaxGroupWidth) | (0<<16);
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce, broadcast through NVLS
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasOut) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const int treeUp = nvls->treeUp;
const int* treeDown = nvls->treeDown;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const bool hasUp = treeUp != -1;
const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasUp ? 4 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
const int nThreadsReduce = reduceWarps*WARP_SIZE;
const int nThreadsBcast = (bcastWarps)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasUp) {
// Reduce and Broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@ -445,16 +569,20 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
int nthreadsSplit = nthreads/2;
if (nthreadsSplit >= 256) nthreadsSplit += 64;
int group, send, recv, groupTid, groupNthreads;
int group, connIndex, send, recv, groupTid, groupNthreads;
using Proto = ProtoSimple<1, 1>;
if (tid < nthreadsSplit) {
group = (0*Proto::MaxGroupWidth) | (1<<16);
// Reduce up the chain
group = 0;
connIndex = 1;
recv = tree->down[0];
send = tree->up;
groupTid = tid;
groupNthreads = nthreadsSplit;
} else {
group = (1*Proto::MaxGroupWidth);
// Broadcast down the chain
group = 1;
connIndex = 0;
recv = tree->up;
send = tree->down[0];
groupTid = tid - nthreadsSplit;
@ -462,7 +590,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
}
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group);
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex);
if (tid < nthreadsSplit) {
if (recv == -1) {
@ -490,7 +619,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
}
}

View File

@ -22,7 +22,6 @@ struct ncclShmemGroup {
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
void* srcs[NCCL_MAX_NVLS_ARITY+1];
void* dsts[NCCL_MAX_NVLS_ARITY+1];
int nvlsRecv;
};
struct ncclShmemData {
@ -237,7 +236,8 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
IMPL_COLL4(func, RING, devredop, type, ncclType) \
IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
IMPL_COLL4(func, NVLS, devredop, type, ncclType)
IMPL_COLL4(func, NVLS, devredop, type, ncclType) \
IMPL_COLL4(func, NVLS_TREE, devredop, type, ncclType)
#if NCCL_TYPE == 0
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8)

View File

@ -26,7 +26,8 @@ inline __device__ int loadInt(int* ptr) {
}
template<typename RedFn, typename T, int Unroll, int BytePerPack,
int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int PreOpSrcs,
int MultimemSrcs, int MinSrcs, int MaxSrcs,
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
typename IntBytes>
__device__ __forceinline__ void reduceCopyPacks(
int nThreads, int &thread,
@ -35,6 +36,7 @@ __device__ __forceinline__ void reduceCopyPacks(
IntBytes &nBytesBehind, IntBytes &nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
if (BytePerPack == 0) __trap();
// A hunk is the amount of contiguous data a warp consumes per loop iteration
// assuming all threads partake.
@ -47,15 +49,15 @@ __device__ __forceinline__ void reduceCopyPacks(
IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack);
IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack);
// Number of hunks to be consumed over all warps.
IntBytes nHunksAhead = nBytesAhead/BytePerHunk;
IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk);
// Advance collective position.
nBytesBehind += nHunksAhead*BytePerHunk;
nBytesAhead -= nHunksAhead*BytePerHunk;
if (Unroll==1 && BytePerPack <= nBytesAhead) {
// Only Unroll=1 can do partial hunks (where not all threads partake).
nHunksAhead += 1;
nBytesBehind += nBytesAhead - (nBytesAhead%BytePerPack);
nBytesAhead = nBytesAhead%BytePerPack;
nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack));
nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack);
}
nHunksAhead -= warp;
@ -77,8 +79,13 @@ __device__ __forceinline__ void reduceCopyPacks(
{ RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
if (0 < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
}
minSrcs[0] += WARP_SIZE*BytePerPack;
if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
}
@ -90,8 +97,13 @@ __device__ __forceinline__ void reduceCopyPacks(
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
if (s < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
}
minSrcs[s] += WARP_SIZE*BytePerPack;
}
#pragma unroll Unroll
@ -128,7 +140,11 @@ __device__ __forceinline__ void reduceCopyPacks(
for (int d=0; d < MinDsts; d++) {
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
st_global<BytePerPack>(minDsts[d], acc[u]);
if (d < MultimemDsts) {
multimem_st_global(minDsts[d], acc[u]);
} else {
st_global<BytePerPack>(minDsts[d], acc[u]);
}
minDsts[d] += WARP_SIZE*BytePerPack;
}
}
@ -165,213 +181,61 @@ __device__ __forceinline__ void reduceCopyPacks(
}
template<int Unroll, typename RedFn, typename T,
int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int PreOpSrcs,
int MultimemSrcs, int MinSrcs, int MaxSrcs,
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
typename IntBytes>
__device__ __forceinline__ void ReduceOrCopyMulti(
__device__ __forceinline__ void reduceCopy(
int thread, int nThreads,
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
IntBytes nElts
) {
static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
//int nWarps = nThreads/WARP_SIZE;
//int warp = thread/WARP_SIZE;
int lane = thread%WARP_SIZE;
// Check that all is 16B aligned. If not don't use 16B load/stores.
int aligned = 1;
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane])%16;
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane])%16;
aligned = __all_sync(~0u, aligned);
// If a multimem src is present then our biggest pack size is limited to what
// is supported for this redfn/type.
constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;
IntBytes nBytesBehind = 0;
IntBytes nBytesAhead = nElts*sizeof(T);
if (aligned) {
reduceCopyPacks<RedFn, T, Unroll, /*BytePerPack=*/16,
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/16,
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
#if __cpp_if_constexpr
if constexpr (BigPackSize > sizeof(T)) {
#else
if (BigPackSize > sizeof(T)) {
#endif
// Check that all pointers are BigPackSize aligned.
bool aligned = true;
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
aligned = __all_sync(~0u, aligned);
if (aligned) {
reduceCopyPacks<RedFn, T, Unroll, BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
}
}
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
}
// Copies from srcAddr to dstAddr using multimem load/store. The amount copied
// will be at most Unroll*BytePerPack*WARP_SIZE. If Partial=1, then the amount
// will be the min() of that and nBytesAhead. If srcAddr is not BytePerPack
// aligned then the amount copied will be less by (srcAddr%BytePerPack) since
// we begin loads at the first pack containing the first element.
template<typename RedFn, typename T, int Unroll, int BytePerPack,
bool SrcAligned, // is srcAddr aligned to BytePerPack
bool DstAligned, // are dstAddr and nBytesAhead both aligned to BytePerPack
bool Partial, // is this a possibly partial hunk
typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem_WarpUnrolled(
int lane, RedFn redFn, bool postOp, uintptr_t srcAddr, uintptr_t dstAddr,
IntBytes nBytesAhead, uint32_t scratchAddr
) {
int srcMisalign = SrcAligned ? 0 : srcAddr%BytePerPack;
srcAddr -= srcMisalign;
BytePack<BytePerPack> reg[Unroll];
int offset = lane*BytePerPack;
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (!Partial || (offset < srcMisalign + nBytesAhead)) {
reg[u] = applyLoadMultimem(redFn, srcAddr+offset);
if (postOp) reg[u] = applyPostOp(redFn, reg[u]);
}
offset += WARP_SIZE*BytePerPack;
}
if (SrcAligned && DstAligned) {
offset = lane*BytePerPack;
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (!Partial || offset < nBytesAhead) {
multimem_st_global<BytePerPack>(dstAddr+offset, reg[u]);
}
offset += WARP_SIZE*BytePerPack;
}
} else {
__syncwarp();
offset = lane*BytePerPack;
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (!Partial || (offset < srcMisalign + nBytesAhead)) {
st_shared<BytePerPack>(scratchAddr+offset, reg[u]);
}
offset += WARP_SIZE*BytePerPack;
}
__syncwarp();
if (!SrcAligned) {
// Ignore the beginning of the first pack corresponding to bytes overread
// due to misalignment.
nBytesAhead = min(nBytesAhead, Unroll*WARP_SIZE*BytePerPack - srcMisalign);
}
copyGlobalShared_WarpUnrolled
<sizeof(T), /*MaxBytes=*/Unroll*WARP_SIZE*BytePerPack, /*Multimem=*/1>
(lane, dstAddr, scratchAddr+srcMisalign, nBytesAhead);
}
}
// copyMultimemMultimem_IfEnabled has two overloads: the enabled case whose first arg
// has type `std::true_type` and the disabled case with first arg `std::false_type`.
// This is to guard the template instantiations of Apply_LoadMultimem on types/ops where
// they aren't supported. A nicer approach is to use C++17's "if constexpr".
template<typename RedFn, typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem_IfEnabled(
std::false_type enabled/*=false*/,
int thread, int nThreads, uint64_t redArg, bool postOp,
void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
) {
// nop
}
template<typename RedFn, typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem_IfEnabled(
std::true_type enabled/*=true*/,
int thread, int nThreads, uint64_t redArg, bool postOp,
void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
) {
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
constexpr int BytePerPack = Apply_LoadMultimem<RedFn>::PackSize;
using T = typename RedFn::EltType;
constexpr int Unroll = ncclNvlsUnroll(BytePerPack);
constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack;
int nWarps = nThreads/WARP_SIZE;
int warp = thread/WARP_SIZE;
int lane = thread%WARP_SIZE;
RedFn redFn(redArg);
uintptr_t srcAddr = cvta_to_global(srcPtr);
uintptr_t dstAddr = cvta_to_global(dstPtr);
IntBytes warpBytesAhead = nElts*sizeof(T);
bool partialHunkIsFront;
// First handle misalignment of srcAddr.
if ((BytePerPack != sizeof(T)) && (srcAddr%BytePerPack != 0)) {
// If srcAddr isn't pack aligned then the first hunk processed will be short
// the same number of bytes as srcAddr's misalignment.
if (warp == 0) {
partialHunkIsFront = true;
goto PartialHunk; // "call" PartialHunk()
PartialHunkFrontReturn:
warp = nWarps;
}
warp -= 1; // Rotate warp numbers for load balancing
int advanced = BytePerHunk-(srcAddr%BytePerPack); // since copyMultimemMultimem_WarpUnrolled shorts by the misalignment
srcAddr += advanced; // srcAddr is now pack aligned
dstAddr += advanced;
warpBytesAhead -= advanced;
}
warpBytesAhead -= warp*BytePerHunk;
srcAddr += warp*BytePerHunk;
dstAddr += warp*BytePerHunk;
// Now that srcAddr is pack aligned detect if dstAddr is pack aligned.
if ((BytePerPack == sizeof(T)) || (dstAddr%BytePerPack == 0)) {
while (BytePerHunk <= warpBytesAhead) {
copyMultimemMultimem_WarpUnrolled
<RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/true, /*DstAligned=*/true, /*Partial=*/false>
(lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
srcAddr += nWarps*BytePerHunk;
dstAddr += nWarps*BytePerHunk;
warpBytesAhead -= nWarps*BytePerHunk;
}
} else {
while (BytePerHunk <= warpBytesAhead) {
copyMultimemMultimem_WarpUnrolled
<RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/true, /*DstAligned=*/false, /*Partial=*/false>
(lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
srcAddr += nWarps*BytePerHunk;
dstAddr += nWarps*BytePerHunk;
warpBytesAhead -= nWarps*BytePerHunk;
}
}
if (0 < warpBytesAhead) {
partialHunkIsFront = false;
goto PartialHunk; // "call" PartialHunk()
PartialHunkBackReturn:;
}
return;
PartialHunk:
// We have to handle a partial hunk possibly at the front and back of the
// buffer. We generate the code once here since it's a lot of instructions,
// and then simulate function calls with gotos.
copyMultimemMultimem_WarpUnrolled
<RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/false, /*DstAligned=*/false, /*Partial=*/true>
(lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
if (partialHunkIsFront) goto PartialHunkFrontReturn;
goto PartialHunkBackReturn;
}
template<typename RedFn, typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem(
int thread, int nThreads, uint64_t redArg, bool postOp,
void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
) {
constexpr bool Enabled = Apply_LoadMultimem<RedFn>::PackSize != 0;
copyMultimemMultimem_IfEnabled<RedFn>(
/*enabled=*/std::integral_constant<bool, Enabled>(),
thread, nThreads, redArg, postOp, srcPtr, dstPtr, nElts, warpScratchAddr);
}
#endif // COMMON_KERNEL_H_
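A standalone sketch of the N + !N guard used above (for example nBytesAhead/(BytePerHunk + !BytePerHunk) and the alignment checks modulo BigPackSize + !BigPackSize): when if constexpr is unavailable, the BytePerPack == 0 instantiation of reduceCopyPacks is still compiled (and trapped at runtime), so every division or modulo must avoid a literal zero divisor:

#include <cstdio>

template<int N>
int packsIn(int bytes) {
  // N + !N equals N when N > 0 and 1 when N == 0, so the dead N == 0
  // instantiation compiles without a division by zero.
  return bytes / (N + !N);
}

int main() {
  printf("%d\n", packsIn<16>(64));  // 4 packs of 16 bytes
  printf("%d\n", packsIn<0>(64));   // instantiates safely; such calls sit on dead branches in the real code
  return 0;
}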

View File

@ -23,7 +23,8 @@ __shared__ ncclShmemData ncclShmem;
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify)
NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t

View File

@ -37,7 +37,7 @@ namespace {
dst += i0;
void *vsrc = (void*)src;
void *vdst = (void*)dst;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
(tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
}
}

View File

@ -7,6 +7,8 @@
#ifndef OP128_H_
#define OP128_H_
#include <type_traits>
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(ptr));
@ -94,6 +96,8 @@ __device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
template<int Size>
union BytePack;
template<>
union BytePack<0> {};
template<>
union BytePack<1> {
uint8_t u8, native;
};
@ -129,14 +133,26 @@ union alignas(16) BytePack<16> {
};
template<typename T>
__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value) {
union { BytePack<sizeof(T)> p; T v; };
struct BytePackOf {
static constexpr int Size = sizeof(T);
using Pack = BytePack<Size>;
};
template<>
struct BytePackOf<BytePack<0>> {
static constexpr int Size = 0;
using Pack = BytePack<0>;
};
template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
union { typename BytePackOf<T>::Pack p; T v; };
v = value;
return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack) {
union { BytePack<sizeof(T)> p; T v; };
__device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack) {
union { typename BytePackOf<T>::Pack p; T v; };
p = pack;
return v;
}
@ -151,6 +167,13 @@ template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
@ -275,6 +298,18 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<>
__device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) {
// nop
}
template<>
__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
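A standalone analogue (not the NCCL header itself) of the BytePack plumbing above: a sized byte union with a zero-size specialization plus the toPack/fromPack round trip, so that template code paths which are compiled but never executed can still instantiate with Size == 0:

#include <cstdint>
#include <cassert>

template<int Size> union BytePack;
template<> union BytePack<0> {};                              // degenerate zero-byte pack
template<> union BytePack<4> { uint8_t u8[4]; uint32_t u32; };

template<typename T>
BytePack<sizeof(T)> toPack(T value) {
  union { BytePack<sizeof(T)> p; T v; } u;
  u.v = value;
  return u.p;
}

template<typename T>
T fromPack(BytePack<sizeof(T)> pack) {
  union { BytePack<sizeof(T)> p; T v; } u;
  u.p = pack;
  return u.v;
}

int main() {
  float x = 2.5f;
  assert(fromPack<float>(toPack<float>(x)) == x);  // round trip through raw bytes
  BytePack<0> empty{};                             // compiles; nothing to store
  (void)empty;
  return 0;
}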

View File

@ -21,13 +21,14 @@
* to how that protocol operates with a consistent interface so that our
* algorithm code can operate protocol parametrically.
*/
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, int MultimemSrcs_1 = 0, int MultimemDsts_1 = 0>
struct ProtoSimple {
static constexpr int Id = NCCL_PROTO_SIMPLE;
static constexpr int SlicePerChunk = SlicePerChunk_1;
static constexpr int StepPerSlice = StepPerSlice_1;
static constexpr int Unroll = Unroll_1;
static constexpr bool NVLS = NVLS_1;
static constexpr int MultimemSrcs = MultimemSrcs_1;
static constexpr int MultimemDsts = MultimemDsts_1;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
@ -39,9 +40,6 @@ struct ProtoSimple {
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 2;
__device__ static int calcGroupWidth(bool send, int nthreads) {
return send && nthreads-WARP_SIZE >= 64 ? 2 : 1;
}
};
struct ProtoLL {
@ -57,9 +55,6 @@ struct ProtoLL {
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
__device__ static int calcGroupWidth(bool send, int nthreads) {
return 1;
}
};
struct ProtoLL128 {
@ -75,9 +70,6 @@ struct ProtoLL128 {
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
__device__ static int calcGroupWidth(bool send, int nthreads) {
return 1;
}
};
/* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template
@ -117,22 +109,22 @@ class Primitives;
// Used by LL & LL128 to implement direct members in the naive way.
template<typename RealPrimitives>
struct PrimitivesWithoutDirect {
__device__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) {
__device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->send(inpIx, eltN);
}
__device__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
}
__device__ void directRecv(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
}
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
}
__device__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
}
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}

View File

@ -322,22 +322,22 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
int connIndex = group >> 16;
auto *channel = &ncclShmem.channel;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
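For reference, a sketch of how call sites change with the new constructor signature above (the connection index used to be packed into the high bits of group and is now passed explicitly; buffer and peer arguments shown with generic names):

// before: connection index encoded as (1<<16) inside group
prims(tid, nthreads, &recv, &send, sendbuff, recvbuff, redOpArg, (1*Proto::MaxGroupWidth) | (1<<16));
// after: group and the recv/send connection indices are separate arguments
prims(tid, nthreads, &recv, &send, sendbuff, recvbuff, redOpArg, 1*Proto::MaxGroupWidth, /*connIndexRecv=*/1, /*connIndexSend=*/1);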

View File

@ -363,22 +363,22 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
warpInBlock(threadIdx.x/WARP_SIZE),
flagThread((tid%8)==7), group(group&(uint16_t)0xFFFF),
flagThread((tid%8)==7), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
int connIndex = group >> 16;
auto *channel = &ncclShmem.channel;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);

View File

@ -5,9 +5,9 @@
************************************************************************/
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@ -23,10 +23,9 @@ class Primitives<
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800,
NvlsMinPolling = 0x1000,
NvlsRecv = 0x2000;
NvlsMinPolling = 0x1000;
const int tid, tidInBlock;
int nthreads;
const int nthreads;
int nworkers;
const int stepSize;
Fan fan;
@ -107,19 +106,19 @@ class Primitives<
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (NVLS && (flags & NvlsMinPolling)) {
if (flags & NvlsMinPolling) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
// volatile is faster than acquire but not as correct. Make sure reduceCopy
// loads data using volatile so it doesn't see stale data in L1.
return ld_volatile_global(ptr);
}
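The `multimem.ld_reduce...min.u64` path above reads the step counter through an NVLS multicast address; my reading is that the hardware returns the minimum of the tail values held by every rank in the multicast group, i.e. the progress of the slowest peer, in a single load. A host-side analogue of that semantics (sketch, not NCCL code):

  #include <algorithm>
  #include <cstdint>

  // Explicit version of what the multimem min-load is presumed to compute:
  // poll every peer's tail and only trust the smallest value.
  uint64_t minTail(const uint64_t* tails, int nPeers) {
    uint64_t m = tails[0];
    for (int i = 1; i < nPeers; i++) m = std::min(m, tails[i]);
    return m;  // steps are safe to consume only up to the slowest peer
  }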
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
@ -143,7 +142,7 @@ class Primitives<
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
ptrs[index] = directBuff + remoteIx + offset;
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
ptrs[index] = nullptr;
} else {
@ -151,7 +150,7 @@ class Primitives<
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & DirectRead) {
ptrs[index] = directBuff + remoteIx + offset;
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
@ -176,7 +175,7 @@ class Primitives<
template <int DirectRecv1, int DirectSend1, int Recv, int Send, int SrcBuf, int DstBuf>
__device__ __forceinline__ void genericOp(
intptr_t srcIx, intptr_t dstIx, intptr_t remoteIx, int nelem, bool postOp
intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp
) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
@ -225,20 +224,15 @@ class Primitives<
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
subBarrier();
/* If the user aborted the kernel, we don't need to actually perform the copy/reduce;
 * just set the size to 0 to avoid unnecessary work. */
int workSize = ncclShmem.aborted ? 0 : sliceSize;
if (NVLS && ncclShmem.groups[group].nvlsRecv) {
void* src = ncclShmem.groups[group].srcs[0];
void* dst = ncclShmem.groups[group].dsts[0];
copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
} else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
(tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
@ -246,7 +240,7 @@ class Primitives<
}
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
// For broadcast in CollNet to do empty send
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
@ -254,7 +248,9 @@ class Primitives<
} else {
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
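Reading the updated call sites, the renamed reduceCopy keeps the old ReduceOrCopyMulti shape but inserts a multimem count in front of each source/destination group. A hedged signature sketch (only MultimemSrcs/MultimemDsts appear in the diff; the MinSrcs/MaxSrcs-style names and argument meanings are my guesses):

  #include <cstddef>
  #include <cstdint>

  template<int Unroll, typename RedOp, typename T,
           int MultimemSrcs, int MinSrcs, int MaxSrcs,
           int MultimemDsts, int MinDsts, int MaxDsts,
           int PreOpSrcs>
  void reduceCopySketch(int thread, int nThreads,
                        uint64_t redArg, uint64_t* preOpArgs, bool postOp,
                        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs,
                        std::ptrdiff_t nElts) {
    // Reduce nSrcs inputs element-wise and write the result to nDsts outputs;
    // the Multimem* counts say how many of those pointers (presumably the
    // leading ones) are NVLS multicast addresses that must be accessed with
    // multimem loads/stores instead of regular global loads/stores.
  }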
@ -319,7 +315,7 @@ class Primitives<
void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
fenceNeeded |= true;
}
@ -342,7 +338,7 @@ class Primitives<
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
}
}
@ -364,14 +360,7 @@ class Primitives<
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
if ((index == 0) && (flags & RoleWaitRecv)) {
if (conn->flags & NCCL_NVLS_MIN_POLL) {
flags |= NvlsMinPolling;
ncclShmem.groups[group].nvlsRecv = 1;
} else {
ncclShmem.groups[group].nvlsRecv = 0;
}
}
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@ -448,16 +437,14 @@ class Primitives<
public:
__device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
):
tid(tid), tidInBlock(threadIdx.x),
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nthreads = nthreads;
this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
this->group = group & (uint16_t)0xFFFF;
int connIndex = group >> 16;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@ -487,8 +474,8 @@ class Primitives<
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
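The constructor above reserves one warp for the threadfence/copy overlap only when there is something to send and enough threads remain. A standalone arithmetic check of that rule (sketch):

  constexpr int kWarp = 32;
  constexpr int workerThreads(int nthreads, bool hasSend) {
    return nthreads - (hasSend && nthreads - kWarp >= 64 ? kWarp : 0);
  }
  static_assert(workerThreads(256, true)  == 224, "one warp reserved for the fence/copy overlap");
  static_assert(workerThreads(96,  true)  ==  64, "96-32=64 still qualifies");
  static_assert(workerThreads(64,  true)  ==  64, "too few threads: keep every warp working");
  static_assert(workerThreads(128, false) == 128, "receive-only: nothing reserved");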
@ -593,62 +580,62 @@ class Primitives<
}
__device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, -1, eltN, false);
genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, -1, eltN, false);
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false);
}
__device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) {
genericOp<0, 1, 0, 1, Input, -1>(inpIx, -1, remoteOutIx, eltN, false);
__device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false);
}
__device__ __forceinline__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
genericOp<0, 1, 0, 1, Output, -1>(outIx, -1, remoteOutIx, eltN, false);
__device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false);
}
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, /*postOp=*/false);
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
}
__device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp);
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
__device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, false);
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
}
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, postOp);
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, -1, eltN, postOp);
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, intptr_t remoteInpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, remoteInpIx, eltN, postOp);
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void

View File

@ -55,9 +55,14 @@ struct Apply_PostOp/*{
static BytePack<EltPerPack*sizeof(T)> postOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
}*/;
template<typename Fn>
struct LoadMultimem_BigPackSize/*{
// If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem,
// otherwise there are no valid pack sizes for LoadMultimem.
static constexpr int BigPackSize = 0;
}*/;
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem/*{
static constexpr int PackSize; // 0 if not implemented
static BytePack<PackSize> load(Fn fn, uintptr_t addr);
static BytePack<BytePerPack> load(Fn fn, uintptr_t addr);
}*/;
////////////////////////////////////////////////////////////////////////////////
@ -69,7 +74,7 @@ struct Apply_LoadMultimem/*{
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
return fromPack<Pack>(
Apply_Reduce<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
Apply_Reduce<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::reduce(fn, toPack(a), toPack(b))
);
}
@ -77,7 +82,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PreOp<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
Apply_PreOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::preOp(fn, toPack(a))
);
}
@ -85,19 +90,27 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PostOp<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
Apply_PostOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::postOp(fn, toPack(a))
);
}
template<typename Fn>
__device__ __forceinline__ BytePack<Apply_LoadMultimem<Fn>::PackSize> applyLoadMultimem(Fn fn, uintptr_t addr) {
return Apply_LoadMultimem<Fn>::load(fn, addr);
template<typename Fn, int BytePerPack>
__device__ __forceinline__ BytePack<BytePerPack> applyLoadMultimem(Fn fn, uintptr_t addr) {
return Apply_LoadMultimem<Fn, BytePerPack>::load(fn, addr);
}
////////////////////////////////////////////////////////////////////////////////
// Apply_Reduce
// Nonsensical base case
template<typename Fn>
struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
__device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
return {};
}
};
// General recursive definition (EltPerPack > 1). This is how we iterate over
// all elements in a pack of any size, by breaking it into halves. Eventually
// we'll hit a base case (a more specific template specialization which takes
@ -282,6 +295,14 @@ struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
return a;
}
};
// Base case definition (EltPerPack == 0), is nonsense!
template<typename Fn>
struct Apply_PreOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
// Apply_PostOp
@ -315,6 +336,14 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
return a;
}
};
// Base case definition (EltPerPack == 0), is nonsense!
template<typename Fn>
struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
@ -505,11 +534,6 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
////////////////////////////////////////////////////////////////////////////////
// Apply_LoadMultimem
template<typename Fn>
struct Apply_LoadMultimem {
static constexpr int PackSize = 0; // Indicates not implemented
};
#define SIZEOF_BytePack_field_u16 2
#define PTX_REG_BytePack_field_u16 "h"
@ -521,11 +545,11 @@ struct Apply_LoadMultimem {
#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>> { \
static constexpr int PackSize = 1*(SIZEOF_BytePack_field_##pack_field); \
struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.global." #op "." #ptx_ty " %0, [%1];" \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
return ans; \
@ -533,11 +557,11 @@ struct Apply_LoadMultimem {
};
#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>> { \
struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
@ -546,8 +570,45 @@ struct Apply_LoadMultimem {
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(sizeof(T)))); \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem {
__device__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
__trap();
return {};
}
};
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<typename Fn>
struct LoadMultimem_BigPackSize {
using T = typename Fn::EltType;
static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
std::is_same<Fn, FuncPreMulSum<T>>::value ||
std::is_same<Fn, FuncSumPostDiv<T>>::value;
static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
std::is_same<Fn, FuncMax<T>>::value;
static constexpr bool IsFloat = IsFloatingPoint<T>::value;
static constexpr int BigPackSize =
IsFloat && IsSum && sizeof(T) < 8 ? 16 :
IsFloat && IsSum ? 8 :
IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :
!IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :
/*multimem.ld_reduce not supported:*/ 0;
};
DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
@ -564,23 +625,30 @@ struct Apply_LoadMultimem {
DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
DEFINE_Apply_LoadMultimem_v4(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMax, half, max, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
#if defined(__CUDA_BF16_TYPES_EXIST__)
DEFINE_Apply_LoadMultimem_v4(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMax, __nv_bfloat16, max, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
#endif
#else
template<typename Fn>
struct LoadMultimem_BigPackSize {
static constexpr int BigPackSize = 0;
};
#endif
#undef DEFINE_Apply_LoadMultimem
#undef DEFINE_Apply_LoadMultimem_v4
#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf
#undef SIZEOF_BytePack_field_u64
#undef PTX_REG_BytePack_field_u64
#undef SIZEOF_BytePack_field_u32
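The LoadMultimem_BigPackSize cascade above decides, per reduction function and element type, the widest pack multimem.ld_reduce can use (0 meaning the multimem path is unavailable). A standalone restatement of that selection with the expected outcomes (the predicates are passed in as plain booleans here):

  constexpr int bigPackSize(bool isFloat, bool isSum, bool isMinOrMax, int sizeofT) {
    return isFloat && isSum && sizeofT < 8                 ? 16      // half/bf16/float sum -> 16B vector
         : isFloat && isSum                                ? 8       // double sum          -> 8B scalar
         : isFloat && isMinOrMax && sizeofT == 2           ? 16      // half/bf16 min/max   -> 16B vector
         : !isFloat && (isSum || isMinOrMax) && sizeofT >= 4 ? sizeofT // u32/s32/u64/s64    -> scalar
         : 0;                                                         // multimem.ld_reduce not supported
  }
  static_assert(bigPackSize(true,  true,  false, 4) == 16, "float sum");
  static_assert(bigPackSize(true,  true,  false, 8) ==  8, "double sum");
  static_assert(bigPackSize(true,  false, true,  2) == 16, "half/bf16 min/max");
  static_assert(bigPackSize(false, true,  false, 4) ==  4, "uint32 sum");
  static_assert(bigPackSize(true,  false, true,  4) ==  0, "float min/max has no multimem path");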

View File

@ -108,19 +108,19 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce through MC
// Reduce through NVLS
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);

View File

@ -11,14 +11,14 @@
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
__device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
}
} else {
@ -26,7 +26,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1);
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
@ -37,7 +37,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
template<typename Proto>
__device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
__device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
if (args->peer != ncclShmem.comm.rank) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
@ -45,7 +45,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1);
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
@ -65,11 +65,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
// warpStarts were rounded down by integer division, but for the group number we need to round the other way.
// So we mirror wid, then mirror the group back.
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
args += group;
tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
group |= 1<<16; // Used to select connIndex 1
if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
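The mirrored rounding above can be checked in isolation. Assuming warpStart[g] = g*NWARPS/ngroups (rounded down on the host side), the mirrored formula assigns each warp to the group whose warp range actually contains it, which plain wid*ngroups/NWARPS would not. A standalone sketch with NWARPS=8 and ngroups=3 (assumed values):

  constexpr int mirroredGroup(int wid, int nwarps, int ngroups) {
    return ngroups - 1 - (nwarps - 1 - wid) * ngroups / nwarps;
  }
  constexpr int naiveGroup(int wid, int nwarps, int ngroups) {
    return wid * ngroups / nwarps;   // rounds the wrong way at the boundaries
  }
  // naiveGroup:    warps 0-2 -> group 0, 3-5 -> 1, 6-7 -> 2  (group 2 short one warp)
  // mirroredGroup: warps 0-1 -> group 0, 2-4 -> 1, 5-7 -> 2  (matches warpStart = {0,2,5})
  static_assert(mirroredGroup(1, 8, 3) == 0 && mirroredGroup(2, 8, 3) == 1 && mirroredGroup(5, 8, 3) == 2, "");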

View File

@ -74,6 +74,8 @@ void ncclDebugInit() {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "PROXY") == 0) {
mask = NCCL_PROXY;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) {

View File

@ -33,7 +33,8 @@ struct ncclKernelMatch {
NCCL_FUNC5(func, RING, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \
NCCL_FUNC5(func, NVLS, devredop, type, specialized)
NCCL_FUNC5(func, NVLS, devredop, type, specialized), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, specialized)
#ifdef __CUDA_BF16_TYPES_EXIST__
#define HAVE_BFLOAT16 1
@ -215,12 +216,13 @@ static void finishWork(struct ncclWork* work) {
static void appendWorkElemP2p(
struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
struct ncclWorkElemP2p const *elem
struct ncclWorkElemP2p const *elem, bool fuseOk
) {
constexpr int funcIndex = FUNC_INDEX_P2P;
struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
if (q && funcIndex == q->work.header.funcIndex) {
if (!fuseOk) goto NewWork;
if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) {
for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) {
// Can't have multiple elements of the same ncclWork communicate with the
@ -349,7 +351,7 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);
// ensure *nWorkBudget >= 1 upon entry.
static ncclResult_t addP2pToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget,
bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes
bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, bool fuseOk
) {
struct ncclInfo info = {
isSendNotRecv ? ncclFuncSend : ncclFuncRecv,
@ -364,7 +366,7 @@ static ncclResult_t addP2pToPlan(
// 1 is connIndex
struct ncclConnInfo* conn = isSendNotRecv ?
&comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn;
&comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn;
info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE;
struct ncclProxyOp proxyOp = {};
@ -382,7 +384,7 @@ static ncclResult_t addP2pToPlan(
elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p
*nWorkBudget += plan->channels[channelId].nWork;
appendWorkElemP2p(comm, plan, channelId, &elem);
appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk);
*nWorkBudget -= plan->channels[channelId].nWork;
// Calculate the opCount after appendWorkElemP2p since it will always return
@ -553,7 +555,7 @@ static ncclResult_t scheduleCollTasksToPlan(
info.sliceSteps = head->sliceSteps;
NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
if (nAggOps > 1) {
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
info.algorithm = aggInfo.algorithm;
@ -578,7 +580,7 @@ static ncclResult_t scheduleCollTasksToPlan(
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
}
int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
tasks->nTasksColl -= 1;
@ -632,12 +634,15 @@ static ncclResult_t scheduleP2pTasksToPlan(
// Avoid overloading channels with 8+ operations, as we lose the sync warp and hence a bit of bandwidth.
while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
bool fuseOk;
// We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries.
while (tasks->nTasksP2p != 0) {
for (int i=0; i < nRanks; i++) {
for (int i=0; i < tasks->p2pOrderSteps; i++) {
int sendPeer = sendOrder[i];
int recvPeer = recvOrder[i];
struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendPeer].sendQueue);
struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvPeer].recvQueue);
if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false;
struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL;
struct ncclTaskP2p* recv = recvPeer != -1 ? ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL;
if (sendPeer == comm->rank) {
if (recvPeer != comm->rank) {
WARN("Sendrecv plan not aligned for self");
@ -676,7 +681,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (recvChunkBytes != 0) {
if (recvChunkBytes == -1) recvChunkBytes = 0;
if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes));
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, fuseOk));
fuseOk = true;
recvPtr += recvChunkBytes;
recvBytes -= recvChunkBytes;
recv->chunk += 1;
@ -689,7 +695,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (sendChunkBytes != 0) {
if (sendChunkBytes == -1) sendChunkBytes = 0;
if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes));
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, fuseOk));
fuseOk = true;
sendPtr += sendChunkBytes;
sendBytes -= sendChunkBytes;
send->chunk += 1;
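The fuseOk flag threaded through addP2pToPlan above gates whether a new element may be appended to the previous ncclWork: fusion is explicitly broken every NCCL_MAX_WORK_ELEMENTS_P2P/2 steps of the send/recv order and re-enabled once something has been appended after the break. A minimal control-flow sketch of that gating (my reading; not the full scheduler):

  void p2pFusionSketch(int orderSteps, int maxWorkElemsP2p) {
    bool fuseOk = false;
    for (int i = 0; i < orderSteps; i++) {
      if (i % (maxWorkElemsP2p / 2) == 0) fuseOk = false;  // force a fresh ncclWork here
      // appendWorkElemP2p(..., fuseOk): with fuseOk==false it starts a new work struct
      fuseOk = true;                                       // later ops in the block may fuse
    }
  }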
@ -822,12 +829,12 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
}
static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) {
uint64_t collOpCount = comm->collOpCount;
uint64_t collOpCount = comm->sharedRes->collOpCount;
// Advance comm's collOpCount by number of colls in this plan.
comm->collOpCount = collOpCount + plan->collOpCount;
comm->sharedRes->collOpCount += plan->collOpCount;
for (int c=0; c < plan->channelUbound; c++) {
struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue);
uint64_t p2pOpCount = comm->channels[c].p2pOpCount;
uint64_t p2pOpCount = comm->sharedRes->p2pOpCount[c];
uint64_t nextP2pOpCount = p2pOpCount;
while (q != nullptr) {
struct ncclProxyOp* qNext = q->enqNext;
@ -850,7 +857,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
q = qNext;
}
// Advance channel's p2pOpCount by number of p2p's in this plan channel.
comm->channels[c].p2pOpCount = nextP2pOpCount;
comm->sharedRes->p2pOpCount[c] = nextP2pOpCount;
}
return ncclSuccess;
}
@ -969,14 +976,14 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
// The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
// at least one of the two streams to be strong-stream.
cudaStream_t launchStream = tasks->streams->stream;
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure);
// Create dependency for device stream on user streams. First from extra user
// streams to deviceStream. Then deviceStream to first user stream.
for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) {
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);
}
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
// We have to launch host tasks to push proxy args. We are careful to only
@ -986,15 +993,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
if (plan->hasProxyOps) {
if (!acquired) {
acquired = true;
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
}
NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->hostStream, hostStreamPlanCallback, plan), result, failure);
NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
}
}
if (acquired) {
// Make to-be-launched kernels dependent on just-launched host stream tasks.
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
}
}
@ -1038,7 +1045,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) {
int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;
unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[3];
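Since the hunk above only shows the cluster size being picked up from comm->config.cgaClusterSize, here is a hedged, self-contained sketch of how a CGA cluster dimension is generally attached to a kernel launch with the CUDA 11.8+ extensible launch API (illustrative; not the exact NCCL launch path):

  #include <cuda_runtime.h>

  __global__ void dummyKernel() {}

  cudaError_t launchWithCluster(dim3 grid, dim3 block, cudaStream_t stream, unsigned clusterSize) {
    cudaLaunchConfig_t config = {};
    config.gridDim = grid;
    config.blockDim = block;
    config.dynamicSmemBytes = 0;
    config.stream = stream;

    cudaLaunchAttribute attr;
    attr.id = cudaLaunchAttributeClusterDimension;
    attr.val.clusterDim.x = clusterSize;   // e.g. comm->config.cgaClusterSize on sm_90
    attr.val.clusterDim.y = 1;
    attr.val.clusterDim.z = 1;

    config.attrs = &attr;
    config.numAttrs = clusterSize ? 1 : 0; // clusterSize==0 (pre-Hopper) sets no attribute
    return cudaLaunchKernelEx(&config, dummyKernel);
  }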
@ -1110,7 +1117,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
// Create dependency for deviceStream on launchStream. We know that deviceStream
// hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
// so we can say that launchStream subsumes it.
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
resume1:
// Create dependency for other user streams (skip launch stream) on deviceStream.
// Again, the user streams haven't been touched since deviceStream waited on them
@ -1118,12 +1125,12 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
struct ncclCudaStreamList* sl = tasks->streams->next;
tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
while (sl != nullptr) {
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2);
resume2:
sl = sl->next;
}
// Release device stream as acquired in ncclLaunchPrepare()
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume3);
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3);
resume3:;
}
return result;
@ -1160,6 +1167,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
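The new checks above gate NVLS and NVLS+Tree differently: both require NCCL_NVLS_SUPPORTS for the datatype/op, but plain NVLS is additionally skipped on multi-node jobs unless CollNet (network-side reduction) is available. A small predicate sketch of that reading (names are mine):

  bool nvlsEligible(bool nvlsSupportsDtypeOp, int nNodes, int collNetTypeSupport) {
    // multi-node NVLS relies on the network doing the inter-node reduction
    return nvlsSupportsDtypeOp && (nNodes == 1 || collNetTypeSupport == 1);
  }
  bool nvlsTreeEligible(bool nvlsSupportsDtypeOp) {
    return nvlsSupportsDtypeOp;   // NVLS+Tree does its own inter-node exchange
  }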
@ -1193,7 +1202,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
}
ncSwitch /= 2;
}
} else if (info->algorithm == NCCL_ALGO_NVLS) {
} else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
// NVLS should not need more than 16 channels to get peak BW.
nc = comm->nvlsChannels;
} else {
@ -1205,12 +1214,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
}
}
if (info->protocol == NCCL_PROTO_SIMPLE) {
nt += WARP_SIZE; // Extra warp for sync
if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
// More threads or sync warps needed due to split thread model
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
}
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
info->nChannels = nc;
@ -1226,10 +1232,13 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
case ncclFuncReduceScatter:
case ncclFuncAllGather:
info->pattern = ncclPatternRing; break;
info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
ncclPatternRing; break;
case ncclFuncAllReduce:
info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@ -1249,14 +1258,17 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollnetChain:
info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
case ncclPatternNvls:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break;
case ncclPatternCollnetDirect:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternRingTwice:
info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternNvlsTree:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break;
default:
WARN("Unknown pattern %d", info->pattern);
return ncclInternalError;
@ -1326,13 +1338,22 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
if (chunkSize > 131072) chunkSize = 131072;
int maxChunkSize = 131072;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (64 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (1 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
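The NVLS and NVLS_TREE branches above shrink the chunk size when the message is too small to keep all concurrent operations busy at the initial chunk size. A standalone restatement of the NVLS_TREE cascade with two worked inputs (the starting chunkSize of 512K is an assumed value, not taken from the diff):

  constexpr long long nvlsTreeChunk(long long nBytes, long long concurrentOps, long long chunk) {
    if (nBytes < 32 * concurrentOps * chunk && chunk > 262144) chunk = 262144;
    if (nBytes < 16 * concurrentOps * chunk && chunk > 131072) chunk = 131072;
    if (nBytes <  4 * concurrentOps * chunk && chunk >  65536) chunk =  65536;
    if (nBytes <  1 * concurrentOps * chunk && chunk >  32768) chunk =  32768;
    return chunk;
  }
  // nChannels=16, nHeads=4 -> concurrentOps = 64
  static_assert(nvlsTreeChunk(256LL<<20, 64, 524288) == 262144, "256 MiB settles at 256K chunks");
  static_assert(nvlsTreeChunk(2LL<<20,   64, 524288) ==  32768, "2 MiB drops all the way to 32K");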
@ -1361,8 +1382,7 @@ comp_next:
proxyOp->chunkSize = chunkSize;
proxyOp->protocol = info->protocol;
proxyOp->dtype = info->datatype;
proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet
info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
info->op;
proxyOp->pattern = info->pattern;
proxyOp->root = info->root;
@ -1476,12 +1496,12 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
int channelId;
NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
if (isSendNotRecv) {
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1UL<<channelId);
ncclGroupCommPreconnect(comm);
}
} else {
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1UL<<channelId);
ncclGroupCommPreconnect(comm);
}
@ -1576,10 +1596,10 @@ exit:
NCCLCHECK(ncclGroupEndInternal());
/* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
* so we have to check state here. */
if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
return ret;
fail:
if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret);
if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
goto exit;
}

View File

@ -14,9 +14,7 @@
/********************* Internode connection ***********************/
/******************************************************************/
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks) {
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
@ -35,9 +33,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
int* collNetIntra = collNetGraph->intra+c*localRanks;
int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
@ -48,8 +47,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
}
if (treeIntra[i] == rank) {
int parentIndex = 0;
int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
@ -64,6 +63,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
topoRanks->nvlsHeads[c] = nvlsIntra[0];
}
// Duplicate channels rings/trees
struct ncclChannel* channel0 = comm->channels;
@ -72,26 +72,26 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
return ncclSuccess;
}
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
int nChannels = comm->nChannels;
int nNodes = comm->nNodes;
for (int c=0; c<nChannels; c++) {
int* recv = ringRecv+c*comm->nRanks;
int* send = ringSend+c*comm->nRanks;
int* recv = ringRecv+c*comm->nNodes;
int* send = ringSend+c*comm->nNodes;
int* prev = ringPrev+c*comm->nRanks;
int* next = ringNext+c*comm->nRanks;
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
for (int n=0; n<nNodes; n++) {
int recvRank = recv[firstRanks[n]];
int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]];
int recvRank = recv[n];
int prevSendRank = send[(n-1+nNodes)%nNodes];
prev[recvRank] = prevSendRank;
if (comm->rank == recvRank) {
channel0->ring.prev = prevSendRank;
channel1->ring.prev = prevSendRank;
}
int sendRank = send[firstRanks[n]];
int nextRecvRank = recv[firstRanks[(n+1)%nNodes]];
int sendRank = send[n];
int nextRecvRank = recv[(n+1)%nNodes];
next[sendRank] = nextRecvRank;
if (comm->rank == sendRank) {
channel0->ring.next = nextRecvRank;
@ -104,8 +104,8 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
return ncclSuccess;
}
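connectRings now walks node indices directly: recv[n] and send[n] are the ranks where node n's intra-node ring enters and leaves, and the tails and heads of neighbouring nodes are stitched together. A standalone sketch of that stitching:

  #include <vector>

  // prev/next are sized nRanks; recv/send are sized nNodes (one head/tail rank per node).
  void stitchRings(const std::vector<int>& recv, const std::vector<int>& send,
                   std::vector<int>& prev, std::vector<int>& next) {
    int nNodes = (int)recv.size();
    for (int n = 0; n < nNodes; n++) {
      prev[recv[n]] = send[(n - 1 + nNodes) % nNodes];  // this node's head listens to the previous tail
      next[send[n]] = recv[(n + 1) % nNodes];           // this node's tail feeds the next head
    }
  }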
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
for (int n=0; n<nNodes; n++) indexes[n] = ranks[firstRanks[n]];
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
return ncclSuccess;
}
@ -127,48 +127,42 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
return ncclSuccess;
}
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
int* ranksToParent, *ranksToChild0, *ranksToChild1;
NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int* ttp, *ttc0, *ttc1;
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
if (comm->rank == ranksToParent[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if (comm->rank == ranksToChild0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if (comm->rank == ranksToChild1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if (comm->rank == ranksToParent[node] ||
comm->rank == ranksToChild0[node] ||
comm->rank == ranksToChild1[node]) {
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
free(ranksToParent);
free(ranksToChild0);
free(ranksToChild1);
return ncclSuccess;
}
@ -221,10 +215,96 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
INFO(NCCL_GRAPH, "%s", line);
channel->collnetChain.depth = comm->nRanks/comm->nNodes;
}
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
}
free(heads);
return ncclSuccess;
}
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) {
int nHeads = nvlsGraph->nChannels;
int headRank = -1;
for (int h=0; h<nHeads; h++) {
if (nvlsGraph->intra[h*comm->localRanks] == comm->rank) headRank = h;
}
if (nHeads == 0) {
comm->nvlsChannels = 0;
return ncclSuccess;
}
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nHeads;
for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1;
channel->nvls.down = comm->nRanks+1+headRank;
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
channel->nvls.headRank = headRank;
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
channel->nvls.node = comm->node;
channel->nvls.nNodes = comm->nNodes;
}
if (comm->nNodes == 1) return ncclSuccess;
// Connect Trees
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
int pc0, pc1; // ignored
NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
&tree0Parent, &tree0Child0, &tree0Child1, &pc0,
&tree1Parent, &tree1Child0, &tree1Child1, &pc1));
int* heads = NULL;
int treeUp[2] = { -1, -1 };
int treeDown0[2] = { -1, -1 };
int treeDown1[2] = { -1, -1 };
if (comm->node == 0) {
for (int h=0; h<nHeads; h++) {
char line[1024];
sprintf(line, "NVLS Head %2d:", h);
heads = nvlsHeads+h*comm->nNodes;
for (int n=0; n<comm->nNodes && n<20; n++) {
sprintf(line+strlen(line), " %2d", heads[n]);
}
INFO(NCCL_INIT, "%s", line);
}
}
// Find the heads where I'm the head rank and retain tree up/down
for (int h=0; h<nHeads; h++) {
heads = nvlsHeads+h*comm->nNodes;
if (heads[comm->node] == comm->rank) {
treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent];
treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
break;
}
}
// Set prev/next in all channels (NVLS compute channels work
// orthogonally to NVLS search channels).
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.treeUp = treeUp[c%2];
channel->nvls.treeDown[0] = channel->nvls.down;
int ix = 1;
if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2];
if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
}
struct ncclNvls* nvls0 = &comm->channels[0].nvls;
struct ncclNvls* nvls1 = &comm->channels[1].nvls;
INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp,
nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp);
return ncclSuccess;
}
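connectNvls addresses the NVLS connectors with peer indices past the last real rank: up[h] = nRanks+1+h for each head, down = nRanks+1+headRank, and out becomes nRanks once CollNet is attached (see connectCollNet above). My reading is that indices >= nRanks are virtual peers naming the multicast/CollNet connections rather than actual ranks; a compact sketch of that numbering (the array bound 8 stands in for NCCL_MAX_NVLS_ARITY):

  struct NvlsPeersSketch {
    int out;     // nRanks when CollNet/IB SHARP is attached, otherwise -1
    int down;    // nRanks+1+headRank : this rank's own head connector
    int up[8];   // nRanks+1+h for each of the nHeads heads, -1 beyond nHeads
  };

  NvlsPeersSketch nvlsPeers(int nRanks, int nHeads, int headRank, bool collNetAttached) {
    NvlsPeersSketch p;
    p.out  = collNetAttached ? nRanks : -1;
    p.down = nRanks + 1 + headRank;
    for (int h = 0; h < 8; h++) p.up[h] = h < nHeads ? nRanks + 1 + h : -1;
    return p;
  }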
// Legacy naming
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
@ -266,33 +346,40 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
return c;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
for (int i=0; i<nranks; i++) {
for (int c=0; c<nChannels;c++) {
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
for (int c=0; c<nChannels;c++) {
for (int n=0; n<nNodes; n++) {
int r = firstRanks[n];
ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
}
for (int r=0; r<nranks; r++) {
ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS]));
// Duplicate ringPrev/ringNext for ncclBuildRing
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@ -303,6 +390,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Setup CollNet
if (comm->collNetSupport == 1) {
struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
@ -311,10 +399,21 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
NCCLCHECK(connectCollNet(comm, collNetGraph));
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->minCTAs), ringPrev, ringNext);
if (comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
}
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
@ -326,6 +425,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
free(nvlsHeads);
return ncclSuccess;
}
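A note on the layouts used above: ringRecv/ringSend, treeToParent/treeToChild0/treeToChild1 and nvlsHeads are now sized per node (only the first rank of each node, firstRanks[n], contributes), while ringPrev/ringNext remain per rank. A hedged sketch of the two accessors, assuming the channel-major layout implied by the ncclCalloc sizes:
static inline int nodeEntry(const int* a, int nNodes, int c, int n) { return a[c*nNodes + n]; } // ringRecv/ringSend/treeTo*/nvlsHeads
static inline int rankEntry(const int* a, int nranks, int c, int r) { return a[c*nranks + r]; } // ringPrev/ringNext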
View File
@ -538,6 +538,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system));
}
// Set direct paths to NVSwitches.
for (int n=0; n<system->nodes[NVS].count; n++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int p=0; p<system->nodes[GPU].count; p++) {
@ -564,7 +569,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if (shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
@ -578,32 +583,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable(comm) != 1) {
int pxnGpu = -1;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
int localGpuIndex;
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
if (localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink
(peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC
gpu->paths[NET][n].type <= PATH_PXB)) // or avoids going through a CPU
continue;
pxnGpu = p;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
// To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
if (netDev == netNode->id) break;
}
if (pxnGpu != -1) {
if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
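The PXN eligibility test above, pulled out as a standalone predicate for readability (a sketch, not part of the patch): the relay GPU must reach the NIC at PXB or better, reach us over NVLink, and either see more bandwidth to that NIC than we do or save us a trip through the CPU.
// Hedged restatement of the condition above.
static inline bool pxnUsable(struct ncclTopoNode* peer, struct ncclTopoNode* gpu, int g, int n) {
  return peer->paths[NET][n].type <= PATH_PXB &&            // relay reaches the NIC through PCI
         peer->paths[GPU][g].type <= PATH_NVL &&            // and reaches us over NVLink
         (peer->paths[NET][n].bw > gpu->paths[NET][n].bw || // and either has more BW to that NIC
          gpu->paths[NET][n].type > PATH_PXB);              // or we would otherwise cross the CPU
}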
// Update path when we don't want to / can't use GPU Direct RDMA.
@ -632,7 +625,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
domains[g] = g;
ids[g] = gpu->id;
for (int p=0; p<g; p++) {
if (gpu->paths[GPU][p].count > 0) {
if (gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
@ -708,8 +701,14 @@ static int nextPow2(int v) {
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
if (comm->sharedRes->owner != comm) {
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()), comm->sharedRes->tpP2pNChannels);
} else {
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
}
int minChannels = comm->p2pnChannels;
// We need to loop through all local GPUs to have a global picture
for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
View File
@ -10,6 +10,8 @@
#include "xml.h"
#include <math.h>
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
// max bw.
static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
@ -106,11 +108,15 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
if (type1 == -1) return ncclSuccess;
struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
struct ncclTopoLinkList* path = node1->paths[type2]+index2;
if (path == NULL) {
WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
return ncclInternalError;
}
if (path->count == 0 ) return ncclSuccess;
// Now check link type
*node = NULL;
int intra = type1 == GPU && type2 == GPU;
int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
float bw = intra ? graph->bwIntra : graph->bwInter;
int type = intra ? graph->typeIntra : graph->typeInter;
@ -290,17 +296,53 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
// 1. Constraint to get the same nChannels between Rings and Trees
ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
struct ncclTopoNode* nvs;
struct ncclTopoNode* gpu;
int d0=0; // See if there is enough bandwidth for NVS->GPU traffic
do {
NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
d0++;
} while (gpu && d0 < system->nodes[GPU].count);
if (gpu == NULL) {
d0--;
} else {
int d1=0; // See if there is enough bandwidth for GPU->NVS traffic
do {
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
d1++;
} while (nvs && d1 < system->nodes[GPU].count);
if (nvs == NULL) {
d1--;
} else { // Both directions worked. Move on to the next path.
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
}
while (d1) {
d1--;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
}
}
while (d0) {
d0--;
NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
}
return ncclSuccess;
}
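The NVLS search above follows the usual reserve/rollback pattern: each forward ncclTopoFollowPath call consumes mult x bw along the NVSwitch<->GPU path (2x for the GPU being placed, 1x for the others), and the trailing loops replay the same calls with a negated multiplier to hand the bandwidth back before returning. A self-contained toy of that bookkeeping (hypothetical names; it shows the take-or-undo step only):
#include <vector>
// Reserve 'need' GB/s on every edge, or undo whatever was taken and fail.
static bool reserveAll(std::vector<float>& bw, float need) {
  size_t i = 0;
  for (; i < bw.size() && bw[i] >= need; i++) bw[i] -= need; // reserve edge by edge
  if (i == bw.size()) return true;                           // every edge had enough headroom
  while (i--) bw[i] += need;                                 // roll back partial reservations
  return false;
}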
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
// 1. Try to get the same nChannels between Rings and Trees
if (graph->nChannels < graph->minChannels) return ncclSuccess;
// 2. Try to get better bandwidth
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
// Give a 15% perf bonus to paths not crossing nics
float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) {
*copy = 1;
return ncclSuccess;
}
// 3. Less hops (but not at the price of going cross NICs)
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess;
// 3. Less hops
if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
return ncclSuccess;
}
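A worked example of the 15% cross-NIC bonus above: if the reference graph crosses NICs (crossNic=1) at 4 channels x 10 GB/s = 40 GB/s and the candidate does not (crossNic=0), then target = 1.0 - (1-0)*0.15 = 0.85, so the candidate replaces it as soon as it exceeds 40*0.85 = 34 GB/s. Conversely, a crossing candidate compared against a non-crossing reference gets target = 1.15 and must beat it by 15%; two graphs with the same crossNic setting compete on raw nChannels*bwIntra as before.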
@ -365,7 +407,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
// Determine whether we found a better solution or not
int copy = 0;
graph->nChannels++;
NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
NCCLCHECK(ncclTopoCompareGraphs(system, graph, saveGraph, &copy));
if (copy) {
memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
if (graph->nChannels == graph->maxChannels) *time = -1;
@ -417,6 +459,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
}
free(nets);
}
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
int next[NCCL_TOPO_MAX_NODES];
@ -570,7 +614,10 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
// Intra-node only.
if (graph->nChannels == 0) {
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
return ncclSuccess;
} else if (graph->nChannels == 0) {
// Try PCI order first
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
} else {
@ -637,7 +684,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
int crossNic;
NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess;
if (ncclParamCrossNic() == 0 && crossNic == 1) return ncclSuccess;
graph->crossNic = crossNic;
NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
@ -726,29 +773,31 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
return ncclSuccess;
}
float speedArrayIntra[] = { 44.0, 30.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
float sm90SpeedArrayIntra[] = { 66.0, 33.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 30.0, 24.0, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
graph->crossNic = ncclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
graph->nChannels = 0;
graph->sameChannels = 1;
int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
graph->sameChannels = trySameChannels;
char* str = getenv("NCCL_GRAPH_FILE");
if (str) {
@ -763,10 +812,16 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (graph->nChannels > 0) return ncclSuccess;
}
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
}
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
@ -783,7 +838,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
}
int pass = 1;
int speedIndex = 0;
while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
float maxBw = system->maxBw;
if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) maxBw /= ngpus; // We want all GPUs to pull the same BW
while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
@ -817,7 +874,7 @@ search:
tmpGraph.sameChannels = 0;
goto search;
}
tmpGraph.sameChannels = 1;
tmpGraph.sameChannels = trySameChannels;
if (time != -1) globalTimeout += time;
else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
@ -856,7 +913,7 @@ search:
goto search;
}
speedIndex = 0;
while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
}
@ -885,7 +942,7 @@ done:
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
}
if (graph->nChannels == 0 && graph->collNet == 0) {
if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
@ -894,7 +951,7 @@ done:
graph->nChannels = 1;
}
if ((ccMin <= 80 && graph->bwIntra >= 25.0) || (ccMin <= 90 && graph->bwIntra >= 50.0)) {
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS && ((ccMin <= 80 && graph->bwIntra >= 25.0) || (ccMin <= 90 && graph->bwIntra >= 50.0))) {
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
@ -943,23 +1000,40 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
#include "comm.h"
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) {
int localRanks = comm->topo->nodes[GPU].count;
for (int c=0; c<graph->nChannels; c++) {
if (graph->intra[c*localRanks] == comm->rank) {
*dev = graph->inter[c*2];
return ncclSuccess;
}
}
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
return ncclInternalError;
}
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
#include "comm.h"
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
*dev = graph->inter[channel*2+index];
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
*dev = graph->inter[channel*2+index];
} else {
NCCLCHECK(getNvlsNetDev(comm, graph, dev));
}
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
} else if (peerRank == -1) {
return ncclInternalError;
} else {
// Start with our local NIC and local Rank
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev));
*proxyRank = rank;
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
@ -969,7 +1043,9 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
int cudaDev = comm->peerInfo[peerRank].cudaDev;
int localRank;
if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
int netDev = comm->peerInfo[localRank].netDev;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
int n;
// Check that device exists on our node
if (ncclParamCrossNic() == 0) {
@ -989,20 +1065,17 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
}
} else if (pxnLevel == 2) {
// Check whether we can access it through our node-local GPU for that NIC.
for (int r=0; r<comm->localRanks; r++) {
int peerRank = comm->localRankToRank[r];
if (comm->peerInfo[peerRank].netDev == netDev) {
int g1, g2, n;
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2));
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerRank;
*dev = netDev;
return ncclSuccess;
}
// Check which local GPU corresponds to that NIC and see if we can use PXN.
int n, g1, g2;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2));
if (g2 != -1) {
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerGpu->gpu.rank;
*dev = netDev;
return ncclSuccess;
}
}
}
View File
@ -646,11 +646,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
}
}
if (netDevCount == 0) {
NCCLCHECK(ncclNetDevices(comm, &netDevCount));
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, n, &props));
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@ -679,10 +679,8 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) {
int g;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
int minType = PATH_SYS;
static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
int minType = PATH_DIS;
float maxBw = 0;
int count = 0;
int* nets;
@ -692,20 +690,115 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i
if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
maxBw = path->bw;
minType = path->type;
if (type) *type = minType;
count = 0;
}
if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
if (count == 0) {
*id = -1;
free(nets);
*localNetMask = 0ULL;
for (int n=0; n<count; n++) {
if (nets[n] >= 64) return ncclInternalError;
*localNetMask |= 1ULL<<nets[n];
}
free(nets);
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
uint64_t* localNetMasks;
int ngpus = system->nodes[GPU].count;
NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
// Fill localNetMasks for all GPUs.
for (int g=0; g<ngpus; g++) {
NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
}
// Find GPUs which have the same mask as rank, i.e. share the same local Nets.
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
int netLocalGpus = 0, netLocalGpu = 0;
for (int g=0; g<ngpus; g++) {
if (localNetMasks[g] == localNetMasks[gpu]) {
if (g == gpu) netLocalGpu = netLocalGpus;
netLocalGpus++;
}
}
uint64_t localNetMask = localNetMasks[gpu];
free(localNetMasks);
if (localNetMask == 0) return ncclInternalError;
// Round robin on GPUs and channels
int gIndex = 0, cId = 0, n = 0;
while (1) {
if (1ULL << n & localNetMask) {
if (gIndex == netLocalGpu && cId == channelId) {
*id = n;
return ncclSuccess;
}
gIndex++;
if (gIndex == netLocalGpus) {
gIndex = 0;
cId++;
}
}
n = (n+1) % 64;
}
}
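As I read the loop above, it admits a closed form: the k-th set-bit visit checks (k % netLocalGpus, k / netLocalGpus), so the NIC chosen for (netLocalGpu, channelId) is set bit number (channelId*netLocalGpus + netLocalGpu) % popcount(localNetMask). A hedged sketch for illustration (not part of the patch):
#include <cstdint>
static int nthSetBit(uint64_t mask, int k) {            // index of the k-th lowest set bit
  for (int n = 0; n < 64; n++)
    if (mask & (1ULL << n)) { if (k-- == 0) return n; }
  return -1;
}
static int pickLocalNet(uint64_t localNetMask, int netLocalGpus, int netLocalGpu, int channelId) {
  int nNets = __builtin_popcountll(localNetMask);        // number of equally-good local NICs
  return nthSetBit(localNetMask, (channelId*netLocalGpus + netLocalGpu) % nNets);
}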
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
int ngpus = system->nodes[GPU].count;
int* gpus;
NCCLCHECK(ncclCalloc(&gpus, ngpus));
// Find localNetMask which includes net with the most local GPUs.
int netLocalGpus = 0, minType = PATH_DIS;
uint64_t localNetMask = 0ULL;
for (int g=0; g<ngpus; g++) {
int type = PATH_DIS;
uint64_t mask;
NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
if ((1ULL<<net) & mask) {
if (type < minType) {
localNetMask = mask;
netLocalGpus = 0;
minType = type;
}
if (type == minType) {
if (localNetMask && mask != localNetMask) {
WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
free(gpus);
return ncclInternalError;
}
gpus[netLocalGpus] = g;
netLocalGpus++;
}
}
}
if (localNetMask == 0ULL) {
*gpuIndex = -1;
free(gpus);
return ncclSuccess;
}
int rr = system->nodes[GPU].nodes[g].gpu.dev;
*id = nets[rr%count];
free(nets);
return ncclSuccess;
// Round robin on GPUs and channels
int gIndex = 0, cId = 0, n = 0;
while (1) {
if (1ULL << n & localNetMask) {
if (n == net) {
*gpuIndex = gpus[gIndex];
free(gpus);
return ncclSuccess;
}
gIndex++;
if (gIndex == netLocalGpus) {
gIndex = 0;
cId++;
}
}
n = (n+1) % 64;
}
}
/****************************/
@ -785,6 +878,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
return ncclSuccess;
}
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) {
*count = system->nodes[GPU].count;
return ncclSuccess;
}
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
*count = system->nodes[NET].count;
return ncclSuccess;
View File
@ -12,12 +12,13 @@
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 22.0
#define SM80_NVLINK_BW 22.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 9.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
@ -72,7 +73,12 @@ extern const char* topoLinkTypeStr[];
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7
#define PATH_DIS 7
// Connection through the network
#define PATH_NET 8
// Disconnected
#define PATH_DIS 9
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
@ -195,6 +201,7 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
// Returns NVLink bw in GB/s
static float ncclTopoNVLinkBw(int cudaCompCap) {
return
cudaCompCap >= 90 ? SM90_NVLINK_BW :
cudaCompCap == 86 ? SM86_NVLINK_BW :
cudaCompCap >= 80 ? SM80_NVLINK_BW :
cudaCompCap >= 70 ? SM70_NVLINK_BW :
View File
@ -53,26 +53,30 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }, { 0, 0, 40.0 }};
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
{ 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
{ 6.8, 14.0, 0 }, { 6.8, 14.0, 0 }, // Collnet Direct, Chain
{ 0, 0, 23.0 }, { 0, 0, 23.0 }}; // NVLS, NVLS Tree
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
#define NCCL_HW_PCI 1
#define NCCL_HW_NET 2
// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
// Ring/LL128 reflects the latency for the second plateau, not the base latency.
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
/* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 4.75 },
/* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
/* NVLS */ { 0, 0, 0 } },
/* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 },
/* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 },
/* NVLS */ { 0, 0, 0 } }
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
/* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 14 },
/* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 19 } }
};
/* Array indexes used below */
@ -94,15 +98,28 @@ static const double perChMaxTreeBws[3][3] = {
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
};
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
// Network post overhead in ns (1000 = 1 us)
NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
static float getNetOverhead(struct ncclComm* comm) {
if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
else return 1.0;
}
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@ -124,7 +141,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@ -140,18 +156,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = graphs[a]->nChannels * bw;
// Various model refinements
if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
@ -165,30 +179,39 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
factor -= (factor-1)/2;
busBw /= factor;
}
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;
// Convert bus BW to algorithm BW
float ratio;
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS) ratio = .75;
else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
else ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : hwLat[NCCL_HW_NET][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter;
// Also add the flush extra latency
if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
if (ringGraph->sameChannels) {
if (graphs[a]->sameChannels) {
comm->latencies[coll][a][p] += lat;
} else {
if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
comm->latencies[coll][a][p] += nsteps*lat;
}
} else {
// Inter-node rings still have to launch nsteps * net overhead.
float netOverhead = 0.0;
if (nNodes > 1) {
netOverhead = getNetOverhead(comm);
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
}
intraLat = std::max(intraLat, netOverhead);
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
}
} else if (a == NCCL_ALGO_TREE) {
@ -198,7 +221,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat;
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
} else if (a == NCCL_ALGO_NVLS) {
if (nNodes > 1) comm->latencies[coll][a][p] += hwLat[NCCL_HW_NET][a][p];
} else if (a == NCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += 2*(nNodes-1)*hwLat[NCCL_HW_NET][a][p];
}
}
}
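Putting the inter-node ring term above in one place, as a hedged sketch (it assumes nsteps/nInterSteps are computed as elsewhere in this function, and that the caller passes netOverheadUs = 0 on a single node):
#include <algorithm>
// Simple-protocol rings pay max(hop latency, 3x network post overhead) per intra-node
// step, plus the full network latency per inter-node step, on top of the base latency.
static float ringCollLatencyUs(float baseLat, float intraLat, float interLat,
                               float netOverheadUs, int nsteps, int nInterSteps, bool simple) {
  float perIntraStep = std::max(intraLat, simple ? 3.0f*netOverheadUs : netOverheadUs);
  return baseLat + (nsteps - nInterSteps)*perIntraStep + nInterSteps*interLat;
}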
@ -207,7 +234,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) {
@ -220,15 +247,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
}
// Disable NVLink SHARP if not supported
if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
// Disable CollNet if it is not supported
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
}
@ -262,28 +290,38 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (comm->rank == 0) {
char line[1024];
sprintf(line, "Latency/AlgBw |");
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %7s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
for (int block=0; block<2; block++) {
sprintf(line, " Algorithm |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], "");
}
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Protocol |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
}
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
}
}
}
@ -340,8 +378,8 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) {
lat *= info->comm->minCompCap < 90 ? 1.9 : 1.5; // Plateau effect of ring
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
}
// Tree pipelining saves latency in aggregation cases
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
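The new plateau condition above is per (rank, channel) rather than a fixed total: the 1.4x/1.9x ring latency factor now applies once each rank-channel pair carries at least 64 bytes, e.g. 16 ranks x 16 channels x 64 B = 16 KiB of AllReduce payload, instead of the old nRanks/16 x 64 KiB threshold.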
View File
@ -46,8 +46,8 @@ ncclResult_t ncclAsyncLaunch(
/* check if there are blocking and nonblocking comms at the same time in group. */
if (ncclGroupBlocking == -1) {
/* first met communicator */
ncclGroupBlocking = comm->blocking;
} else if (ncclGroupBlocking != comm->blocking) {
ncclGroupBlocking = comm->config.blocking;
} else if (ncclGroupBlocking != comm->config.blocking) {
WARN("Blocking and nonblocking communicators are not allowed in the same group.");
ret = ncclInvalidArgument;
}
@ -242,7 +242,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
}
if (!comm->blocking)
if (!comm->config.blocking)
(void) ncclCommSetAsyncError(comm, error);
comm = next;
}
@ -251,7 +251,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
*job->abortFlag = 1;
if (job->comm && !job->comm->blocking)
if (job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, error);
if (job->undo) job->undo(job);
if (job->destructor) job->destructor((void*)job);
@ -346,7 +346,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (job->comm && !job->comm->blocking)
if (job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
@ -355,7 +355,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclComm* next = comm->groupNext;
(void) ncclGroupCommLeave(comm);
if (!comm->blocking) {
if (!comm->config.blocking) {
(void) ncclCommSetAsyncError(comm, ret);
}
groupCommHeadMain = next;
View File
@ -13,6 +13,9 @@
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_POWER(x, y) \
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
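For reference, the existing rounding macros at work (a hedged example; DIVUP is assumed to be the usual ((x)+(y)-1)/(y) defined earlier in this header):
static_assert(ROUNDUP(1000, 256) == 1024, "rounds up to the next multiple of 256");
static_assert(ROUNDUP(1024, 256) == 1024, "already-aligned sizes are unchanged");
// ALIGN_SIZE does the same rounding in place:
//   size_t bytes = 1000; ALIGN_SIZE(bytes, 256);   // bytes == 1024
// ALIGN_POWER rounds x up to a multiple of y when x > y, and otherwise maps x to
// y/(y/x); for powers of two x <= y that leaves x unchanged.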
View File
@ -11,6 +11,7 @@
#include "checks.h"
#include "align.h"
#include "utils.h"
#include "p2p.h"
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
@ -72,13 +73,88 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
return ncclSuccess;
}
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
CUdevice currentDev;
CUmemAllocationProp prop = {};
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
int cudaDev;
int flag = 0;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
prop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &prop, 0));
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
/* Now allow RW access to the newly mapped memory */
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = currentDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
if (handlep) *handlep = handle;
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
return result;
}
static inline ncclResult_t ncclCuMemFree(void *ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
CUmemGenericAllocationHandle handle;
size_t size = 0;
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
#else
extern int ncclCuMemEnable();
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
static inline ncclResult_t ncclCuMemFree(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
#endif
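A minimal usage sketch of the new allocator (assumptions: CUDA >= 11.3, the right device is current, and we are not capturing a graph; error handling kept to NCCLCHECK):
// Hedged sketch, not part of the patch.
static ncclResult_t cuMemDemo(void) {
  void* buf = NULL;
  CUmemGenericAllocationHandle handle;
  NCCLCHECK(ncclCuMemAlloc(&buf, &handle, 1 << 20)); // size is rounded up to the granularity
  // 'buf' behaves like ordinary device memory; 'handle' can be exported for P2P thanks to
  // the NCCL_P2P_HANDLE_TYPE requested in the allocation properties above.
  NCCLCHECK(ncclCuMemFree(buf));                     // unmap, release the handle, free the VA range
  return ncclSuccess;
}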
template <typename T>
ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
}
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
@ -96,7 +172,11 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in
// Need a side stream so as not to interfere with graph capture.
cudaStream_t stream;
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
}
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
@ -114,7 +194,11 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
}
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
@ -155,8 +239,13 @@ template <typename T>
ncclResult_t ncclCudaFree(T* ptr) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaFree(ptr), result, finish);
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
} else {
CUDACHECKGOTO(cudaFree(ptr), result, finish);
}
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
return result;
View File
@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
View File
@ -9,7 +9,9 @@
#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
View File
@ -50,11 +50,12 @@ struct ncclDevRedOpFull {
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
#define DECL3(func, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef)
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef) \
DECL4(func, NVLS_TREE, devredop, type, undef)
#if defined(__CUDA_BF16_TYPES_EXIST__)
#define DECL2(func, devredop, undefForFloat) \
View File
@ -96,18 +96,51 @@ struct ncclCommCallback {
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
struct ncclSharedResources {
int refCount;
struct ncclComm* owner; /* comm which creates this shared res. */
struct ncclChannelPeer* peers[MAXCHANNELS];
struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
/* P2P operation counter, one per channel */
uint64_t p2pOpCount[MAXCHANNELS];
/* Collective operation counter */
uint64_t collOpCount;
int tpNRanks;
int tpNLocalRanks;
int tpNChannels;
int tpP2pNChannels;
int tpP2pChunkSize;
uint64_t magic;
// top parent rank to localRank translation table
int* tpRankToLocalRank;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
/* proxy related shared res */
struct ncclProxyState* proxyState;
};
struct ncclChannel {
struct ncclChannelPeer* peers;
struct ncclDevChannelPeer* devPeers;
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount;
/* comm split sharable resources */
struct ncclChannelPeer* collnetPeers;
struct ncclDevChannelPeer* collnetDevPeers;
struct ncclChannelPeer* nvlsPeers;
struct ncclDevChannelPeer* nvlsDevPeers;
};
struct ncclWorkList {
@ -161,6 +194,10 @@ struct ncclComm {
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
struct ncclSharedResources* sharedRes;
/* map to top parent ranks. */
int* topParentRanks;
int* topParentLocalRanks;
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
@ -174,11 +211,12 @@ struct ncclComm {
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int compCap; // compute capability of the GPU
int minCompCap; // min compute capability in the communicator
int minCompCap, maxCompCap; // min/max compute capability in the communicator
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int cudaArch; // matches __CUDA_ARCH__ of device
@ -199,12 +237,11 @@ struct ncclComm {
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Collective operation counter
uint64_t collOpCount;
// Channels for collectives
int nChannels;
int nvlsChannels;
int collNetChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
@ -229,6 +266,8 @@ struct ncclComm {
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
@ -255,21 +294,23 @@ struct ncclComm {
char intraPad2[64 - sizeof(uint64_t)];
uint64_t intraBarrierGate; // only used if this is intraComm0
struct ncclProxyState proxyState;
struct ncclProxyState* proxyState;
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
/* sharable collNet proxy progress resource. */
struct ncclCollNetSharedRes* collNetSharedRes;
// NVLink SHARP (NVLS) support
int nvlsSupport;
void* nvlsResources;
/* sharable NVLS resource. */
struct ncclNvlsSharedRes* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
@ -294,13 +335,7 @@ struct ncclComm {
// First of the unlaunched kernels in `planQueue`
struct ncclKernelPlan* unlaunchedPlansHead;
// communicator mode
int blocking;
// CGA cluster size
int cgaClusterSize;
int minCTAs, maxCTAs;
// network interface name
char *netName;
ncclConfig_t config;
// initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called
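The fields that used to live directly on ncclComm (blocking, cgaClusterSize, minCTAs/maxCTAs, netName) are now carried in comm->config, which mirrors the public ncclConfig_t. A hedged sketch of how they are populated from the application side (public API, field names per nccl.h):
static ncclResult_t initWithConfig(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.blocking = 0;   // ends up as comm->config.blocking, checked in group.cc above
  config.minCTAs  = 4;   // ends up as comm->config.minCTAs
  config.maxCTAs  = 32;  // ends up as comm->config.maxCTAs
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}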
View File
@ -11,6 +11,9 @@
#include <cuda_runtime.h>
#include "checks.h"
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
#else
@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
View File

@ -15,12 +15,13 @@
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@ -100,10 +101,10 @@ struct ncclConnInfo {
};
struct ncclProxyConnector {
int rank;
int localRank;
int tpRank;
int tpLocalRank;
int sameProcess;
struct ncclProxyConnection* connection;
struct ncclComm* comm;
};
struct ncclConnector {
@ -112,7 +113,6 @@ struct ncclConnector {
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
struct ncclComm *comm;
};
struct ncclRing {
@ -148,18 +148,24 @@ struct ncclDirect {
};
#define NCCL_MAX_NVLS_ARITY 8
#define NCCL_MAX_NVLS_TREE_ARITY 3
struct ncclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define NCCL_MAX_CONNS 2
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
int refCount;
};
struct ncclDevComm;
@ -270,7 +276,7 @@ struct ncclDevChannelPeer {
};
struct alignas(16) ncclDevChannel {
struct ncclDevChannelPeer *peers;
struct ncclDevChannelPeer** peers;
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collnetChain;

View File

@ -243,7 +243,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
CUDACHECK(cudaFree(md->gdrDevMem));
NCCLCHECK(ncclCudaFree(md->gdrDevMem));
free(md);
return ncclSuccess;

View File

@ -53,9 +53,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
#define NCCL_TOPO_CPU_TYPE_SKL 2
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
#define NCCL_TOPO_MAX_NODES 256
@ -66,6 +68,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
@ -99,16 +102,15 @@ struct ncclTopoRanks {
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph);
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);

View File

@ -95,7 +95,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
ncclMemoryStackPush(&comm->memScoped);
}
ncclGroupBlocking = comm->blocking;
ncclGroupBlocking = comm->config.blocking;
}
// Add comm to this thread's group needing preconnect

src/include/ibvcore.h (new file, +1043 lines)

File diff suppressed because it is too large

src/include/ibvsymbols.h (new file, +44 lines)
View File

@ -0,0 +1,44 @@
#ifndef NCCL_IBV_SYMBOLS_H_
#define NCCL_IBV_SYMBOLS_H_
#ifdef NCCL_BUILD_RDMA_CORE
#include <infiniband/verbs.h>
#else
#include "ibvcore.h"
#endif
#include "nccl.h"
/* IB Verbs Function Pointers */
struct ncclIbvSymbols {
int (*ibv_internal_fork_init)(void);
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
void (*ibv_internal_free_device_list)(struct ibv_device **list);
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
int (*ibv_internal_close_device)(struct ibv_context *context);
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
};
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
#endif // NCCL_IBV_SYMBOLS_H_
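For orientation, a minimal sketch (not part of the commit) of how the symbol table might be consumed once populated. NCCL itself only calls these pointers through the wrap_ibv_* helpers in misc/ibvwrap.cc further below, so the direct use here is purely illustrative.
#include "ibvsymbols.h"
#include <stdio.h>
// List the InfiniBand devices visible through whichever linking mode was built in.
static ncclResult_t listIbDevicesSketch(void) {
  struct ncclIbvSymbols syms;
  if (buildIbvSymbols(&syms) != ncclSuccess) return ncclSystemError;
  int n = 0;
  struct ibv_device** devs = syms.ibv_internal_get_device_list(&n);
  if (devs == NULL) return ncclSystemError;
  for (int i = 0; i < n; i++)
    printf("IB device %d: %s\n", i, syms.ibv_internal_get_device_name(devs[i]));
  syms.ibv_internal_free_device_list(devs);
  return ncclSuccess;
}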

File diff suppressed because it is too large

View File

@ -25,6 +25,7 @@ typedef enum : uint8_t {
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
@ -93,7 +94,6 @@ struct ncclCudaStreamList {
struct ncclCudaStreamList *next;
cudaStream_t stream;
};
struct ncclTasks {
struct Peer {
bool sendSeen, recvSeen;
@ -103,7 +103,8 @@ struct ncclTasks {
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
size_t collBytesTotal;
struct Peer* peers/*[nRanks]*/;
int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
int *p2pSendOrder, *p2pRecvOrder;
int p2pOrderSteps;
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.

View File

@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit();
ncclResult_t ncclNetInit(struct ncclComm* comm);
int ncclNetVersion(struct ncclComm* comm);
// Translation to external API
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Test whether the current GPU support GPU Direct RDMA.
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);

View File

@ -1,30 +1,33 @@
/*
* Copyright 2021-2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
/*
* Helper array to get the alignment for each predefined C language type.
*/
typedef void* pointer_type;
#if __STDC_VERSION__ >= 201112L /* or CPP11 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
#include <uchar.h>
#include <stdalign.h>
#endif
/* `alignof` is available as of C11 or C++11 */
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
#define nvtx_alignof(type) alignof(type)
#define nvtx_alignof2(type,tname) alignof(type)
#else /* __STDC_VERSION__ >= 201112L */
#ifndef __cplusplus
#include <stddef.h>
#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
#define nvtx_alignof2(type,tname) nvtx_alignof(type)
#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
#else /* __cplusplus */
#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
/* Create helper structs to determine type alignment. */
#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname
MKTYPEDEF(char);
MKTYPEDEF2(unsigned char, uchar);
@ -54,22 +57,33 @@ MKTYPEDEF(size_t);
MKTYPEDEF(pointer_type);
MKTYPEDEF(wchar_t);
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
/* `char8_t` is available as of C++20 or C23 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
MKTYPEDEF(char8_t);
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
/* `char16_t` and `char32_t` are available as of C++11 or C11 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
MKTYPEDEF(char16_t);
MKTYPEDEF(char32_t);
#endif
/* C requires to include stddef.h to use `offsetof` */
#ifndef __cplusplus
#include <stddef.h>
#endif
#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
#endif /* __STDC_VERSION__ >= 201112L */
#undef MKTYPEDEF
#undef MKTYPEDEF2
#endif /* __cplusplus */
#endif /* __STDC_VERSION__ >= 201112L */
/*
* Helper array to get the alignment for each predefined C/C++ language type.
* The order of entries must match the values in `enum nvtxPayloadSchemaEntryType`.
*/
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
/*** Special character types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
#else
{0, 0},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
#else
@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
};
#undef nvtx_alignof
#undef nvtx_alignof2
#undef nvtx_alignof2

View File

@ -9,4 +9,21 @@
#ifndef NCCL_P2P_H_
#define NCCL_P2P_H_
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
typedef struct {
int data; // Currently only support an fd based descriptor
} ncclCuDesc;
typedef union {
// Legacy CUDA IPC
cudaIpcMemHandle_t devIpc;
// cuMem API support
ncclCuDesc cuDesc;
} ncclIpcDesc;
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
#endif
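To make the intended flow concrete, a hedged sketch (not from the commit; the real call sites live in the transport and proxy code). It assumes NCCLCHECK from checks.h for error propagation; the out-of-band exchange of the descriptor is only indicated in comments.
#include "p2p.h"
#include "checks.h"
// Owner side: allocate device memory whose ncclIpcDesc can be handed to a peer process.
static ncclResult_t allocForPeerSketch(size_t size, ncclIpcDesc* desc, void** devPtr) {
  NCCLCHECK(ncclP2pAllocateShareableBuffer(size, desc, devPtr));
  // `desc` is then sent to the peer out of band, e.g. over the proxy or bootstrap channel.
  return ncclSuccess;
}
// Peer side: map the remote allocation described by `desc` into this process.
static ncclResult_t importFromPeerSketch(struct ncclComm* comm, int tpPeer, size_t size,
                                         ncclIpcDesc* desc, void** devPtr) {
  NCCLCHECK(ncclP2pImportShareableBuffer(comm, tpPeer, size, desc, devPtr));
  return ncclSuccess;
}
// Owner side, at teardown: ncclP2pFreeShareableBuffer(desc) releases the descriptor.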

View File

@ -13,11 +13,12 @@
#include "ipcsocket.h"
#include <pthread.h>
#include "shm.h"
#include "p2p.h"
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
@ -120,18 +121,11 @@ struct ncclProxySharedP2p {
int size;
char* cudaBuff;
char* hostBuff;
cudaIpcMemHandle_t ipc;
// CUDA IPC
ncclIpcDesc ipcDesc;
struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
};
struct ncclProxySharedCollNet {
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
void* resources;
};
struct ncclProxyPeer {
struct ncclProxySharedP2p send;
struct ncclProxySharedP2p recv;
@ -155,7 +149,6 @@ struct ncclProxyProgressState {
bool stop;
struct ncclProxyPeer** localPeers;
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
struct ncclProxySharedCollNet collNet;
struct ncclProxyArgs* active;
struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
@ -182,12 +175,27 @@ struct ncclProxyAsyncOp {
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
int tpRank;
int tpLocalRank;
ncclProxyAsyncOp* asyncOps;
int asyncOpCounter;
};
struct ncclProxyState {
int refCount;
int tpRank;
int tpnRanks;
int tpLocalnRanks;
int cudaDev;
int p2pnChannels;
int p2pChunkSize;
int nChannels;
int buffSizes[NCCL_NUM_PROTOCOLS];
bool allocP2pNetLLBuffers;
bool dmaBufSupport;
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
volatile uint32_t* abortFlag;
// Service thread
pthread_t thread;
struct ncclSocket* listenSock;
@ -199,6 +207,7 @@ struct ncclProxyState {
struct ncclSocket* peerSocks;
struct ncclProxyOps* proxyOps;
void** sharedDevMems;
struct ncclIpcSocket peerIpcSock; // cuMem API support (UDS)
// Progress thread
struct ncclProxyProgressState progressState;
@ -218,13 +227,14 @@ enum proxyConnectState {
struct ncclProxyConnection {
int send, transport, shared;
int localRank;
int tpLocalRank, sameProcess;
struct ncclSocket* sock;
struct ncclTransportComm* tcomm;
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
void* transportResources;
proxyConnectState state;
struct ncclCollNetSharedRes* collNet;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@ -240,7 +250,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
enum ncclProxyMsgType {
ncclProxyMsgInit = 1,
ncclProxyMsgSharedInit = 2,
@ -250,18 +260,21 @@ enum ncclProxyMsgType {
ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9 // cuMem API support
ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types.
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed.
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
#endif
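A rough sketch of the call/poll pattern described in the comment above (not from the commit). It assumes NCCLCHECK from checks.h and that ncclPollProxyResponse() returns ncclInProgress while the response has not arrived yet; opId only needs to be locally unique, so a stack address is used here.
static ncclResult_t proxyCallSketch(struct ncclComm* comm, struct ncclProxyConnector* proxyConn,
                                    int type, void* req, int reqSize, void* resp, int respSize) {
  int opTag = 0;
  void* opId = &opTag;  // locally unique identifier tying the response back to this request
  NCCLCHECK(ncclProxyCallAsync(comm, proxyConn, type, req, reqSize, respSize, opId));
  ncclResult_t res;
  do {
    res = ncclPollProxyResponse(comm, proxyConn, resp, opId);
  } while (res == ncclInProgress);
  return res;
}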

View File

@ -35,7 +35,6 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
int netDev;
int gdrSupport;
uint64_t hostHash;
uint64_t pidHash;
@ -50,15 +49,46 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
#if CUDART_VERSION >= 12010
#define NVLS_HANDLE_SIZE 64
struct ncclNvlsSharedRes {
int refCount;
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
char shareableHandle[NVLS_HANDLE_SIZE];
int nChannels;
};
#endif /* CUDART_VERSION >= 12010 */
struct ncclCollNetSharedRes {
int refCount;
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
void* resources;
int nChannels;
size_t buffSize;
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(struct ncclConnector*);
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels);
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
};
struct ncclTransport {
@ -71,7 +101,8 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 };

File diff suppressed because it is too large

View File

@ -6,10 +6,46 @@
#include "nccl.h"
#include "debug.h"
#include "param.h"
#include "cudawrap.h"
#include <dlfcn.h>
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
static int ncclCuMemSupported = 0;
// Determine whether CUMEM & VMM RDMA is supported on this platform
int ncclIsCuMemSupported() {
#if CUDART_VERSION < 11030
return 0;
#else
CUdevice currentDev;
int cudaDev;
int cudaDriverVersion;
int flag = 0;
ncclResult_t ret = ncclSuccess;
CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error);
if (cudaDriverVersion < 12000) return 0; // Need CUDA_VISIBLE_DEVICES support
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error);
if (CUPFN(cuMemCreate) == NULL) return 0;
CUCHECKGOTO(cuDeviceGet(&currentDev, cudaDev), ret, error);
// Query device to see if CUMEM VMM support is available
CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error);
if (!flag) return 0;
// Query device to see if CUMEM RDMA support is available
CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error);
if (!flag) return 0;
error:
return (ret == ncclSuccess);
#endif
}
int ncclCuMemEnable() {
return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable());
}
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
#if CUDART_VERSION >= 11030
@ -35,6 +71,7 @@ DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemMap, 10020);
DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
@ -89,7 +126,6 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
LOAD_SYM(cuCtxGetDevice, 2000, 1);
/* cuMem API support */
#if CUDA_VERSION >= 11030
LOAD_SYM(cuMemAddressReserve, 10020, 1);
LOAD_SYM(cuMemAddressFree, 10020, 1);
LOAD_SYM(cuMemCreate, 10020, 1);
@ -98,9 +134,9 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
LOAD_SYM(cuMemMap, 10020, 1);
LOAD_SYM(cuMemRelease, 10020, 1);
LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
#endif
#if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
#endif
@ -135,7 +171,7 @@ static void initOnceFunc() {
if (ncclCudaPath == NULL)
snprintf(path, 1024, "%s", "libcuda.so");
else
snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
(void) dlerror(); // Clear any previous errors
cudaLib = dlopen(path, RTLD_LAZY);
@ -195,6 +231,9 @@ static void initOnceFunc() {
}
#endif
// Determine whether we support the cuMem APIs or not
ncclCuMemSupported = ncclIsCuMemSupported();
initResult = ncclSuccess;
return;
error:

src/misc/ibvsymbols.cc (new file, +158 lines)
View File

@ -0,0 +1,158 @@
#include <sys/types.h>
#include <unistd.h>
#include "ibvsymbols.h"
#ifdef NCCL_BUILD_RDMA_CORE
/* RDMA-core linking mode. Symbols are pointers to linked IB Verbs */
#define ASSIGN_SYM(container, symbol, name) container->name = &symbol;
// Passthrough function for ibv_reg_mr macro in verbs.h
struct ibv_mr* ibv_internal_reg_mr(
struct ibv_pd* pd,
void* addr,
size_t length,
int access) {
return ibv_reg_mr(pd, addr, length, access);
}
// Passthrough function for ibv_query_port macro in verbs.h
int ibv_internal_query_port(
struct ibv_context* context,
uint8_t port_num,
struct ibv_port_attr* port_attr) {
return ibv_query_port(context, port_num, port_attr);
}
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
ASSIGN_SYM(ibvSymbols, ibv_get_device_list, ibv_internal_get_device_list);
ASSIGN_SYM(ibvSymbols, ibv_free_device_list, ibv_internal_free_device_list);
ASSIGN_SYM(ibvSymbols, ibv_get_device_name, ibv_internal_get_device_name);
ASSIGN_SYM(ibvSymbols, ibv_open_device, ibv_internal_open_device);
ASSIGN_SYM(ibvSymbols, ibv_close_device, ibv_internal_close_device);
ASSIGN_SYM(ibvSymbols, ibv_get_async_event, ibv_internal_get_async_event);
ASSIGN_SYM(ibvSymbols, ibv_ack_async_event, ibv_internal_ack_async_event);
ASSIGN_SYM(ibvSymbols, ibv_query_device, ibv_internal_query_device);
ASSIGN_SYM(ibvSymbols, ibv_query_gid, ibv_internal_query_gid);
ASSIGN_SYM(ibvSymbols, ibv_query_qp, ibv_internal_query_qp);
ASSIGN_SYM(ibvSymbols, ibv_alloc_pd, ibv_internal_alloc_pd);
ASSIGN_SYM(ibvSymbols, ibv_dealloc_pd, ibv_internal_dealloc_pd);
ASSIGN_SYM(ibvSymbols, ibv_reg_mr_iova2, ibv_internal_reg_mr_iova2);
ASSIGN_SYM(ibvSymbols, ibv_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr);
ASSIGN_SYM(ibvSymbols, ibv_dereg_mr, ibv_internal_dereg_mr);
ASSIGN_SYM(ibvSymbols, ibv_create_cq, ibv_internal_create_cq);
ASSIGN_SYM(ibvSymbols, ibv_destroy_cq, ibv_internal_destroy_cq);
ASSIGN_SYM(ibvSymbols, ibv_create_qp, ibv_internal_create_qp);
ASSIGN_SYM(ibvSymbols, ibv_modify_qp, ibv_internal_modify_qp);
ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp);
ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init);
ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str);
ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr;
ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port;
return ncclSuccess;
}
#else
/* RDMA-core dynamic loading mode. Symbols are loaded from shared objects. */
#include <dlfcn.h>
#include "core.h"
// IBVERBS Library versioning
#define IBVERBS_VERSION "IBVERBS_1.1"
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
static void* ibvhandle = NULL;
void* tmp;
void** cast;
ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
if (!ibvhandle) {
ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);
if (!ibvhandle) {
INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]");
goto teardown;
}
}
#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \
if (tmp == NULL) { \
WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \
goto teardown; \
} \
*cast = tmp; \
} while (0)
// Attempt to load a specific symbol version - fail silently
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
cast = (void**)&funcptr; \
*cast = dlvsym(handle, symbol, version); \
} while (0)
LOAD_SYM(ibvhandle, "ibv_get_device_list", ibvSymbols->ibv_internal_get_device_list);
LOAD_SYM(ibvhandle, "ibv_free_device_list", ibvSymbols->ibv_internal_free_device_list);
LOAD_SYM(ibvhandle, "ibv_get_device_name", ibvSymbols->ibv_internal_get_device_name);
LOAD_SYM(ibvhandle, "ibv_open_device", ibvSymbols->ibv_internal_open_device);
LOAD_SYM(ibvhandle, "ibv_close_device", ibvSymbols->ibv_internal_close_device);
LOAD_SYM(ibvhandle, "ibv_get_async_event", ibvSymbols->ibv_internal_get_async_event);
LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibvSymbols->ibv_internal_ack_async_event);
LOAD_SYM(ibvhandle, "ibv_query_device", ibvSymbols->ibv_internal_query_device);
LOAD_SYM(ibvhandle, "ibv_query_port", ibvSymbols->ibv_internal_query_port);
LOAD_SYM(ibvhandle, "ibv_query_gid", ibvSymbols->ibv_internal_query_gid);
LOAD_SYM(ibvhandle, "ibv_query_qp", ibvSymbols->ibv_internal_query_qp);
LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibvSymbols->ibv_internal_alloc_pd);
LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibvSymbols->ibv_internal_dealloc_pd);
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibvSymbols->ibv_internal_reg_mr);
// Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibvSymbols->ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
// Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibvSymbols->ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibvSymbols->ibv_internal_dereg_mr);
LOAD_SYM(ibvhandle, "ibv_create_cq", ibvSymbols->ibv_internal_create_cq);
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibvSymbols->ibv_internal_destroy_cq);
LOAD_SYM(ibvhandle, "ibv_create_qp", ibvSymbols->ibv_internal_create_qp);
LOAD_SYM(ibvhandle, "ibv_modify_qp", ibvSymbols->ibv_internal_modify_qp);
LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibvSymbols->ibv_internal_destroy_qp);
LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init);
LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str);
return ncclSuccess;
teardown:
ibvSymbols->ibv_internal_get_device_list = NULL;
ibvSymbols->ibv_internal_free_device_list = NULL;
ibvSymbols->ibv_internal_get_device_name = NULL;
ibvSymbols->ibv_internal_open_device = NULL;
ibvSymbols->ibv_internal_close_device = NULL;
ibvSymbols->ibv_internal_get_async_event = NULL;
ibvSymbols->ibv_internal_ack_async_event = NULL;
ibvSymbols->ibv_internal_query_device = NULL;
ibvSymbols->ibv_internal_query_port = NULL;
ibvSymbols->ibv_internal_query_gid = NULL;
ibvSymbols->ibv_internal_query_qp = NULL;
ibvSymbols->ibv_internal_alloc_pd = NULL;
ibvSymbols->ibv_internal_dealloc_pd = NULL;
ibvSymbols->ibv_internal_reg_mr = NULL;
ibvSymbols->ibv_internal_reg_mr_iova2 = NULL;
ibvSymbols->ibv_internal_reg_dmabuf_mr = NULL;
ibvSymbols->ibv_internal_dereg_mr = NULL;
ibvSymbols->ibv_internal_create_cq = NULL;
ibvSymbols->ibv_internal_destroy_cq = NULL;
ibvSymbols->ibv_internal_create_qp = NULL;
ibvSymbols->ibv_internal_modify_qp = NULL;
ibvSymbols->ibv_internal_destroy_qp = NULL;
ibvSymbols->ibv_internal_fork_init = NULL;
ibvSymbols->ibv_internal_event_type_str = NULL;
if (ibvhandle != NULL) dlclose(ibvhandle);
return ncclSystemError;
}
#endif

View File

@ -8,314 +8,186 @@
#include <sys/types.h>
#include <unistd.h>
#include <dlfcn.h>
#include "core.h"
/*Function Pointers*/
int (*ibv_internal_fork_init)(void);
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
void (*ibv_internal_free_device_list)(struct ibv_device **list);
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
int (*ibv_internal_close_device)(struct ibv_context *context);
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
// IBVERBS Library versioning
#define IBVERBS_VERSION "IBVERBS_1.1"
#include "ibvsymbols.h"
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static ncclResult_t initResult;
static void initOnceFunc(void) {
static void* ibvhandle = NULL;
void* tmp;
void** cast;
ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
if (!ibvhandle) {
ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);
if (!ibvhandle) {
INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]");
goto teardown;
}
}
#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \
if (tmp == NULL) { \
WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \
goto teardown; \
} \
*cast = tmp; \
} while (0)
// Attempt to load a specific symbol version - fail silently
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
cast = (void**)&funcptr; \
*cast = dlvsym(handle, symbol, version); \
} while (0)
LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
LOAD_SYM(ibvhandle, "ibv_open_device", ibv_internal_open_device);
LOAD_SYM(ibvhandle, "ibv_close_device", ibv_internal_close_device);
LOAD_SYM(ibvhandle, "ibv_get_async_event", ibv_internal_get_async_event);
LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibv_internal_ack_async_event);
LOAD_SYM(ibvhandle, "ibv_query_device", ibv_internal_query_device);
LOAD_SYM(ibvhandle, "ibv_query_port", ibv_internal_query_port);
LOAD_SYM(ibvhandle, "ibv_query_gid", ibv_internal_query_gid);
LOAD_SYM(ibvhandle, "ibv_query_qp", ibv_internal_query_qp);
LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
// Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
// Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
LOAD_SYM(ibvhandle, "ibv_create_qp", ibv_internal_create_qp);
LOAD_SYM(ibvhandle, "ibv_modify_qp", ibv_internal_modify_qp);
LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibv_internal_destroy_qp);
LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init);
LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str);
initResult = ncclSuccess;
return;
teardown:
ibv_internal_get_device_list = NULL;
ibv_internal_free_device_list = NULL;
ibv_internal_get_device_name = NULL;
ibv_internal_open_device = NULL;
ibv_internal_close_device = NULL;
ibv_internal_get_async_event = NULL;
ibv_internal_ack_async_event = NULL;
ibv_internal_query_device = NULL;
ibv_internal_query_port = NULL;
ibv_internal_query_gid = NULL;
ibv_internal_query_qp = NULL;
ibv_internal_alloc_pd = NULL;
ibv_internal_dealloc_pd = NULL;
ibv_internal_reg_mr = NULL;
ibv_internal_reg_mr_iova2 = NULL;
ibv_internal_reg_dmabuf_mr = NULL;
ibv_internal_dereg_mr = NULL;
ibv_internal_create_cq = NULL;
ibv_internal_destroy_cq = NULL;
ibv_internal_create_qp = NULL;
ibv_internal_modify_qp = NULL;
ibv_internal_destroy_qp = NULL;
ibv_internal_fork_init = NULL;
ibv_internal_event_type_str = NULL;
if (ibvhandle != NULL) dlclose(ibvhandle);
initResult = ncclSystemError;
return;
}
struct ncclIbvSymbols ibvSymbols;
ncclResult_t wrap_ibv_symbols(void) {
pthread_once(&initOnceControl, initOnceFunc);
pthread_once(&initOnceControl,
[](){ initResult = buildIbvSymbols(&ibvSymbols); });
return initResult;
}
#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \
if (name_internal == NULL) { \
/* CHECK_NOT_NULL: helper macro to check for NULL symbol */
#define CHECK_NOT_NULL(container, internal_name) \
if (container.internal_name == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
retval = call; \
}
#define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
retval = container.call; \
if (retval == error_retval) { \
WARN("Call to " name " failed with error %s", strerror(errno)); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_PTR_CHECK(name_internal, call, retval, error_retval, name) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
retval = call; \
#define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
retval = container.call; \
if (retval == error_retval) { \
WARN("Call to " name " failed"); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_INT_CHECK_RET_ERRNO(name_internal, call, success_retval, name) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
int ret = call; \
#define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
int ret = container.call; \
if (ret != success_retval) { \
WARN("Call to " name " failed with error %s", strerror(ret)); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_INT_CHECK(name_internal, call, error_retval, name) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
int ret = call; \
#define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
int ret = container.call; \
if (ret == error_retval) { \
WARN("Call to " name " failed"); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_PASSTHRU(name_internal, call) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
call; \
#define IBV_PASSTHRU(container, internal_name, call) \
CHECK_NOT_NULL(container, internal_name); \
container.call; \
return ncclSuccess;
ncclResult_t wrap_ibv_fork_init() {
IBV_INT_CHECK(ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
}
ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
*ret = ibv_internal_get_device_list(num_devices);
*ret = ibvSymbols.ibv_internal_get_device_list(num_devices);
if (*ret == NULL) *num_devices = 0;
return ncclSuccess;
}
ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
IBV_PASSTHRU(ibv_internal_free_device_list, ibv_internal_free_device_list(list));
IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list));
}
const char *wrap_ibv_get_device_name(struct ibv_device *device) {
if (ibv_internal_get_device_name == NULL) {
if (ibvSymbols.ibv_internal_get_device_name == NULL) {
WARN("lib wrapper not initialized.");
exit(-1);
}
return ibv_internal_get_device_name(device);
return ibvSymbols.ibv_internal_get_device_name(device);
}
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/
IBV_PTR_CHECK(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
}
ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
IBV_INT_CHECK(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
}
ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/
IBV_INT_CHECK(ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
}
ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
IBV_PASSTHRU(ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
}
ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
}
ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
}
ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
}
ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
}
ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
}
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
}
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
}
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
if (ibv_internal_reg_mr == NULL) {
if (ibvSymbols.ibv_internal_reg_mr == NULL) {
WARN("lib wrapper not initialized.");
return NULL;
}
return ibv_internal_reg_mr(pd, addr, length, access);
return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access);
}
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
if (ibv_internal_reg_mr_iova2 == NULL) {
if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) {
return ncclInternalError;
}
if (ret == NULL) { return ncclSuccess; } // Assume dummy call
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
}
/* DMA-BUF support */
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
}
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
if (ibv_internal_reg_dmabuf_mr == NULL) {
if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) {
errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno to be set
return NULL;
}
return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
}
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
}
ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
}
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
}
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
}
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
}
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
}
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
*ret = (char *) ibv_internal_event_type_str(event);
*ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
return ncclSuccess;
}
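As a worked example of the reworked macros (editorial addition): wrap_ibv_dealloc_pd() above is now written as IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd"), which after preprocessing is roughly:
if (ibvSymbols.ibv_internal_dealloc_pd == NULL) {
  WARN("lib wrapper not initialized.");
  return ncclInternalError;
}
int ret = ibvSymbols.ibv_internal_dealloc_pd(pd);
if (ret != 0) {
  WARN("Call to ibv_dealloc_pd failed with error %s", strerror(ret));
  return ncclSystemError;
}
return ncclSuccess;
The only structural change relative to the old macros is that the function pointer is looked up in the shared ibvSymbols container instead of a file-scope global, so the rdma-core-linked and dlopen-based builds go through identical wrappers.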

View File

@ -14,6 +14,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <utils.h>
struct shmHandleInternal {
int fd;
@ -86,17 +87,13 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
if (create) {
*(int*)(hptr + shmSize) = refcount;
} else {
int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED);
int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
if (remref == 0) {
/* the last peer has completed attachment, it should unlink the shm mem file. */
if (unlink(shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
}
}
if (refcount != -1) {
WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount);
}
}
if (devShmPtr) {
@ -133,8 +130,8 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
ret = ncclSystemError;
}
free(tmphandle->shmPath);
}
free(tmphandle->shmPath);
}
if (tmphandle->shmPtr) {

View File

@ -411,7 +411,7 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
if (sock->fd != -1) {
sock->state = ncclSocketStateAccepted;
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
WARN("socketTryAccept: Accept failed: %s", strerror(errno));
return ncclSystemError;
}
return ncclSuccess;

View File

@ -46,6 +46,7 @@ typedef enum { ncclSuccess = 0,
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1
/* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */
@ -60,6 +61,7 @@ typedef struct ncclConfig_v21700 {
int minCTAs;
int maxCTAs;
const char *netName;
int splitShare;
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
@ -72,7 +74,8 @@ typedef struct ncclConfig_v21700 {
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR /* netName */ \
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
}
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
@ -128,6 +131,16 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm);
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
/* Creates one or more communicators from an existing one.
* Ranks with the same color will end up in the same communicator.
* Within the new communicator, key will be used to order ranks.
* NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
* and that the call will therefore return a NULL communicator.
* If config is NULL, the new communicator will inherit the original communicator's
* configuration. */
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
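A minimal usage sketch (illustrative, not part of the header), assuming an already-initialized communicator; the splitShare comment reflects the apparent intent of the new field rather than documented behavior.
/* Split `comm` into two child communicators by rank parity. */
static ncclResult_t splitByParitySketch(ncclComm_t comm, ncclComm_t* newcomm) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.splitShare = 1; /* presumably lets the children share the parent's resources */
  int rank;
  ncclResult_t res = ncclCommUserRank(comm, &rank);
  if (res != ncclSuccess) return res;
  /* Even and odd ranks land in separate communicators; using `rank` as key keeps relative order. */
  return ncclCommSplit(comm, rank % 2 /* color */, rank /* key */, newcomm, &config);
}
A rank that should stay out of every child communicator would instead pass NCCL_SPLIT_NOCOLOR as the color and receive a NULL communicator.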
/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

View File

@ -258,10 +258,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network
char* netName;
const char* netName;
bool ok = false;
netName = comm->netName;
netName = comm->config.netName;
for (int i=0; i<3; i++) {
if (ncclNets[i] == nullptr) continue;
enum ncclNetState state;
@ -302,23 +302,27 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
return ncclSuccess;
}
#endif
int netDevs;
NCCLCHECK(ncclNetDevices(comm, &netDevs));
*gdrSupport = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
static int gdrSupportMatrix[32] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
if (gdrSupportMatrix[comm->cudaDev] == -1) {
int netDevs;
NCCLCHECK(comm->ncclNet->devices(&netDevs));
gdrSupportMatrix[comm->cudaDev] = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
// Allocate memory on the GPU and try to register it on the NIC.
void *lComm = NULL, *sComm = NULL, *rComm = NULL;
ncclNetHandle_t handle;
void* gpuPtr = NULL;
char* gpuPtr = NULL;
void* mHandle = NULL;
ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
bool connected;
connected = false;
@ -330,32 +334,34 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
}
if (sComm == NULL)
NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2);
if (rComm == NULL)
NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2);
connected = (rComm != NULL) && (sComm != NULL);
}
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
*gdrSupport = 1;
NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
gdrSupportMatrix[comm->cudaDev] = 1;
}
ncclDebugNoWarn = 0;
CUDACHECK(cudaFree(gpuPtr));
NCCLCHECK(ncclCudaFree(gpuPtr));
cleanup2:
if (rComm != NULL)
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
NCCLCHECK(comm->ncclNet->closeRecv(rComm));
if (sComm != NULL)
NCCLCHECK(ncclNetCloseSend(comm, sComm));
NCCLCHECK(ncclNetCloseListen(comm, lComm));
NCCLCHECK(comm->ncclNet->closeSend(sComm));
NCCLCHECK(comm->ncclNet->closeListen(lComm));
cleanup1:
break;
break;
}
}
*gdrSupport = gdrSupportMatrix[comm->cudaDev];
return ncclSuccess;
}
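The hunk above memoizes the GDR probe per CUDA device (gdrSupportMatrix), so the listen/connect/register test runs at most once per device instead of once per communicator. A generic sketch of that caching pattern, with placeholder names:

/* Sketch (placeholder names): cache an expensive per-device probe result.
 * -1 means "not probed yet"; 0/1 is the cached answer. */
static int supportCache[32] = { -1, -1, -1, -1, -1, -1, -1, -1,
                                -1, -1, -1, -1, -1, -1, -1, -1,
                                -1, -1, -1, -1, -1, -1, -1, -1,
                                -1, -1, -1, -1, -1, -1, -1, -1 };

static int getDeviceSupport(int cudaDev, int (*probe)(int)) {
  if (supportCache[cudaDev] == -1) supportCache[cudaDev] = probe(cudaDev) ? 1 : 0;
  return supportCache[cudaDev];
}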

File diff suppressed because it is too large

View File

@ -21,8 +21,8 @@ template <int type>
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) {
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
comm->channels[channelId].peers[peer].recv + connIndex;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer]->send + connIndex :
comm->channels[channelId].peers[peer]->recv + connIndex;
for (int t=0; t<NTRANSPORTS; t++) {
struct ncclTransport *transport = ncclTransports[t];
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
@ -45,12 +45,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int n
uint64_t mask = 1UL << channel->id;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue;
comm->connectRecv[peer] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue;
comm->connectSend[peer] |= mask;
}
return ncclSuccess;
@ -73,7 +73,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
@ -142,13 +142,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
for (int c=0; c<MAXCHANNELS; c++) {
TIME_START(3);
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
struct ncclDevChannelPeer* addr;
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
@ -159,13 +162,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// Start with recv channels
TIME_START(4);
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
struct ncclDevChannelPeer* addr;
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
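Because devPeers now stores device-resident pointers to ncclDevChannelPeer, both hunks above first copy the pointer value back to the host and then write the connection info through it. A standalone CUDA sketch of that two-step update, with placeholder types and an explicit synchronization added for clarity (the NCCL code relies on its host stream ordering instead):

#include <cuda_runtime.h>

/* Placeholder types; the real ones (ncclConnInfo, ncclDevChannelPeer) are NCCL-internal. */
struct Conn { int data[16]; };
struct Peer { struct Conn send[2]; struct Conn recv[2]; };

static void updateDevConn(struct Peer** devPeerSlot, const struct Conn* hostConn,
                          int connIndex, cudaStream_t stream) {
  struct Peer* peerPtr = NULL;
  /* 1) read the device-side pointer value back to the host */
  cudaMemcpyAsync(&peerPtr, devPeerSlot, sizeof(peerPtr), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  /* peerPtr must be valid before taking &peerPtr->send[i] */
  /* 2) write the connection info through that pointer into device memory */
  cudaMemcpyAsync(&peerPtr->send[connIndex], hostConn, sizeof(*hostConn),
                  cudaMemcpyHostToDevice, stream);
}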
@ -191,8 +197,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect");
exit:
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
return ret;
fail:
goto exit;
@ -226,7 +232,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
}
// select
struct ncclChannelPeer* root = channel->peers+nranks;
struct ncclChannelPeer* root = channel->peers[nranks];
// connector index: 0 for recv, 1 for send
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
@ -265,8 +271,9 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
// connect
if (isMaster) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
struct ncclDevChannelPeer* devRoot;
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
@ -305,16 +312,20 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
// Free collNet resources
for (int r=0; r<comm->nChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
struct ncclChannelPeer* peer = channel->peers[comm->nRanks];
if (peer) {
if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
}
}
}
}
return ncclSuccess;
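The freeing path above is now guarded by a per-peer reference count so shared collNet peers are released only once. A small sketch of that refcount-guarded teardown, with placeholder types (the diff uses ncclAtomicRefCountDecrement, which returns the new count):

#include <atomic>
#include <cstdlib>

/* Sketch: release a shared object only when the last user drops its reference. */
struct SharedPeer { std::atomic<int> refCount; /* ... connectors ... */ };

static void releasePeer(SharedPeer* peer) {
  if (peer == nullptr) return;
  // fetch_sub returns the previous value: 1 means we held the last reference.
  if (peer->refCount.fetch_sub(1) == 1) {
    /* free the per-connector resources here, then the peer itself */
    std::free(peer);
  }
}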

View File

@ -141,6 +141,7 @@ struct setupReq {
int netDev;
int useGdr;
int needFlush;
struct ncclCollNetSharedRes* collNet;
};
@ -149,16 +150,19 @@ struct setupReq {
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req;
int proxyRank;
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "");
@ -168,15 +172,18 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req;
int proxyRank;
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "");
@ -221,7 +228,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map;
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError;
@ -247,7 +254,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map;
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError;
@ -276,7 +283,7 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*)reqBuff;
if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
@ -288,9 +295,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
resources->netDev = req->netDev;
resources->useGdr = req->useGdr;
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
connection->collNet = req->collNet;
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
return ncclSuccess;
}
@ -300,19 +308,19 @@ struct sharedResources {
int commRefCount[NCCL_MAX_NETDEVS];
};
ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, struct ncclCollNetSharedRes* collNet, void* collNetHandle) {
struct sharedResources* resources = (struct sharedResources*)collNet->resources;
if (resources == NULL) {
NCCLCHECK(ncclCalloc(&resources, 1));
comm->proxyState.progressState.collNet.resources = resources;
collNet->resources = resources;
}
if (resources->collNetComms[netDev] == NULL)
NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev));
return ncclSuccess;
}
static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
static ncclResult_t sharedConnect(struct ncclProxyState* proxyState, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclCollNetSharedRes* collNet, void** collNetComm) {
struct sharedResources* resources = (struct sharedResources*)collNet->resources;
if (resources->collNetComms[netDev] == NULL) {
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
@ -321,13 +329,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
handlePtrs[i] = &(info->collNetHandle);
}
ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
ncclResult_t ret = proxyState->ncclCollNet->connect((void**)handlePtrs, nranks, rank,
resources->collNetListenComms[netDev],
resources->collNetComms+netDev);
free(handlePtrs);
if (ret == ncclSuccess) {
// Close listen comm
NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
NCCLCHECK(proxyState->ncclCollNet->closeListen(resources->collNetListenComms[netDev]));
} else {
resources->collNetListenComms[netDev] = NULL;
}
@ -337,55 +345,53 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
return ncclSuccess;
}
static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
static ncclResult_t sharedFree(struct ncclProxyState* proxyState, struct ncclCollNetSharedRes* collNet, int netDev) {
struct sharedResources* resources = (struct sharedResources*)collNet->resources;
resources->commRefCount[netDev]--;
if (resources->commRefCount[netDev] == 0) {
NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
NCCLCHECK(proxyState->ncclCollNet->closeColl(resources->collNetComms[netDev]));
}
for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
comm->proxyState.progressState.collNet.resources = NULL;
collNet->resources = NULL;
free(resources);
return ncclSuccess;
}
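The three helpers above keep one collNet communicator per network device inside the proxy, create it lazily on first use, and tear it down when the last channel drops its reference. A generic sketch of that refcounted per-device slot, with placeholder types:

/* Sketch (placeholder types): one lazily-created, refcounted object per net device. */
#define MAX_NETDEVS 128

struct DevSlot { void* comm; int refCount; };
static DevSlot g_slots[MAX_NETDEVS];              // zero-initialized: comm == NULL

static void* acquireDevComm(int netDev, void* (*create)(int)) {
  if (g_slots[netDev].comm == NULL) g_slots[netDev].comm = create(netDev);
  g_slots[netDev].refCount++;
  return g_slots[netDev].comm;
}

static void releaseDevComm(int netDev, void (*destroy)(void*)) {
  if (--g_slots[netDev].refCount == 0) {
    destroy(g_slots[netDev].comm);
    g_slots[netDev].comm = NULL;
  }
}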
static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
if (state->size == 0) {
state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
if (collNet->size == 0) {
collNet->size = 2 * collNet->nChannels * collNet->buffSize;
}
*size = state->size;
*size = collNet->size;
if (cuda && state->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size));
if (cuda && collNet->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size));
}
if (!cuda && state->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
if (!cuda && collNet->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
}
*gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
*gpuPtr = *cpuPtr = cuda ? collNet->cudaBuff : collNet->hostBuff;
return ncclSuccess;
}
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) {
static ncclResult_t sharedBuffersGet(struct ncclCollNetSharedRes* collNet, int type, int slot, int channel, int* offset) {
// Use different pools for different channels and also separate send/recv.
int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
int slotSize = collNet->buffSize / NCCL_STEPS;
int globalSlot = (type * NCCL_STEPS + slot) * collNet->nChannels + channel;
*offset = slotSize * globalSlot;
return ncclSuccess;
}
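A worked example of the offset computation above, as a standalone sketch with made-up values (NCCL_STEPS assumed to be 8; buffer size and channel count are illustrative only):

/* Sketch: recompute a collNet shared-buffer offset outside NCCL, with assumed values. */
#include <cstdio>

int main() {
  const int NCCL_STEPS = 8;               // assumption: matches NCCL's step count
  const int buffSize   = 4 << 20;         // assumed 4 MiB simple-protocol buffer
  const int nChannels  = 16;              // assumed channel count
  int type = 1, slot = 2, channel = 3;    // recv pool, third slot, fourth channel
  int slotSize   = buffSize / NCCL_STEPS;                       // 512 KiB per slot
  int globalSlot = (type * NCCL_STEPS + slot) * nChannels + channel;  // 163
  std::printf("offset = %d bytes\n", slotSize * globalSlot);
  return 0;
}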
static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) {
struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
if (state->size == 0) return ncclSuccess;
CUDACHECK(cudaFree(state->cudaBuff));
NCCLCHECK(ncclCudaHostFree(state->hostBuff));
static ncclResult_t sharedBuffersDestroy(struct ncclCollNetSharedRes* collNet) {
if (collNet->size == 0) return ncclSuccess;
NCCLCHECK(ncclCudaFree(collNet->cudaBuff));
NCCLCHECK(ncclCudaHostFree(collNet->hostBuff));
// This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
state->size = 0;
collNet->size = 0;
return ncclSuccess;
}
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*)reqBuff;
if (reqSize != sizeof (struct setupReq)) return ncclInternalError;
@ -398,18 +404,19 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
resources->useGdr = req->useGdr;
resources->needFlush = req->needFlush;
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
connection->collNet = req->collNet;
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
NCCLCHECK(sharedListen(comm, req->netDev, netHandle));
NCCLCHECK(sharedListen(proxyState, req->netDev, req->collNet, netHandle));
return ncclSuccess;
}
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
@ -423,7 +430,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
resources->recvMhandles[p] = info->mhandles[p];
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
@ -431,7 +438,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
*((struct connectMap**)respBuff) = NULL;
return ncclSuccess;
}
connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev;
connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev;
struct connectMap* map = &resources->map;
@ -459,7 +466,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
// Allocate & Register shared buffers for the Simple protocol
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
#if CUDA_VERSION >= 11070
@ -467,23 +474,23 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (resources->useGdr && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
}
*((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
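The DMA-BUF branch above (CUDA 11.7 or later) exports the shared GPU buffer as a dma-buf file descriptor, registers it, and closes the fd. A hedged sketch of that pattern with a hypothetical registration callback standing in for regMrDmaBuf:

#include <cuda.h>
#include <unistd.h>

typedef int (*regDmaBufFn)(void* addr, size_t size, int fd, void** mhandle);

static int registerGpuBufferDmaBuf(void* gpuPtr, size_t size,
                                   regDmaBufFn regDmaBuf, void** mhandle) {
  int dmabufFd = -1;
  if (cuMemGetHandleForAddressRange(&dmabufFd, (CUdeviceptr)gpuPtr, size,
                                    CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0) != CUDA_SUCCESS)
    return -1;                            // caller falls back to the nv_peermem path
  int rc = regDmaBuf(gpuPtr, size, dmabufFd, mhandle);
  (void)close(dmabufFd);                  // the registration holds its own reference
  return rc;
}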
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
@ -491,7 +498,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
resources->collNetRank = args->rank;
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
@ -499,7 +506,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
*((struct connectMap**)respBuff) = NULL;
return ncclSuccess;
}
connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1;
connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev + 1;
struct connectMap* map = &resources->map;
@ -528,7 +535,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
// Allocate & Register shared buffers for the Simple protocol
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
#if CUDA_VERSION >= 11070
@ -536,16 +543,16 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (resources->useGdr && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
}
// Pass info to send side
@ -558,41 +565,43 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (resources) {
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
if (resources->sendMhandles[p]) {
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->sendMhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
NCCLCHECK(sharedBuffersDestroy(comm));
NCCLCHECK(sharedFree(comm, resources->netDev));
NCCLCHECK(sharedBuffersDestroy(connection->collNet));
NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev));
if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet);
free(connection->transportResources);
}
return ncclSuccess;
}
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
if (resources) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->mhandles[p]) {
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
NCCLCHECK(sharedBuffersDestroy(comm));
NCCLCHECK(sharedFree(comm, resources->netDev));
NCCLCHECK(sharedBuffersDestroy(connection->collNet));
NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev));
if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet);
free(connection->transportResources);
}
return ncclSuccess;
@ -602,7 +611,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
#define LAST_OF_GROUP(s) \
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@ -629,7 +638,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
__sync_synchronize();
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@ -650,7 +659,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int ready = 1;
if (s == 0) {
int offset;
NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
args->sharedBuff[sharedBuffSlot] = localBuff + offset;
args->sharedSize[sharedBuffSlot] = args->chunkSize;
}
@ -671,7 +680,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] == NULL) continue;
TRACE(NCCL_NET, "sendProxy [%d/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
@ -687,7 +696,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int done, size;
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%d/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
@ -711,7 +720,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
return ncclSuccess;
}
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@ -742,7 +751,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sharedBuffSlot = sub->posted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
reqFifo[group][buffSlot].recvBuff = localBuff + offset;
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
sub->posted += args->sliceSteps;
@ -773,8 +782,8 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
} else {
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
}
} else {
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@ -788,7 +797,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
int done = 1;
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] flushed", sub->flushed, group, buffSlot);
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@ -802,7 +811,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
__sync_synchronize();

View File

@ -11,6 +11,7 @@
#include "collectives.h"
#include "gdrwrap.h"
#include "shm.h"
#include "p2p.h"
#include "profiler.h"
static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
@ -59,10 +60,8 @@ struct connectMapMem{
char* gpuPtr;
char* cpuPtr;
int size;
union {
char shmPath[PATH_MAX];
cudaIpcMemHandle_t ipc;
};
ncclIpcDesc ipcDesc;
char shmPath[PATH_MAX];
ncclShmHandle_t attachHandle;
ncclShmHandle_t createHandle;
};
@ -87,9 +86,9 @@ struct sendResources {
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
int rank;
int localRank;
int remoteRank;
int tpRank;
int tpLocalRank;
int tpRemoteRank;
int netDev;
int useGdr;
int useDmaBuf;
@ -113,10 +112,10 @@ struct recvResources {
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
int rank;
int localRank;
int remoteRank;
int proxyRank;
int tpRank;
int tpLocalRank;
int tpRemoteRank;
int tpRemoteProxyRank;
int netDev;
int useGdr;
int useDmaBuf;
@ -149,9 +148,9 @@ NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
struct setupReq {
int rank;
int localRank;
int remoteRank;
int tpRank;
int tpLocalRank;
int tpRemoteRank;
int shared;
int netDev;
int useGdr;
@ -164,6 +163,7 @@ struct setupReq {
* information for this peer */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req;
int localRank, tpProxyRank;
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@ -174,20 +174,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
tpProxyRank = comm->topParentRanks[proxyRank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
} else {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
}
*((int*)connectInfo) = proxyRank;
*((int*)connectInfo) = tpProxyRank;
return ncclSuccess;
}
@ -199,13 +201,14 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
/* Setup recv connector */
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req;
int localRank;
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
// Use myInfo->rank as the receiver uses its own NIC
int proxyRank;
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
@ -213,13 +216,15 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
// We don't support PXN on receive yet
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
return ncclSuccess;
}
@ -274,39 +279,47 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
send->transportResources = map;
opId = send;
INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
} else {
opId = send;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);
if (map->sameProcess) {
if (map->sameProcess && !ncclCuMemEnable()) {
if (map->cudaDev != comm->cudaDev) {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
if (!ncclCuMemEnable()) {
// Enable P2P access for Legacy IPC
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
}
}
} else {
NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
map->mems[NCCL_NET_MAP_DEVMEM].size,
&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL;
}
if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank;
void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
if (*sharedDevMemPtr == NULL) {
CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
&map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
sharedDevMemPtr));
}
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
@ -340,13 +353,13 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
opId = recv;
INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
opId, &recv->proxyConn, connectInfo);
NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
} else {
opId = recv;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
@ -371,10 +384,24 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
static ncclResult_t sendFree(struct ncclConnector* send) {
struct connectMap* map = (struct connectMap*)(send->transportResources);
if (map) {
if (map->sameProcess == 0) {
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
if (map->sameProcess && map->cudaDev == cudaDev) {
// Our own GPU, so it wasn't mapped in
free(map);
return ncclSuccess;
}
if (!map->sameProcess || ncclCuMemEnable()) {
if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
if (ncclCuMemEnable()) {
// cuMem API support
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
// Legacy CUDA IPC support
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
}
}
free(map);
@ -389,86 +416,87 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
}
#define NCCL_SHARED_STEPS 16
static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess,
int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) {
static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess,
int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) {
if (cuda == 0 && sameProcess == 0) {
WARN("PXN should not use host buffers for data");
return ncclInternalError;
}
struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
struct ncclProxyProgressState* progressState = &proxyState->progressState;
if (progressState->localPeers == NULL) {
NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
}
struct ncclProxyPeer** localPeers = progressState->localPeers;
if (localPeers[localRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers+localRank, 1));
if (localPeers[tpLocalRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers + tpLocalRank, 1));
}
struct ncclProxyPeer* peer = localPeers[localRank];
struct ncclProxyPeer* peer = localPeers[tpLocalRank];
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
state->refcount++;
if (state->size == 0) {
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize;
state->size = nChannels * NCCL_SHARED_STEPS * proxyState->p2pChunkSize;
}
if (size) *size = state->size;
if (cuda && state->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
if (sameProcess == 0) {
CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff));
if (sameProcess == 0 || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
} else {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
}
}
if (!cuda && state->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
}
if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
if (sameProcess) {
if (gpuPtr) *gpuPtr = *cpuPtr;
} else {
if (gpuPtr) *gpuPtr = NULL;
if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t));
}
if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
return ncclSuccess;
}
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) {
// Use different pools for different channels and also separate send/recv.
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
*offset = comm->p2pChunkSize * globalSlot;
*offset = proxyState->p2pChunkSize * globalSlot;
return ncclSuccess;
}
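For the point-to-point shared pool above, the total size is nChannels * NCCL_SHARED_STEPS * p2pChunkSize and each (channel, slot) pair maps to a fixed offset. A small sketch with assumed values (NCCL_SHARED_STEPS = 16 as defined above; chunk size and channel count are made up):

/* Sketch: size and offset arithmetic for the shared P2P pool, with assumed values. */
#include <cstdio>

int main() {
  const int NCCL_SHARED_STEPS = 16;        // from the #define above
  const int p2pChunkSize      = 128 << 10; // assumed 128 KiB chunk
  const int nChannels         = 8;         // assumed channel count
  int poolSize = nChannels * NCCL_SHARED_STEPS * p2pChunkSize;        // 16 MiB here
  int channel = 5, slot = 3;
  int offset = p2pChunkSize * (channel * NCCL_SHARED_STEPS + slot);   // global slot 83
  std::printf("pool=%d bytes, offset=%d bytes\n", poolSize, offset);
  return 0;
}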
static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) {
if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank];
static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
if (peer == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
if (state->size == 0) NCCLCHECK(ncclInternalError);
state->refcount--;
if (state->refcount == 0) {
if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff));
if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
if (state->cudaBuff) {
if (!connection->sameProcess || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pFreeShareableBuffer(&state->ipcDesc));
}
NCCLCHECK(ncclCudaFree(state->cudaBuff));
}
if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff));
}
if (peer->send.refcount || peer->recv.refcount) return ncclSuccess;
free(peer);
comm->proxyState.progressState.localPeers[localRank] = NULL;
for (int r=0; r<comm->localRanks; r++) {
if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess;
proxyState->progressState.localPeers[tpLocalRank] = NULL;
for (int r = 0; r < proxyState->tpLocalnRanks; r++) {
if (proxyState->progressState.localPeers[r]) return ncclSuccess;
}
// All peers are freed, free array
free(comm->proxyState.progressState.localPeers);
comm->proxyState.progressState.localPeers = NULL;
free(proxyState->progressState.localPeers);
proxyState->progressState.localPeers = NULL;
return ncclSuccess;
}
static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
int rank = comm->localRankToRank[connection->localRank];
int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) {
NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL));
return ncclSuccess;
}
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*) reqBuff;
if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
@ -476,18 +504,18 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
NCCLCHECK(ncclCalloc(&resources, 1));
connection->transportResources = resources;
resources->rank = req->rank;
resources->localRank = req->localRank;
resources->remoteRank = req->remoteRank;
resources->tpRank = req->tpRank;
resources->tpLocalRank = req->tpLocalRank;
resources->tpRemoteRank = req->tpRemoteRank;
resources->netDev = req->netDev;
resources->shared = connection->shared = req->shared;
resources->useGdr = req->useGdr;
resources->channelId = req->channelId;
resources->connIndex = req->connIndex;
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->maxRecvs = props.maxRecvs;
// We don't return any data
@ -496,7 +524,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
return ncclSuccess;
}
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*) reqBuff;
if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
@ -504,9 +532,9 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
NCCLCHECK(ncclCalloc(&resources, 1));
connection->transportResources = resources;
resources->rank = req->rank;
resources->localRank = req->localRank;
resources->remoteRank = req->remoteRank;
resources->tpRank = req->tpRank;
resources->tpLocalRank = req->tpLocalRank;
resources->tpRemoteRank = req->tpRemoteRank;
resources->netDev = req->netDev;
resources->shared = connection->shared = req->shared;
resources->useGdr = req->useGdr;
@ -514,50 +542,50 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
resources->channelId = req->channelId;
resources->connIndex = req->connIndex;
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->maxRecvs = props.maxRecvs;
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
*done = 1;
return ncclSuccess;
}
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
ncclResult_t ret = ncclSuccess;
if (resources->shared) {
// Shared buffers
struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
struct ncclProxyProgressState* progressState = &proxyState->progressState;
if (progressState->localPeers == NULL) {
NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
}
struct ncclProxyPeer** localPeers = progressState->localPeers;
if (localPeers[resources->localRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
if (localPeers[resources->tpLocalRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1));
}
connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId;
connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->send.proxyAppend + resources->channelId;
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
// Connect or reuse connection for a netdev/remote rank.
if (progressState->netComms[resources->netDev] == NULL) {
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId);
resources->netSendComm = comms->sendComm[resources->channelId];
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
} else {
ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
}
} else {
// Connect to remote peer
ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
connection->proxyAppendPtr = &connection->proxyAppend;
}
@ -570,28 +598,27 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
// Create structures
struct connectMap* map = &resources->map;
map->sameProcess =
comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
map->sameProcess = connection->sameProcess;
map->shared = resources->shared;
CUDACHECK(cudaGetDevice(&map->cudaDev));
if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]);
resources->buffSizes[p] = comm->buffSizes[p];
NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, proxyState->buffSizes[p], buffs[p]);
resources->buffSizes[p] = proxyState->buffSizes[p];
}
} else {
// Get shared buffers
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(
comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels,
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc));
resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
if (comm->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
if (proxyState->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
}
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
@ -602,15 +629,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (resources->shared == 0) {
if (!map->sameProcess) {
if (!map->sameProcess || ncclCuMemEnable()) {
ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
}
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
}
if (!map->sameProcess) {
CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
}
if (map->sameProcess) {
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
@ -645,12 +672,12 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
}
}
}
@ -661,40 +688,40 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
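The registration loop above prefers a DMA-BUF file descriptor (cuMemGetHandleForAddressRange) when the buffer is CUDA memory and both GPU and NIC support it, and otherwise falls back to the classic regMr path (nv_peermem for GPU memory). A condensed sketch mirroring the calls shown in the diff; regMrSketch is a hypothetical helper, and NCCL additionally compile-guards the DMA-BUF branch by CUDA version (assumes NCCL internal headers plus <unistd.h> for close):

static ncclResult_t regMrSketch(ncclNet_t* net, void* netComm, void* buff, int size,
                                int type /*NCCL_PTR_CUDA or NCCL_PTR_HOST*/,
                                int useDmaBuf, void** mhandle) {
  if (type == NCCL_PTR_CUDA && useDmaBuf) {
    int dmabuf_fd;
    // Export the VA range backing the buffer as a dmabuf fd ...
    CUCHECK(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)buff, size,
                                          CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
    // ... and let the NIC register it directly.
    NCCLCHECK(net->regMrDmaBuf(netComm, buff, size, type, 0ULL, dmabuf_fd, mhandle));
    (void)close(dmabuf_fd);  // the registration keeps its own reference
    return ncclSuccess;
  }
  // Fallback: host memory, or CUDA memory registered through nv_peermem.
  return net->regMr(netComm, buff, size, type, mhandle);
}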
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
resources->proxyRank = *(int*)reqBuff;
resources->tpRemoteProxyRank = *(int*)reqBuff;
ncclResult_t ret = ncclSuccess;
// Finish connection establishment from remote peer
if (resources->shared) {
// Shared buffers
struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
struct ncclProxyProgressState* progressState = &proxyState->progressState;
if (progressState->localPeers == NULL) {
NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
}
struct ncclProxyPeer** localPeers = progressState->localPeers;
if (localPeers[resources->localRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
if (localPeers[resources->tpLocalRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1));
}
connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId;
connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->recv.proxyAppend + resources->channelId;
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
// Connect or reuse connection for a netdev/remote rank.
if (progressState->netComms[resources->netDev] == NULL) {
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank;
if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId);
resources->netRecvComm = comms->recvComm[resources->channelId];
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
} else {
ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
}
} else {
// Connect to remote peer
ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
connection->proxyAppendPtr = &connection->proxyAppend;
}
@ -705,26 +732,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
}
*done = 1;
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm));
// Create structures
struct connectMap* map = &resources->map;
map->sameProcess =
comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
map->sameProcess = connection->sameProcess;
if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv
map->shared = resources->shared;
if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]);
resources->buffSizes[p] = comm->buffSizes[p];
NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, proxyState->buffSizes[p], buffs[p]);
resources->buffSizes[p] = proxyState->buffSizes[p];
}
} else {
// Get shared buffers
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(
comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels,
proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels,
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
@ -733,14 +759,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
if (comm->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
if (proxyState->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
}
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (resources->shared == 0) {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
if (ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
}
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
}
}
@ -771,12 +802,12 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
}
}
}
@ -787,17 +818,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (connection->state == connSharedInitialized) { // NVB Preconnect
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection));
return ncclSuccess;
}
if (connection->state == connConnected) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
@ -806,19 +837,25 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
} else {
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
}
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (!resources->map.sameProcess || ncclCuMemEnable()) {
// cuMem API support
if (mems[NCCL_NET_MAP_DEVMEM].size) {
NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
}
}
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank;
comms->sendRefCount[resources->channelId]--;
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
}
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
}
}
@ -826,44 +863,50 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
return ncclSuccess;
}
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
if (connection->state == connSharedInitialized) { // NVB Preconnect
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection));
return ncclSuccess;
}
if (connection->state == connConnected) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (!resources->map.sameProcess || ncclCuMemEnable()) {
// cuMem API support
if (mems[NCCL_NET_MAP_DEVMEM].size) {
NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
}
}
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank;
comms->recvRefCount[resources->channelId]--;
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
}
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
}
}
if (resources) free(resources);
return ncclSuccess;
}
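When NCCL_NET_SHARED_COMMS is in effect, several channels share one net comm per (netDev, remote rank), so the free paths above only close the endpoint once its per-channel refcount drops to zero. A reduced sketch of that pattern, with the struct trimmed to what the diff shows:

// Sketch: refcounted teardown of a shared receive comm, mirroring recvProxyFree.
struct sharedRecvComms { void* recvComm[MAXCHANNELS]; int recvRefCount[MAXCHANNELS]; };

static ncclResult_t closeSharedRecv(ncclNet_t* net, struct sharedRecvComms* comms, int channelId) {
  comms->recvRefCount[channelId]--;
  if (comms->recvRefCount[channelId] == 0) {
    // Last connection on this channel: close the shared network endpoint.
    NCCLCHECK(net->closeRecv(comms->recvComm[channelId]));
    comms->recvComm[channelId] = NULL;
  }
  return ncclSuccess;
}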
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@ -894,7 +937,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (resources->shared) {
int sharedBuffSlot = sub->posted%maxDepth;
int offset;
NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
resources->recvMem->offsFifo[buffSlot] = offset;
__sync_synchronize();
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@ -944,7 +987,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
if (ready) {
// Data is ready, try to send.
NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
sizesFifo[buffSlot] = -1;
@ -962,7 +1005,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (sub->done < sub->transmitted) {
int done;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
@ -988,7 +1031,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
return ncclSuccess;
}
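In the progress loop above, shared (p2p) operations do not own dedicated buffers; each posted step grabs a slice of the per-channel shared pool through sharedBuffersGet, indexed by pipeline slot and sub-operation. A toy illustration of the offset arithmetic, assuming a pool of maxDepth slots of nsubs slices of slotSize bytes each (these layout constants are placeholders for the example, not NCCL's exact sizing):

// Toy model of the indexing used by sendProxyProgress/recvProxyProgress:
//   slot  = posted % maxDepth      -> which pipeline stage
//   slice = slot*nsubs + s         -> which sub-operation within that stage
static int sharedOffsetSketch(uint64_t posted, int maxDepth, int nsubs, int s, int slotSize) {
  int sharedBuffSlot = (int)(posted % maxDepth);
  int slice = sharedBuffSlot * nsubs + s;
  return slice * slotSize;          // byte offset inside the channel's shared buffer
}
// e.g. posted=5, maxDepth=4, nsubs=8, s=2, slotSize=128 KiB -> slot 1, slice 10, offset 1.25 MiB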
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
// Initialize subs and group them by same recvComm.
void* recvComm;
@ -1048,7 +1091,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (p == NCCL_PROTO_SIMPLE && resources->shared) {
int sharedBuffSlot = sub->posted%maxDepth;
int offset;
NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
offsFifo[buffSlot] = offset;
ptrs[subCount] = localBuff+offset;
@ -1057,7 +1100,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
sizes[subCount] = stepSize*args->sliceSteps;
if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
tags[subCount] = resources->remoteRank;
tags[subCount] = resources->tpRemoteRank;
mhandles[subCount] = resources->mhandles[p];
subCount++;
}
@ -1066,7 +1109,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
uint64_t step = subGroup->posted;
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
if (*requestPtr) {
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup+i;
@ -1088,7 +1131,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sizes[NCCL_PROXY_MAX_SUBS];
void* mhandles[NCCL_PROXY_MAX_SUBS];
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
NCCLCHECK(proxyState->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes));
if (done) {
int needFlush = 0;
int totalSize = 0;
@ -1129,7 +1172,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
}
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
}
}
args->idle = 0;
@ -1144,7 +1187,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
uint64_t step = subGroup->transmitted;
int done = 1;
void* request = subGroup->requests[step%NCCL_STEPS];
if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL));
if (done) {
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup + i;

View File

@ -99,6 +99,7 @@ static void* ncclIbAsyncThreadMain(void* args) {
}
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
char devicePath[PATH_MAX];
@ -110,7 +111,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
// Merge multi-port NICs into the same PCI device
p[strlen(p)-1] = '0';
// Also merge virtual functions (VF) into the same device
p[strlen(p)-3] = '0';
if (ncclParamIbMergeVfs()) p[strlen(p)-3] = '0';
// And keep the real port aside (the ibv port is always 1 on recent cards)
*realPort = 0;
for (int d=0; d<ncclNIbDevs; d++) {
@ -381,16 +382,25 @@ struct ncclIbHandle {
struct ncclIbCommStage stage; // Used by the other side when connecting
};
// Retain local and remote RoCE addresses for error logging
struct ncclIbGidInfo {
uint8_t link_layer;
union ibv_gid localGid;
union ibv_gid remoteGid;
};
#define NCCL_NET_IB_REQ_UNUSED 0
#define NCCL_NET_IB_REQ_SEND 1
#define NCCL_NET_IB_REQ_RECV 2
#define NCCL_NET_IB_REQ_FLUSH 3
const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" };
struct ncclIbRequest {
struct ncclIbVerbs* verbs;
int type;
int events;
struct ncclSocket* sock;
struct ncclIbGidInfo* gidInfo;
int nreqs;
union {
struct {
@ -440,8 +450,10 @@ struct ncclIbSendComm {
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
int qpIndex;
struct ibv_mr* fifoMr;
int ar;
struct ncclIbGidInfo gidInfo;
};
// The SendFifo needs to be 32-byte aligned and each element needs
// to be a 32-byte multiple, so that an entry does not get split and
@ -474,7 +486,9 @@ struct ncclIbRecvComm {
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
int qpIndex;
struct ncclIbGpuFlush gpuFlush;
struct ncclIbGidInfo gidInfo;
};
static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
@ -648,15 +662,14 @@ ib_connect_check:
// RoCE support
qpInfo.lid = portAttr.lid;
qpInfo.link_layer = portAttr.link_layer;
qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer;
if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
for (int q=0; q<comm->nqps; q++)
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
} else { // RoCE
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
qpInfo.spn = gid.global.subnet_prefix;
qpInfo.iid = gid.global.interface_id;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid));
qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix;
qpInfo.iid = comm->gidInfo.localGid.global.interface_id;
for (int q=0; q<comm->nqps; q++)
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
}
@ -682,6 +695,8 @@ ib_connect:
memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
@ -743,6 +758,9 @@ ib_recv:
/* copy back the received info */
memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
// IB setup
struct ibv_context* ctx;
uint8_t ib_port;
@ -750,8 +768,7 @@ ib_recv:
ib_port = ncclIbDevs[lComm->dev].port;
struct ibv_port_attr portAttr;
NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid));
// QP Creation
NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
@ -789,8 +806,8 @@ ib_recv:
localQpInfo.lid=portAttr.lid;
localQpInfo.link_layer=portAttr.link_layer;
localQpInfo.ib_port=ib_port;
localQpInfo.spn=gid.global.subnet_prefix;
localQpInfo.iid=gid.global.interface_id;
localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
localQpInfo.mtu=portAttr.active_mtu;
NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo));
NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp));
@ -799,11 +816,11 @@ ib_recv:
// Fill Handle
struct ncclIbQpInfo qpInfo;
qpInfo.lid=portAttr.lid;
qpInfo.link_layer=portAttr.link_layer;
qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer;
qpInfo.ib_port=ib_port;
for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
qpInfo.spn=gid.global.subnet_prefix;
qpInfo.iid=gid.global.interface_id;
qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
qpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
qpInfo.mtu=remQpInfo.mtu;
stage->state = ncclIbCommStateSend;
@ -841,6 +858,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
r->verbs = verbs;
r->events = 1;
r->sock = NULL;
r->gidInfo = NULL;
*req = r;
return ncclSuccess;
}
@ -945,6 +963,8 @@ returning:
return res;
}
NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1);
ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
@ -1000,9 +1020,10 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
// Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work
const int align = 128;
for (int q=0; q<comm->nqps; q++) {
const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
for (int q=0; q<nqps; q++) {
for (int r=0; r<nreqs; r++) {
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
if (length <= 0) {
comm->wrs[r].sg_list = NULL;
@ -1014,10 +1035,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
}
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr));
NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr));
comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
for (int r=0; r<nreqs; r++) {
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
reqs[r]->send.offset += chunkSize;
comm->sges[r].addr += chunkSize;
comm->wrs[r].wr.rdma.remote_addr += chunkSize;
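With NCCL_IB_SPLIT_DATA_ON_QPS=1 (the default above) each message is split across all QPs in 128-byte-aligned chunks; with the new round-robin mode (=0) nqps is forced to 1, so each message goes whole onto the current QP and successive posts rotate through comm->qpIndex. A worked example of the chunk arithmetic, using DIVUP(x,y) = (x+y-1)/y as elsewhere in NCCL:

static int ibChunkSize(int sendSize, int nqps) {
  const int align = 128;                          // keep LL/LL128 128B lines intact
  return DIVUP(DIVUP(sendSize, nqps), align) * align;
}
// Example: sendSize = 1 MiB, nqps = 4 -> DIVUP(262144, 128) * 128 = 262144 bytes per QP.
// Each post then advances offset/addr/remote_addr by chunkSize, and the final chunk is
// clamped by length = min(size - offset, chunkSize).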
@ -1077,7 +1099,8 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
req->send.data = data;
req->send.lkey = mr->lkey;
req->send.offset = 0;
req->events = comm->nqps;
req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
*request = reqs[r] = req;
// If this is a multi-recv, send only when all requests have matched.
@ -1171,6 +1194,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
req->type = NCCL_NET_IB_REQ_RECV;
req->sock = &comm->sock;
req->nreqs = n;
if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
struct ibv_recv_wr wr;
@ -1181,13 +1205,15 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
wr.num_sge = 0;
TIME_START(1);
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
for (int q=0; q<nqps; q++) {
struct ibv_qp* qp = comm->qps[comm->qpIndex];
struct ibv_recv_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
}
TIME_STOP(1);
req->events = comm->nqps;
req->events = nqps;
*request = req;
@ -1258,8 +1284,16 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
char line[SOCKET_NAME_MAXLEN+1];
union ncclSocketAddress addr;
ncclSocketGetAddr(r->sock, &addr);
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
char localGidString[INET6_ADDRSTRLEN] = "";
char remoteGidString[INET6_ADDRSTRLEN] = "";
const char* localGidStr = NULL, *remoteGidStr = NULL;
if (r->gidInfo) {
localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, sizeof(localGidString));
remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString));
}
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s",
ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString);
return ncclRemoteError;
}
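The extended WARN above prints the local and remote GIDs on RoCE so the failing peer can be identified from the log. A GID is 16 bytes, which is exactly an IPv6 address, hence the inet_ntop rendering; a minimal standalone sketch (gidToString is a hypothetical helper, not an NCCL function):

#include <arpa/inet.h>
#include <infiniband/verbs.h>

// Render a RoCE GID the same way the WARN above does: as an IPv6 literal.
static const char* gidToString(const union ibv_gid* gid, char* buf, size_t len) {
  // ibv_gid.raw is 16 bytes, i.e. the size of an IPv6 address.
  return inet_ntop(AF_INET6, gid->raw, buf, (socklen_t)len);
}
// Usage: char s[INET6_ADDRSTRLEN]; gidToString(&gidInfo.localGid, s, sizeof(s));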

View File

@ -43,22 +43,7 @@ struct ncclTransport nvlsTransport = {
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};
#define NVLS_HANDLE_SIZE 64
struct nvlsResources {
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
};
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) {
CUmulticastObjectProp* prop = &resources->properties;
memset(prop, 0, sizeof(*prop));
prop->size = size;
@ -81,7 +66,7 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* reso
return ncclSuccess;
}
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) {
size_t size = resources->size;
// Create a Multicast group
@ -103,24 +88,13 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resour
return ncclSuccess;
}
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
int dev = resources->dev;
size_t size = resources->size;
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
@ -131,9 +105,11 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou
int fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
struct ncclProxyConnector proxyConn;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
int tpProxyRank = comm->topParentRanks[rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle));
fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
} else {
@ -146,7 +122,20 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou
return ncclSuccess;
}
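nvlsGroupConnect asks the owning rank's proxy to "convert" the POSIX file descriptor found in the shareable handle, because an fd number is only meaningful inside the process that exported it. Conceptually, that kind of conversion rests on passing the descriptor over a Unix domain socket with SCM_RIGHTS; the sketch below shows the standard POSIX mechanism, not NCCL's ipcsocket implementation verbatim:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

// Send one open fd to a peer process over a connected AF_UNIX socket.
static int sendFd(int sock, int fd) {
  char dummy = 0;
  struct iovec iov = { &dummy, 1 };
  char ctrl[CMSG_SPACE(sizeof(int))];
  struct msghdr msg; memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov; msg.msg_iovlen = 1;
  msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);
  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;                  // kernel duplicates the fd into the receiver
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
  return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}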
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
// If the handle was shared as a POSIX fd, close our local copy of that fd
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// cuMem UDS support
int fd = *(int *)resources->shareableHandle;
(void) close(fd);
}
return ncclSuccess;
}
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
size_t size = resources->size;
size_t granularity;
CUdeviceptr ptr = 0;
@ -178,7 +167,21 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resou
return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
int dev = resources->dev;
size_t size = resources->size;
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
// Release the MC group resources
NCCLCHECK(nvlsGroupDisconnect(comm, resources));
return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
size_t size = resources->size;
CUdeviceptr ptr = 0;
@ -196,7 +199,7 @@ ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resour
return ncclSuccess;
}
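Taken together, the helpers above implement the NVLS multicast lifecycle: one rank creates the multicast object and exports its handle, every rank adds its GPU, binds local physical memory into the group, and finally maps both the unicast and multicast handles into its address space. A condensed one-rank sketch of that sequence, assuming the CUDA 12.1 driver API (cuMulticastBindMem and the property fields are written from memory of those headers; treat this as a sketch, with error checks elided):

static void nvlsLifecycleSketch(CUdevice dev, int nranks, size_t size) {
  CUmulticastObjectProp mcProp = {};
  mcProp.numDevices = nranks;                    // one GPU per local rank
  mcProp.size = size;                            // aligned to the multicast granularity
  mcProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  CUmemGenericAllocationHandle mcHandle, ucHandle;
  cuMulticastCreate(&mcHandle, &mcProp);         // rank 0 only; others import the exported handle
  cuMulticastAddDevice(mcHandle, dev);           // every rank adds its own GPU

  CUmemAllocationProp memProp = {};
  memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  memProp.location.id = dev;
  memProp.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
  cuMemCreate(&ucHandle, size, &memProp, 0);     // local physical backing (the "UC" side)
  cuMulticastBindMem(mcHandle, 0, ucHandle, 0, size, 0); // bind it into the group

  // Both ucHandle and mcHandle are then mapped the usual cuMem way
  // (cuMemAddressReserve + cuMemMap + cuMemSetAccess) to obtain ucBuff and mcBuff.
}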
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
size_t size;
CUdeviceptr ptr;
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
@ -224,135 +227,173 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* reso
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
comm->nvlsSupport = 0;
comm->nvlsChannels = 0;
int gpuCount;
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess;
CUdevice dev;
int driverVersion;
if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
CUCHECK(cuCtxGetDevice(&dev));
CUDACHECK(cudaDriverGetVersion(&driverVersion));
comm->nvlsSupport = 0;
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 0) return ncclSuccess;
int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
int rank = comm->localRank, nranks = comm->localRanks;
for (int c=0; c<nChannels; c++) {
NCCLCHECK(initChannel(comm, c));
}
ncclResult_t res = ncclSuccess;
struct nvlsResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
size_t nvlsTotalSize = nvlsPerRankSize*nranks;
INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* nvlsShareableHandle = NULL;
NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
if (rank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
if (ncclParamNvlsEnable() == 2) {
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL /*&& driverVersion >= 12010 */) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
comm->nvlsSupport = 1;
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels()));
return ncclSuccess;
}
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nranks;
for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
channel->nvls.down = comm->nRanks+1+comm->localRank;
channel->nvls.out = -1; // Network not yet implemented.
channel->nvls.headRank = comm->localRank; // Network not yet implemented.
}
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess;
for (int r=0; r<nranks; r++) {
int nvlsPeer = comm->nRanks+1+r;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.up[r] = nvlsPeer;
int nHeads = comm->channels[0].nvls.nHeads;
int headRank = comm->channels[0].nvls.headRank;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers+nvlsPeer;
CUdevice dev;
CUCHECK(cuCtxGetDevice(&dev));
// Reduce UC -> MC
mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;
ncclResult_t res = ncclSuccess;
bool nvlsShare = true;
if (parent && parent->nvlsSupport && parent->config.splitShare && parent->localRanks == comm->localRanks)
nvlsShare = true;
else
nvlsShare = false;
// Broadcast MC -> UC
mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;
if (nvlsShare) {
/* reuse NVLS resources */
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
for (int c = 0; c < comm->nvlsChannels; c++) {
NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup);
}
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
comm->nvlsResources = parent->nvlsResources;
ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount);
} else {
int nChannels;
ncclResult_t res = ncclSuccess;
struct ncclNvlsSharedRes* resources;
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
resources->refCount = 1;
if (parent && parent->config.splitShare) {
      /* Ranks on other nodes might share the NVLS resources, so cap nvlsChannels
       * to keep the channel count consistent across all ranks. */
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
}
nChannels = resources->nChannels = comm->nvlsChannels;
for (int c = 0; c < nChannels; c++) {
NCCLCHECK(initNvlsChannel(comm, c, parent, false));
}
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize);
size_t nvlsTotalSize = nvlsPerRankSize * nHeads;
INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* shareableHandle = resources->shareableHandle;
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup);
if (comm->localRank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup);
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
for (int h = 0; h < nHeads; h++) {
int nvlsPeer = comm->nRanks + 1 + h;
for (int c = 0; c < nChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers[nvlsPeer];
// Reduce UC -> MC
mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem + buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem + buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL;
// Broadcast MC -> UC
mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem + buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem + buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;
struct ncclDevChannelPeer* addr;
CUDACHECKGOTO(cudaMemcpyAsync(&addr, comm->channels[c].devPeers + nvlsPeer, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (h*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/
}
}
}
free(nvlsShareableHandle);
return res;
cleanup:
comm->nvlsSupport = 0;
free(nvlsShareableHandle);
return res;
}
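The loop above carves the unicast and multicast mappings into per-(head, channel) regions: each head owns 2*nChannels slots of (buffSize + memSize) bytes, the first nChannels for the reduce (UC then MC) direction and the next nChannels for broadcast. A worked example of the offsets, assuming for illustration buffSize = 4 MiB and memSize = NVLS_MEM_ALIGN_SIZE = 2 MiB with nChannels = 16:

// Offset of the buffer for head h, channel c, in the reduce (broadcast=0) or
// broadcast (broadcast=1) direction; matches the (h*2*nChannels + c) indexing above.
static size_t nvlsOffsetSketch(int h, int c, int broadcast, int nChannels,
                               size_t buffSize, size_t memSize) {
  return ((size_t)(h * 2 + broadcast) * nChannels + c) * (buffSize + memSize);
}
// h=1, c=0, reduce, nChannels=16, 6 MiB slots -> (1*2*16 + 0) * 6 MiB = 192 MiB,
// i.e. head 1 starts right after head 0's 2*16*6 MiB = 192 MiB region.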
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
struct ncclNvlsSharedRes* resources = (struct ncclNvlsSharedRes*)comm->nvlsResources;
if (resources == NULL) return ncclSuccess;
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) {
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
}
return ncclSuccess;
}
@ -362,7 +403,12 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
* Pre CUDA 12.1 stubs
*/
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
comm->nvlsChannels = 0;
return ncclSuccess;
}
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
return ncclSuccess;
}

View File

@ -8,17 +8,21 @@
#include "graph.h"
#include "utils.h"
#include "shm.h"
#include "p2p.h"
enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
struct ncclP2pBuff {
void* directPtr;
cudaIpcMemHandle_t devIpc;
size_t size;
ncclIpcDesc ipcDesc;
};
struct p2pConnectInfo {
int rank;
int read;
struct ncclP2pBuff p2pBuff;
// Use by CE memcpy
// Used by CE memcpy
char shmName[7];
int shmSize;
};
@ -28,7 +32,7 @@ struct p2pShm {
struct ncclSendMem sendMem;
struct ncclRecvMem recvMem;
};
struct p2pProxyInfo {
struct p2pShmProxyInfo {
// Shared memory between proxy and receiving GPU
struct p2pShm* shm;
struct p2pShm* devShm;
@ -43,30 +47,34 @@ struct p2pProxyInfo {
// Receiver buffer
char* recvFifo;
// Used by progress only
// Used by CE memcpy progress only
uint64_t step;
cudaStream_t stream;
cudaEvent_t events[NCCL_STEPS];
};
static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
struct p2pSendResources {
struct ncclSendMem* devMem;
void* sendMemIpc;
void* recvMemIpc;
struct p2pProxyInfo proxyInfo;
};
struct p2pRecvResources {
struct ncclRecvMem* devMem;
struct p2pResources {
enum p2pType type;
union {
struct ncclSendMem* sendDevMem;
struct ncclRecvMem* recvDevMem;
};
void* sendMemIpc;
void* recvMemIpc;
// CE memcpy support
struct p2pShmProxyInfo proxyInfo;
struct p2pShm* shm;
struct p2pShm* devShm;
int shmSize;
ncclShmHandle_t handle;
};
// cuMem API support
struct p2pCuMemProxyInfo {
struct ncclP2pBuff p2pBuff;
};
#include <sys/types.h>
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
@ -86,6 +94,7 @@ static int busIdToCudaDev(int64_t busId) {
return -1;
}
// CE memcpy support
NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
static int useMemcpy = 0;
static void initCeOperation();
@ -140,7 +149,8 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
if (p2p != 0) {
// This will always fail when using NCCL_CUMEM_ENABLE=1
if (p2p != 0 && !ncclCuMemEnable()) {
// Cached result of the legacyIPC detection
static int legacyIPC = -1;
if (legacyIPC >= 0) {
@ -150,12 +160,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Check that legacy IPC support is available (WSL WAR)
char *dummy;
cudaIpcMemHandle_t ipc;
NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN));
NCCLCHECK(ncclCudaMalloc(&dummy, CUDA_IPC_MIN));
if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported");
*ret = 0;
}
CUDACHECK(cudaFree(dummy));
NCCLCHECK(ncclCudaFree(dummy));
legacyIPC = *ret;
return ncclSuccess;
}
@ -176,6 +186,98 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
} while (0)
// cuMem API support
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
// cuMem API support
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
CUmemGenericAllocationHandle handle;
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
#else
return ncclInternalError;
#endif
} else {
// Allocate a CUDA buffer and generate an IPC handle for it
NCCLCHECK(ncclCudaCalloc((char **)ptr, size));
cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
if (res != cudaSuccess) {
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
ncclCudaFree(*ptr);
CUDACHECK(res);
}
}
INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc);
return ncclSuccess;
}
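/*
 * Illustrative sketch, not part of this diff: ncclCuMemAlloc() used above is
 * assumed to wrap the CUDA driver VMM calls roughly as below -- create an
 * exportable physical allocation, reserve a VA range, map it, and grant the
 * local device read/write access. The handle it fills in is what
 * cuMemExportToShareableHandle() then converts into ipcDesc->cuDesc above.
 * Assumptions: NCCL_P2P_HANDLE_TYPE resolves to
 * CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR on Linux; error unwinding omitted.
 */
static ncclResult_t cuMemAllocSketch(void** ptr, CUmemGenericAllocationHandle* handle, size_t size, int cudaDev) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = cudaDev;
  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  // Sizes must be rounded up to the allocation granularity (cf. ALIGN_SIZE below).
  size_t granularity = 0;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  size = ((size + granularity - 1) / granularity) * granularity;

  CUdeviceptr dptr = 0;
  CUCHECK(cuMemCreate(handle, size, &prop, 0));
  CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
  CUCHECK(cuMemMap(dptr, size, /* offset */ 0, *handle, /* flags */ 0));

  // Allow read/write access from the owning GPU.
  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = cudaDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));

  *ptr = (void*)dptr;
  return ncclSuccess;
}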
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
if (ncclCuMemEnable()) {
// cuMem API support
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
int fd = *(int *) &ipcDesc->cuDesc.data;
if (fd <= 0) return ncclInternalError;
(void) close(fd);
}
}
return ncclSuccess;
}
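/*
 * Illustrative sketch, not part of this diff: the function above only closes
 * the exported file descriptor; the buffer itself is released elsewhere via
 * ncclCudaFree (see p2pSendProxyFree/p2pRecvProxyFree below). For a cuMem
 * mapping that release is assumed to boil down to the following, where size
 * must equal the mapped size and handle is the one cuMemCreate (or
 * cuMemImportFromShareableHandle, on the importing side) returned; if only the
 * pointer is kept, the handle can be recovered with cuMemRetainAllocationHandle.
 */
static ncclResult_t cuMemFreeSketch(void* ptr, CUmemGenericAllocationHandle handle, size_t size) {
  CUdeviceptr dptr = (CUdeviceptr)ptr;
  CUCHECK(cuMemUnmap(dptr, size));        // detach the physical allocation from the VA range
  CUCHECK(cuMemAddressFree(dptr, size));  // return the VA reservation
  CUCHECK(cuMemRelease(handle));          // drop the reference to the physical allocation
  return ncclSuccess;
}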
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
// cuMem API support
CUdeviceptr dptr = 0;
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
CUmemGenericAllocationHandle handle;
ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// UDS fd support
struct ncclProxyConnector proxyConn;
int fd = *(int *)(&cuDesc->data);
int newFd = -1;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, &newFd));
INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", fd, newFd, tpPeer);
CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)newFd, type));
close(newFd);
} else {
CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
}
CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0));
TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)handle, (void*)dptr);
// Allow access by the local GPU
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = comm->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
*devMemPtr = (void *)dptr;
#else
return ncclInternalError;
#endif
} else {
// Legacy CUDA IPC
CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess));
}
INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr);
return ncclSuccess;
}
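/*
 * Illustrative sketch, not part of this diff: the UDS conversion above
 * (ncclProxyClientConvertFdBlocking) needs to hand a POSIX file descriptor to
 * another process over a Unix domain socket. The standard mechanism for that
 * is SCM_RIGHTS ancillary data, roughly as below; the helper names are made up
 * for illustration and error handling is minimal.
 */
#include <sys/socket.h>
#include <string.h>

// Send one file descriptor over a connected AF_UNIX socket.
static int sendFdSketch(int sock, int fd) {
  char payload = 0;                                // at least one byte of regular data is required
  struct iovec iov = { &payload, sizeof(payload) };
  char ctrl[CMSG_SPACE(sizeof(int))] = {};
  struct msghdr msg = {};
  msg.msg_iov = &iov; msg.msg_iovlen = 1;
  msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);
  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));       // the kernel duplicates fd into the receiver
  return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}

// Receive the file descriptor sent by sendFdSketch(); the result is a new
// descriptor in this process referring to the same open file description.
static int recvFdSketch(int sock, int* fd) {
  char payload;
  struct iovec iov = { &payload, sizeof(payload) };
  char ctrl[CMSG_SPACE(sizeof(int))] = {};
  struct msghdr msg = {};
  msg.msg_iov = &iov; msg.msg_iovlen = 1;
  msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);
  if (recvmsg(sock, &msg, 0) <= 0) return -1;
  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS) return -1;
  memcpy(fd, CMSG_DATA(cmsg), sizeof(int));
  return 0;
}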
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
@ -192,10 +294,11 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
return ncclSuccess;
}
static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (myInfo->pidHash == peerInfo->pidHash) {
static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
if (peerInfo->cudaDev != myInfo->cudaDev) {
// Enable P2P access
// Same PID different GPUs, enable P2P access
// Legacy CUDA IPC
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
@ -208,8 +311,15 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess));
*ipcPtr = *devMem;
if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) {
// Same PID and GPU
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
// Different PID or different GPU
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
*ipcPtr = *devMem;
}
}
return ncclSuccess;
}
@ -217,7 +327,8 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct p2pSendResources* resources;
struct p2pResources* resources;
int tpProxyRank;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int useRead, intermediateRank;
@ -233,35 +344,47 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
if (info->read) sendSize += comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
resources->type = P2P_DIRECT;
send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else {
// cuMem API support
if (ncclCuMemEnable()) {
resources->type = P2P_CUMEM;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s",
channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr, useMemcpy ? "/CE" : "");;
} else {
// Legacy CUDA IPC
resources->type = P2P_IPC;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "");
}
send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "");
}
} else {
resources->type = P2P_INTERMEDIATE;
info->rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr);
comm->peerInfo[intermediateRank].busId, useReadStr);
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
if (useMemcpy) {
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
} else {
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
}
return ncclSuccess;
@ -270,7 +393,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
struct p2pRecvResources* resources;
struct p2pResources* resources;
int tpProxyRank;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int useRead, intermediateRank;
@ -284,44 +408,56 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int recvSize = sizeof(struct ncclRecvMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
resources->type = P2P_DIRECT;
recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
if (ncclCuMemEnable()) {
// cuMem API support
resources->type = P2P_CUMEM;
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/CUMEM",
channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
} else {
// Legacy CUDA IPC
resources->type = P2P_IPC;
}
recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}
} else {
resources->type = P2P_INTERMEDIATE;
info->rank = intermediateRank;
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
return ncclSuccess;
}
/* Connect/Send to this peer */
static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pResources* resources = (struct p2pResources*)send->transportResources;
struct ncclRecvMem* remDevMem = NULL;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
char* buff = (char*)(remDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
send->conn.buffs[p] = (char*)(resources->devMem+1);
if (resources->sendDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
send->conn.buffs[p] = (char*)(resources->sendDevMem+1);
} else {
send->conn.buffs[p] = buff;
buff += send->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
}
@ -330,20 +466,20 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
// Send SIMPLE buff to proxy, and replace it by local buffer
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
} else {
send->conn.tail = &remDevMem->tail;
send->conn.head = &resources->devMem->head;
send->conn.ptrExchange = &resources->devMem->ptrExchange;
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
send->conn.head = &resources->sendDevMem->head;
send->conn.ptrExchange = &resources->sendDevMem->ptrExchange;
send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange;
}
return ncclSuccess;
}
/* Connect/Recv from this peer */
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct p2pResources* resources = (struct p2pResources*)recv->transportResources;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
struct ncclSendMem* remDevMem = NULL;
@ -353,20 +489,22 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
resources->shmSize = info->shmSize;
// Attach to peer's SHM segment
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
recv->conn.tail = &resources->devShm->recvMem.tail;
recv->conn.head = &resources->devShm->sendMem.head;
} else {
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
recv->conn.tail = &resources->devMem->tail;
struct ncclRecvMem* devMem = resources->recvDevMem;
recv->conn.tail = &devMem->tail;
recv->conn.head = &remDevMem->head;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
}
char* buff = (char*)(resources->devMem+1);
char* buff = (char*)(resources->recvDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
@ -374,93 +512,113 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
recv->conn.buffs[p] = (char*)(remDevMem+1);
} else {
recv->conn.buffs[p] = buff;
buff += recv->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
}
return ncclSuccess;
}
ncclResult_t p2pSendFree(struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct p2pResources* resources = (struct p2pResources*)send->transportResources;
if (resources) {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
}
free(resources);
}
return ncclSuccess;
}
ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct p2pResources* resources = (struct p2pResources*)recv->transportResources;
if (resources) {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->handle));
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->handle));
}
}
free(resources);
}
return ncclSuccess;
}
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (useMemcpy) {
struct p2pProxyInfo* proxyInfo;
// CE memcpy support
struct p2pShmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
connection->transportResources = proxyInfo;
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
char shmPath[PATH_MAX];
shmPath[0] = '\0';
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
// Create a SHM segment for the peer to attach to
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
} else {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size));
connection->transportResources = p2pBuff->directPtr;
cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
if (res != cudaSuccess) {
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
cudaFree(p2pBuff->directPtr);
free(p2pBuff);
CUDACHECK(res);
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
struct p2pCuMemProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff));
connection->transportResources = proxyInfo;
} else {
connection->transportResources = p2pBuff->directPtr;
}
}
*done = 1;
return ncclSuccess;
}
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size));
connection->transportResources = p2pBuff->directPtr;
cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
if (res != cudaSuccess) {
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
cudaFree(p2pBuff->directPtr);
free(p2pBuff);
CUDACHECK(res);
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
struct p2pCuMemProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff));
connection->transportResources = proxyInfo;
} else {
connection->transportResources = p2pBuff->directPtr;
}
*done = 1;
return ncclSuccess;
}
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
if (reqSize != sizeof(void*)) return ncclInternalError;
proxyInfo->recvFifo = *((char**)reqBuff);
@ -473,13 +631,14 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection,
return ncclSuccess;
}
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
// CE memcpy support
if (useMemcpy) {
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
if (proxyInfo) {
NCCLCHECK(ncclShmClose(proxyInfo->handle));
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(proxyInfo->events[i]));
@ -487,23 +646,45 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
free(proxyInfo);
}
} else {
// Do not check return code as CUDA may have already shut down
cudaFree(connection->transportResources);
if (ncclCuMemEnable()) {
// cuMem API support
struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources;
if (proxyInfo) {
struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc);
ncclCudaFree(p2pBuff->directPtr);
free(proxyInfo);
}
} else {
// Do not check return code as CUDA may have already shut down
ncclCudaFree(connection->transportResources);
}
}
return ncclSuccess;
}
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
// Do not check return code as CUDA may have already shut down
cudaFree(connection->transportResources);
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
if (ncclCuMemEnable()) {
struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources;
if (proxyInfo) {
struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc);
ncclCudaFree(p2pBuff->directPtr);
free(proxyInfo);
}
} else {
// Do not check return code as CUDA may have already shut down
ncclCudaFree(connection->transportResources);
}
return ncclSuccess;
}
static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
// CE memcpy support
static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
@ -513,10 +694,10 @@ static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxy
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
resources->step = sub->base + sub->nsteps;
args->done++;

src/transport/shm.cc View File

@ -85,7 +85,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclSendMem);
if (shmLocality == SHM_SEND_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
@ -108,7 +108,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclRecvMem);
if (shmLocality == SHM_RECV_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
@ -146,7 +146,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = buff;
buff += send->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
send->conn.tail = &resources->devRemHostMem->tail;
send->conn.head = &resources->devHostMem->head;
@ -155,9 +155,11 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
}
if (useMemcpySend) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
int tpProxyRank;
tpProxyRank = comm->topParentRanks[comm->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
send->conn.tail = &proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@ -179,7 +181,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
recv->conn.buffs[p] = buff;
buff += recv->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
recv->conn.head = &resources->devRemHostMem->head;
recv->conn.tail = &resources->devHostMem->tail;
@ -187,7 +189,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
recv->conn.tail = &proxyInfo.ceRecvMem->tail;
}
@ -214,12 +216,12 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
for (int i=0; i<NCCL_STEPS; i++) {
@ -232,12 +234,12 @@ static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection,
return ncclSuccess;
}
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
for (int i=0; i<NCCL_STEPS; i++) {
@ -250,12 +252,12 @@ static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection,
return ncclSuccess;
}
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
CUDACHECK(cudaFree(resources->devFifo));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
@ -265,12 +267,12 @@ static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
CUDACHECK(cudaFree(resources->devFifo));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
@ -280,7 +282,7 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@ -294,7 +296,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
@ -339,7 +341,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy
return ncclSuccess;
}
static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@ -353,7 +355,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxy
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);