Fix crash when setting NCCL_MAX_P2P_NCHANNELS below nchannels.
Fix hang during sendrecv dynamic NVB connection establishment on
cubemesh topologies.
Add environment variable to only use SHARP on communicators beyond
a given number of ranks.
Add debug subsystem to trace memory allocations.
Fix compilation with TRACE=1. (Issue #505)
Sylvain Jeaugey 2021-05-11 18:16:30 -07:00
parent ca8485b0d0
commit 3fec2fa5ee
14 changed files with 142 additions and 52 deletions
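Two of the changes above surface as environment variables: the SHARP threshold maps to NCCL_COLLNET_NODE_THRESHOLD (see the NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2) addition below; the default is 2 nodes), and the allocation tracing is a new ALLOC token for NCCL_DEBUG_SUBSYS. A minimal sketch of driving the new knobs, with illustrative values; the variables must be set before the first NCCL call:

// Sketch (illustrative values): exercising the knobs added in this commit.
#include <cstdlib>

int main() {
  setenv("NCCL_COLLNET_NODE_THRESHOLD", "4", 1); // use SHARP/CollNet only on >= 4 nodes
  setenv("NCCL_DEBUG", "INFO", 1);
  setenv("NCCL_DEBUG_SUBSYS", "INIT,ALLOC", 1);  // ALLOC is the new allocation-trace subsystem
  // ... create NCCL communicators as usual; allocations are then logged at INFO level.
  return 0;
}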

View File

@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
 NCCL_MINOR := 9
-NCCL_PATCH := 8
+NCCL_PATCH := 9
 NCCL_SUFFIX :=
 PKG_REVISION := 1

View File

@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -62,6 +62,8 @@ void ncclDebugInit() {
       mask = NCCL_TUNING;
     } else if (strcasecmp(subsys, "ENV") == 0) {
       mask = NCCL_ENV;
+    } else if (strcasecmp(subsys, "ALLOC") == 0) {
+      mask = NCCL_ALLOC;
     } else if (strcasecmp(subsys, "ALL") == 0) {
       mask = NCCL_ALL;
     }
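For context, NCCL_DEBUG_SUBSYS is a comma-separated list, and a leading '^' inverts the selection; the hunk above adds ALLOC to the existing token chain. A self-contained sketch of the selection logic (our illustration, not the NCCL source):

// Illustrative re-implementation of the subsystem-mask parsing, including the
// new ALLOC bit (256). "^INIT,ALLOC" would enable everything except those two.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <strings.h>

static uint64_t parseSubsys(char* env) {
  int invert = 0;
  if (env[0] == '^') { invert = 1; env++; }
  uint64_t mask = invert ? ~0ULL : 0ULL;
  for (char* tok = strtok(env, ","); tok; tok = strtok(NULL, ",")) {
    uint64_t bit = 0;
    if (strcasecmp(tok, "INIT") == 0) bit = 1;
    else if (strcasecmp(tok, "ALLOC") == 0) bit = 256; // new in this commit
    else if (strcasecmp(tok, "ALL") == 0) bit = ~0ULL;
    if (invert) mask &= ~bit; else mask |= bit;
  }
  return mask;
}

int main() {
  char env[] = "INIT,ALLOC";
  printf("mask = 0x%llx\n", (unsigned long long)parseSubsys(env)); // prints 0x101
  return 0;
}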

View File

@@ -133,7 +133,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
   // Because in cudaGraph mode the launch param needs to be determined
   // at capture time instead of launch time.
   if (!usingCudaGraph) {
-    for (int c=0; c<comm->p2pnChannels; c++) {
+    int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
+    for (int c=0; c<nChannels; c++) {
       if (comm->channels[c].workCount) params->gridDim.x = c+1;
     }
     eqInfo->maxChannels = params->gridDim.x;
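Context for the NCCL_MAX_P2P_NCHANNELS crash fix: clamping can now leave comm->p2pnChannels below comm->nChannels, so sizing the launch loop here (and the device channel array in devCommSetup further down) by p2pnChannels alone could skip channels that still carry collective work. A tiny sketch of the invariant, with assumed values:

// Sketch (assumed values): loops and allocations must cover the larger of the
// collective and p2p channel counts, since p2p may now be the smaller one.
#include <algorithm>
#include <cstdio>

int main() {
  int nChannels = 8;    // collective channels
  int p2pnChannels = 4; // clamped by NCCL_MAX_P2P_NCHANNELS=4
  printf("channels to cover: %d\n", std::max(nChannels, p2pnChannels)); // 8
  return 0;
}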
@@ -169,8 +170,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
     // GDRCOPY support
     uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
     uint64_t nelems = channel->workCount;
-    TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld last %ld nelems %zi",
-        channel->workFifo, channel->workFifoGdr, first, last, nelems);
+    TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi",
+        channel->workFifo, channel->workFifoGdr, first, nelems);
     for (int i = 0; i < nelems; i++) {
       int elem = (first+i) % NCCL_MAX_OPS;
@@ -799,6 +800,14 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
 #if CUDART_VERSION >= 11030
   cudaStreamCaptureStatus captureStatus;
   unsigned long long cudaGraphId;
+  if (comm->driverVersion < 11030) {
+    CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
+    if (captureStatus != cudaStreamCaptureStatusNone) {
+      WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+      return ncclInvalidUsage;
+    }
+    return ncclSuccess;
+  }
   CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
   if (captureStatus == cudaStreamCaptureStatusActive) {
     if (cudaGraphId != comm->lastCudaGraphId) {
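Applications can mirror this guard before capturing a stream that NCCL will later inspect; a minimal sketch (the helper name is ours, not an NCCL API):

// Sketch: NCCL's CUDA Graphs support needs an R465+ driver, i.e. a driver
// version of at least 11030 (CUDA 11.3), matching the check added above.
#include <cuda_runtime.h>

static bool graphCaptureSupported() {
  int driverVersion = 0;
  if (cudaDriverGetVersion(&driverVersion) != cudaSuccess) return false;
  return driverVersion >= 11030; // 11030 == CUDA 11.3 (R465)
}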

View File

@@ -29,6 +29,8 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
   return ncclInternalError;
 }
 
+NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
+
 static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
   if (baseNode->paths[baseNode->type] == NULL) {
     NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
@@ -63,7 +65,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
       // allow routing through a GPU only as 1 hop
       if (node != baseNode && node->type == GPU &&
-          (link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
+          (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
 
       if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
         // Find reverse link
@@ -529,3 +531,20 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
   INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
   return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) {
+  int ngpus = system->nodes[GPU].count;
+  NCCLCHECK(ncclCalloc(ranks, ngpus));
+  int nvbGpus = 0;
+  for (int g=0; g<ngpus; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    if (gpu->gpu.rank != rank) continue;
+    for (int p=0; p<ngpus; p++) {
+      if (gpu->paths[GPU][p].type == PATH_NVB) {
+        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
+      }
+    }
+  }
+  *nranks = nvbGpus;
+  return ncclSuccess;
+}
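Calling convention for the new ncclTopoGetNvbGpus(): the callee allocates *ranks with room for up to ngpus entries and the caller frees it. The usage pattern below is condensed from the NVB preconnect change later in this commit:

// Usage sketch (condensed from the preconnect hunk further down):
int nvbNpeers;
int* nvbPeers;
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
for (int r = 0; r < nvbNpeers; r++) {
  // nvbPeers[r] is a peer rank reachable through an intermediate GPU (PATH_NVB)
}
free(nvbPeers);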

View File

@@ -469,7 +469,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
     if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
     NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
   }
-  ncclDebugNoWarn = 1;
+  ncclDebugNoWarn = NCCL_GRAPH;
   NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
   if (index == -1) {
     if (path == NULL) getPciPath(busId, &path);

View File

@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

View File

@@ -16,6 +16,7 @@ template <typename T>
 static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
   CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
   memset(*ptr, 0, nelem*sizeof(T));
+  INFO(NCCL_ALLOC, "Cuda Host Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
   return ncclSuccess;
 }
@@ -33,6 +34,7 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
   }
   memset(p, 0, nelem*sizeof(T));
   *ptr = (T*)p;
+  INFO(NCCL_ALLOC, "Mem Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
   return ncclSuccess;
 }
@@ -45,6 +47,7 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
   CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
   CUDACHECK(cudaStreamSynchronize(stream));
   CUDACHECK(cudaStreamDestroy(stream));
+  INFO(NCCL_ALLOC, "Cuda Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
   return ncclSuccess;
 }
@@ -65,6 +68,7 @@ static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
   if (ret != 0) return ncclSystemError;
   memset(p, 0, size);
   *ptr = p;
+  INFO(NCCL_ALLOC, "Ib Alloc Size %ld pointer %p", size, *ptr);
   return ncclSuccess;
 }

View File

@@ -159,6 +159,7 @@ struct ncclComm {
   struct ncclQueueInfo* enqueueInfo;
   cudaGraphNode_t lastSetupNode;
   unsigned long long lastCudaGraphId;
+  int driverVersion;
 };
 #endif

View File

@@ -26,6 +26,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
 void ncclTopoFree(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
+ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
 
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);

View File

@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -19,7 +19,7 @@
 #define NCCL_NET_MAX_REQUESTS 8
 
 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
 
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

View File

@@ -60,4 +60,5 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 enum { collNetRecv=0, collNetSend=1 };
 int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
+ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
 #endif

View File

@@ -248,7 +248,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->nRanks = comm->hostDevComm.nRanks = ndev;
   cudaGetDevice(&comm->cudaDev);
   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
-  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
 
   comm->doneEvent = doneEvent;
   comm->intDoneEvent = intDoneEvent;
@@ -277,6 +277,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->lastSetupNode = NULL;
   comm->lastCudaGraphId = -1;
+  CUDACHECK(cudaDriverGetVersion(&comm->driverVersion));
+
   static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
   static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
   NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
@@ -295,11 +297,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 static ncclResult_t devCommSetup(ncclComm_t comm) {
   // Duplicate the channels on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
-  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
+  int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, nChannels));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels));
 
   // Copy userRanks and peers
-  for (int r=0; r<comm->p2pnChannels; r++) {
+  for (int r=0; r<comm->nChannels; r++) {
     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
   }
@@ -459,6 +462,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
 
 NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
+NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
+NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
 
 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
   // We use 2 AllGathers
@@ -579,7 +584,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
   }
 
-  // Determine CollNet support
+  // Determine local CollNet support before all-gather
   if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
   if (intraRanks > 8) {
     if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
@@ -687,6 +692,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
   }
 
+  // Determine CollNet support after all-gather now that we know nNodes
+  int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+  if (comm->nNodes < collNetNodeThreshold) {
+    if (comm->collNetSupport == 1)
+      INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
+    comm->collNetSupport = 0;
+  }
+
   int *rings;
   NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
   NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph));
@@ -727,6 +740,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
   }
   NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
+  free(rings);
   INFO(NCCL_INIT, "Connected all rings");
 
   // Connect Trees
@@ -759,27 +773,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
         collNetSetupFail = 1;
     }
-    // Verify CollNet setup across ranks after trying the first channel
-    if (c == 0) {
-      NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
-    }
   }
-  // Verify CollNet setup across ranks after trying all channels
-  NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
-  TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
-  // Connect intra-node CollNet
-  for (int c=0; c<comm->nChannels; c++) {
-    struct ncclChannel* channelRecv = comm->channels+c;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
-  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
-  for (int c=0; c<comm->nChannels; c++) {
-    struct ncclChannel* channelSend = comm->channels+c;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
-  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
-  INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
-collnet_cleanup:
   free(heads);
+  // Verify CollNet setup across ranks
+  NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
+  if (comm->collNetSupport) {
+    TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
+    for (int c=0; c<comm->nChannels; c++) {
+      struct ncclChannel* channelRecv = comm->channels+c;
+      NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0));
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0));
+    for (int c=0; c<comm->nChannels; c++) {
+      struct ncclChannel* channelSend = comm->channels+c;
+      NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1));
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1));
+    INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
+    if (ret != ncclSuccess) {
+      NCCLCHECK(ncclTransportCollNetFree(comm));
+      comm->collNetSupport = 0;
+      ret = ncclSuccess;
+    }
+  }
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
-  free(rings);
 
   // Compute time models for algorithm and protocol combinations
   NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
@@ -787,6 +811,32 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
+  if (ncclParamNvbPreconnect()) {
+    // Connect p2p when using NVB path
+    int nvbNpeers;
+    int* nvbPeers;
+    NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
+    for (int r=0; r<nvbNpeers; r++) {
+      int peer = nvbPeers[r];
+      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+          comm->connectRecv[peer] |= (1<<channelId);
+        }
+      }
+      delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+          comm->connectSend[peer] |= (1<<channelId);
+        }
+      }
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
+    free(nvbPeers);
+  }
+
   NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
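A worked example of the channel selection in the preconnect hunk above, with illustrative values and assuming comm->p2pChannels[c] == c:

// 8 ranks, 4 p2p channels; rank 1 preconnecting with peer 3.
#include <cstdio>

int main() {
  int nRanks = 8, rank = 1, peer = 3, p2pnChannels = 4;
  int recvDelta = (nRanks + (rank - peer)) % nRanks; // (8 + (1-3)) % 8 == 6
  int sendDelta = (nRanks - (rank - peer)) % nRanks; // (8 - (1-3)) % 8 == 2
  // For c == 0: channelId = (delta + p2pChannels[c]) % p2pnChannels
  printf("recv channel %d, send channel %d\n",
         (recvDelta + 0) % p2pnChannels,  // 6 % 4 == 2 -> sets bit (1<<2) in connectRecv[peer]
         (sendDelta + 0) % p2pnChannels); // 2 % 4 == 2 -> sets bit (1<<2) in connectSend[peer]
  return 0;
}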
@@ -916,7 +966,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
-  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
 
   // Try and prevent a double free of the comm struct (user error)
   if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {

View File

@@ -237,23 +237,26 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
   free(allGatherFailures);
   if (collNetSetupFail) {
     if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
-    // Free collNet resources
-    for (int r=0; r<comm->nChannels; r++) {
-      struct ncclChannel* channel = comm->channels+r;
-      struct ncclPeer* peer = channel->peers+nranks;
-      for (int b=0; b<NCCL_MAX_CONNS; b++) {
-        struct ncclConnector* send = peer->send + b;
-        if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
-        send->transportResources = NULL; // avoid double free
-      }
-      for (int b=0; b<NCCL_MAX_CONNS; b++) {
-        struct ncclConnector* recv = peer->recv + b;
-        if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
-        recv->transportResources = NULL; // avoid double free
-      }
-    }
     // Set support to 0
     comm->collNetSupport = 0;
     return ncclSystemError;
   }
   return ncclSuccess;
 }
+
+ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
+  // Free collNet resources
+  for (int r=0; r<comm->nChannels; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    struct ncclPeer* peer = channel->peers+comm->nRanks;
+    for (int b=0; b<NCCL_MAX_CONNS; b++) {
+      struct ncclConnector* send = peer->send + b;
+      if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+      send->transportResources = NULL; // avoid double free
+    }
+    for (int b=0; b<NCCL_MAX_CONNS; b++) {
+      struct ncclConnector* recv = peer->recv + b;
+      if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+      recv->transportResources = NULL; // avoid double free
+    }
+  }
+  return ncclSuccess;
+}

View File

@@ -371,7 +371,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
             // Data is ready, try to send.
             NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
-              TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
+              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
               sizesFifo[buffSlot] = -1;
               // Make sure size is reset to zero before we update the head.
               __sync_synchronize();
@@ -388,7 +388,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
         NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
         if (done) {
-          TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", sub->done, buffSlot, sub->requests[buffSlot]);
+          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
           sub->done += args->sliceSteps;
 
           if (resources->shared == 0) {
@@ -447,7 +447,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
         }
         NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
         if (sub->requests[buffSlot] != NULL) {
-          TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
+          TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
           sub->posted += args->sliceSteps;
           args->idle = 0;
           continue;
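These TRACE string changes belong to the TRACE=1 compile fix (Issue #505): with tracing compiled in, TRACE expands to a real printf-style call, so the `last` argument removed in the GDRCOPY message earlier references a variable that is not defined in that scope (a hard error), and %d applied to 64-bit step counters, plus the extra `size %d` with no matching argument, trips format checking. A minimal sketch of the failure mode:

// Sketch: why the old format strings broke a TRACE=1 build.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t done = 42;                            // proxy step counters are 64-bit
  // printf("request %d done, size %d\n", done);  // -Wformat: %d vs 64-bit arg, missing argument
  printf("request %ld done\n", (long)done);       // what the %ld fix above achieves
  return 0;
}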