Merge tag 'v2.15.1-1'

commit d128d62238
@@ -31,13 +31,17 @@ CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_61,code=sm_61
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
+CUDA11_8_GENCODE = -gencode=arch=compute_90,code=sm_90

 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
+CUDA11_8_PTX = -gencode=arch=compute_90,code=compute_90

-# Include Ampere support if we're using CUDA11 or above
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
+# Include Hopper support if we're using CUDA11.8 or above
+NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_8_GENCODE) $(CUDA11_8_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
 NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
 # Include Volta support if we're using CUDA9 or above
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
@@ -45,7 +49,7 @@ else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
 else
 NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
 endif
-#$(info NVCC_GENCODE is ${NVCC_GENCODE})
+$(info NVCC_GENCODE is ${NVCC_GENCODE})

 CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
             -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 14
-NCCL_PATCH := 3
+NCCL_MINOR := 15
+NCCL_PATCH := 1
 NCCL_SUFFIX :=
 PKG_REVISION := 1
@@ -27,7 +27,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
   NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream));
   ncclCommPushCudaFree(comm, channel->devRingUserRanks);

-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));

   for (int r=0; r < nRanks+1; ++r) {
     for (int b=0; b < NCCL_MAX_CONNS; b++) {
src/enqueue.cc (103 changed lines)
@@ -940,14 +940,33 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
     struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue);
     comm->unlaunchedPlansHead = planHead;

+    // Semantically we want these dependencies for the kernels launched:
+    // 1. Launch host task on hostStream.
+    // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...}
+    // 3. {deviceStream, userStream[i]...} depend on kernel.
+    // We achieve this by:
+    // 1. userStream[0] waits on deviceStream
+    // 2. deviceStream waits on each of userStream[1...]
+    // 3. host task launch on hostStream
+    // 4. userStream[0] waits on hostStream
+    // 5. kernel launch on userStream[0]
+    // 6. deviceStream waits on userStream[0]
+    // 7. userStream[1...] each waits on deviceStream
+    // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
+    // at least one of the two streams to be strong-stream.
+    cudaStream_t launchStream = tasks->streams->stream;
     NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure);

-    // Create dependency for nccl device work on user streams.
-    for (struct ncclCudaStreamList* l=tasks->streams; l != nullptr; l = l->next) {
+    // Create dependency for device stream on user streams. First from extra user
+    // streams to deviceStream. Then deviceStream to first user stream.
+    for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) {
       NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure);
     }
+    NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);

     if (persistent || comm->persistentRefs != 0) {
+      // We have to launch host tasks to push proxy args. We are careful to only
+      // do this if necessary since host tasks impose a high performance cost in CUDA.
       bool acquired = false;
       for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) {
         if (plan->hasProxyOps) {
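Note: the dependency recipe in the comments above can be reproduced with plain CUDA events. A minimal sketch, with hypothetical names (dev standing in for comm->deviceStream, user[] for the user streams) and none of NCCL's strong-stream machinery:

    // Two-level fan-in/fan-out with plain CUDA events (illustrative only).
    #include <cuda_runtime.h>

    void fanInFanOut(cudaStream_t dev, cudaStream_t* user, int n, cudaEvent_t ev) {
      // Fan-in: dev waits on each extra user stream...
      for (int i = 1; i < n; i++) {
        cudaEventRecord(ev, user[i]);
        cudaStreamWaitEvent(dev, ev, 0);
      }
      // ...then user[0] (the launch stream) waits on dev.
      cudaEventRecord(ev, dev);
      cudaStreamWaitEvent(user[0], ev, 0);
      // <kernel launch goes to user[0] here>
      // Fan-out: dev waits on user[0], then every other user stream waits on dev.
      cudaEventRecord(ev, user[0]);
      cudaStreamWaitEvent(dev, ev, 0);
      cudaEventRecord(ev, dev);
      for (int i = 1; i < n; i++) cudaStreamWaitEvent(user[i], ev, 0);
    }

Reusing one event is safe here because cudaStreamWaitEvent() snapshots the event's most recent record at call time.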
@@ -959,6 +978,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
         }
       }
       if (acquired) {
+        // Make to-be-launched kernels dependent on just-launched host stream tasks.
+        NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->hostStream), result, failure);
         NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure);
       }
     }
@@ -984,14 +1005,67 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
   return ncclSuccess;
 }

+#if CUDART_VERSION >= 11080
+#define NCCL_MAX_CGA_CLUSTER_SIZE 8
+NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", 0);
+#endif
+
 ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
   struct ncclTasks* tasks = &comm->tasks;
+  void *fn = plan->kernelFn;
+  cudaStream_t launchStream = tasks->streams->stream;
   dim3 grid = {(unsigned)plan->channelCount, 1, 1};
   dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
   void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};
-  NCCLCHECK(ncclStrongStreamLaunchKernel(
-    tasks->capturingGraph, &comm->deviceStream, plan->kernelFn, grid, block, args, 0
-  ));
+#if CUDART_VERSION >= 11080
+  int driverVersion;
+  NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
+
+  unsigned int clusterSize = 0;
+  clusterSize = ncclParamCGAClusterSize();
+  if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
+    static bool warned = false;
+    if (warned == false) {
+      WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
+           clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
+      warned = true;
+    }
+    clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
+  }
+
+  if (clusterSize && driverVersion >= 11080) {
+    cudaLaunchConfig_t launchConfig = {0};
+    cudaLaunchAttribute launchAttrs[2];
+    /* Cooperative Group Array (CGA)
+     * On sm90 and later we have an extra level of hierarchy where we
+     * can group together several blocks within the Grid, called
+     * Thread Block Clusters.
+     * Clusters enable multiple thread blocks running concurrently
+     * across multiple SMs to synchronize and collaboratively fetch
+     * and exchange data. A cluster of blocks are guaranteed to be
+     * concurrently scheduled onto a group of SMs.
+     * The maximum value is 8 and it must be divisible into the grid dimensions
+     */
+    // Grid dimension must be divisible by clusterSize
+    if (grid.x % clusterSize) clusterSize = 1;
+    launchAttrs[0].id = cudaLaunchAttributeClusterDimension;
+    launchAttrs[0].val.clusterDim = {clusterSize, 1, 1};
+    launchAttrs[1].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
+    launchAttrs[1].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
+
+    launchConfig.gridDim = grid;
+    launchConfig.blockDim = block;
+    launchConfig.attrs = launchAttrs;
+    launchConfig.numAttrs = sizeof(launchAttrs)/sizeof(launchAttrs[0]);
+    launchConfig.stream = launchStream;
+
+    CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
+    return ncclSuccess;
+  }
+#endif
+  // Standard kernel launch
+  CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
   return ncclSuccess;
 }

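Note: cudaLaunchKernelExC() and the cluster attributes used above are public CUDA runtime API from 11.8 on. A self-contained sketch of the same launch path, with a hypothetical kernel and sizes:

    #include <cuda_runtime.h>

    __global__ void clusterKernel(int x) { (void)x; }

    // Launch with a cluster of thread blocks (needs sm_90 and CUDA >= 11.8).
    cudaError_t launchWithCluster(unsigned int clusterSize, cudaStream_t stream) {
      dim3 grid = {8, 1, 1}, block = {128, 1, 1};
      if (clusterSize == 0) clusterSize = 1;       // avoid modulo by zero
      if (grid.x % clusterSize) clusterSize = 1;   // grid must divide evenly
      cudaLaunchAttribute attrs[1];
      attrs[0].id = cudaLaunchAttributeClusterDimension;
      attrs[0].val.clusterDim = {clusterSize, 1, 1};
      cudaLaunchConfig_t cfg = {0};
      cfg.gridDim = grid;
      cfg.blockDim = block;
      cfg.attrs = attrs;
      cfg.numAttrs = 1;
      cfg.stream = stream;
      int x = 0;
      void* args[] = {&x};
      return cudaLaunchKernelExC(&cfg, (void*)clusterKernel, args);
    }

In NCCL the cluster size comes from the NCCL_CGA_CLUSTER_SIZE environment variable (the NCCL_PARAM above), clamped to 8.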
@@ -1017,17 +1091,21 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
     // Reset queue to empty without destroying plans since those will be sent
     // back to us for reclaiming via callbackQueue.
     ncclIntruQueueConstruct(&comm->planQueue);
-    // Close strong stream "transaction" encompassing cuda launches
-    NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume1);
+    cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch
+    // Create dependency for deviceStream on launchStream.
+    NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream), result, resume1);
   resume1:
-    // Create dependency for user streams on nccl device work.
-    struct ncclCudaStreamList* sl = tasks->streams;
-    tasks->streams = nullptr; // reset streams to empty
+    // Create dependency for other user streams (skip launch stream).
+    struct ncclCudaStreamList* sl = tasks->streams->next;
+    tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
     while (sl != nullptr) {
       NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream), result, resume2);
     resume2:
       sl = sl->next;
     }
+    // Release device stream as acquired in ncclLaunchPrepare()
+    NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume3);
+  resume3:;
   }
   return result;
 }
@@ -1364,12 +1442,12 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
       NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
       if (isSendNotRecv) {
         if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
-          comm->connectSend[peer] |= (1<<channelId);
+          comm->connectSend[peer] |= (1UL<<channelId);
           ncclGroupCommPreconnect(comm);
         }
       } else {
         if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
-          comm->connectRecv[peer] |= (1<<channelId);
+          comm->connectRecv[peer] |= (1UL<<channelId);
           ncclGroupCommPreconnect(comm);
         }
       }
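Note: the switch from (1<<channelId) to (1UL<<channelId) matters once channel ids can reach 32: shifting the 32-bit literal 1 by 32 or more is undefined behavior, and widening the result afterwards does not repair it. A minimal illustration (hypothetical helper, not NCCL code):

    #include <cstdint>

    uint64_t channelBit(int channelId) {
      // BAD:  1 << channelId   -- the literal 1 is a 32-bit int, so the
      //       shift is undefined for channelId >= 32, even if the result
      //       is immediately stored into a uint64_t.
      // GOOD: promote to 64 bits before shifting.
      return uint64_t(1) << channelId;  // the diff spells this 1UL (64-bit on LP64 Linux)
    }

This is also why connectSend/connectRecv move from uint32_t to uint64_t in the header change below.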
@@ -1429,6 +1507,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
       }
       if (l->stream == info->stream)
         break; // Already seen stream.
+      l = l->next;
     }
   }
   return ncclSuccess;
@@ -399,6 +399,19 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
   return ncclSuccess;
 }

+// Set to 0 to disable the flush on Hopper when using GDR
+NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
+
+// Determine whether we need to flush the GDR recv buffers
+ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
+  int g;
+  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+  // Flush is required on Ampere and earlier
+  *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush();
+  return ncclSuccess;
+}
+
 NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);

 // Check whether going through the network would be faster than going through P2P/SHM.
@@ -727,7 +727,7 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
 }

 float speedArrayIntra[] = { 44.0, 30.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
-float speedArrayInter[] = { 48.0, 30.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))

@@ -72,10 +72,24 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
   /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 } }
 };

+/* Array indexes used below */
+#define VOLTA_COMPCAP_IDX 0
+#define AMPERE_COMPCAP_IDX 1
+#define HOPPER_COMPCAP_IDX 2
+
 // LL128 max BW per channel
 static const double ll128MaxBwPerCh = 20.0;
-static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
-static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8} };
+static const double llMaxBws[3][3] = {
+  /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
+  /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
+  /* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}
+};
+
+static const double perChMaxTreeBws[3][3] = {
+  /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
+  /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
+  /* Hopper (N1/N2/N4) */ {24.0, 23.6, 17.8},
+};

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
   int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
@@ -94,14 +108,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   int nRanks = comm->nRanks;
   if (nRanks <= 1) return ncclSuccess;

-  int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
+  int compCapIndex = (minCompCap == 80 && maxCompCap == 80) ? AMPERE_COMPCAP_IDX : ((minCompCap == 90 && maxCompCap == 90) ? HOPPER_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
   int cpuArch, cpuVendor, cpuModel;
   NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
   int index2 = nNodes <= 2 ? nNodes-1 : 2;
   // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
-  int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
+  int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
   double llMaxBw = llMaxBws[index1][index2];
-  double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
+  double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
   // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
   if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
   float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
@@ -128,7 +142,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       float busBw = graphs[a]->nChannels * bw;

       // Various model refinements
-      if (compCap80) busBw = std::min(busBw, 235.0f);
+      if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
       if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
       if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels);
       if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
@@ -212,9 +226,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     int pEnable = protoEnable[p];
     if (pEnable == 2 && p == NCCL_PROTO_LL128) {
-      // Enable LL128 by default only on Volta/Ampere+NVLink. Other cases are not tested and may cause silent data corruption.
+      // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
       pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
-        ((minCompCap == 70 && maxCompCap == 70) || (minCompCap == 80 && maxCompCap == 80)) ? 1 : 0;
+        ((minCompCap == 70 && maxCompCap == 70) || (minCompCap == 80 && maxCompCap == 80) || (minCompCap == 90 && maxCompCap == 90)) ? 1 : 0;
     }
     if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
     // Only disable algo for Allreduce since others only have one
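Note: the compCapIndex/index2 pair is a plain row/column lookup into the tables above. A worked example, with values copied from the diff and a hypothetical standalone function:

    // Row = GPU generation, column = node-count bucket (1 node, 2 nodes, >2 nodes).
    static const double llMaxBws[3][3] = {
      {39.0, 39.0, 20.4},  // Volta-N1 / Intel-N2 / Intel-N4
      {87.7, 22.5, 19.0},  // Ampere-N1 / AMD-N2 / AMD-N4
      {87.7, 22.5, 19.0},  // Hopper-N1 / AMD-N2 / AMD-N4
    };

    double llMaxBwFor(int compCapIndex, int nNodes) {
      int index2 = nNodes <= 2 ? nNodes - 1 : 2;  // 1 -> 0, 2 -> 1, else 2
      return llMaxBws[compCapIndex][index2];      // e.g. all-Hopper, 1 node: [2][0] == 87.7
    }

As in the code above, the GPU row is only used for single-node jobs; multi-node jobs index the first dimension by CPU vendor instead.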
@@ -628,7 +628,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
     NCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode));
     if (nvlNode == NULL) {
       // NVML NVLink detection
-      int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
+      int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18;

       if (maxNvLinks > 0 && nvmlDev == NULL) {
         WARN("No NVML device handle. Skipping nvlink detection.");
@@ -641,8 +641,21 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
         if ((ncclNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;

         // Make sure the Nvlink is up. The previous call should have trained the link.
-        nvmlEnableState_t isActive;
-        if ((ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+        nvmlEnableState_t isActive = NVML_FEATURE_DISABLED;
+#if CUDART_VERSION >= 11080
+        if (sm >= 90) {
+          nvmlFieldValue_t fv;
+          fv.fieldId = NVML_FI_DEV_NVLINK_GET_STATE;
+          fv.scopeId = l;
+          // fv.value will contain NV_FEATURE_ENABLED or NV_FEATURE_DISABLED
+          if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS))
+            isActive = (nvmlEnableState_t) fv.value.uiVal;
+        } else /* FALLTHRU to GetNvLinkState if before SM90 */
+#endif
+        {
+          (void) ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive);
+        }
+        if (isActive != NVML_FEATURE_ENABLED) continue;

         // Try to figure out what's on the other side of the NVLink
         nvmlPciInfo_t remoteProc;
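Note: the sm90 branch above uses NVML's field-value interface instead of nvmlDeviceGetNvLinkState(). A standalone sketch against the raw NVML API (same field as in the header diff below; assumes a driver recent enough to know field 165):

    #include <nvml.h>

    // Returns true when NVLink "link" on device "dev" is up, per the
    // field-value query used above for Hopper.
    bool nvlinkIsActive(nvmlDevice_t dev, unsigned int link) {
      nvmlFieldValue_t fv = {};
      fv.fieldId = NVML_FI_DEV_NVLINK_GET_STATE;  // 165 in the header diff
      fv.scopeId = link;
      if (nvmlDeviceGetFieldValues(dev, 1, &fv) != NVML_SUCCESS) return false;
      if (fv.nvmlReturn != NVML_SUCCESS) return false;
      return fv.value.uiVal == NVML_FEATURE_ENABLED;
    }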
@@ -211,8 +211,8 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
       for (int i = 0; i < comm->nRanks; i++) {
         comm->tasks.peers[i].sendSeen = false;
         comm->tasks.peers[i].recvSeen = false;
-        comm->connectSend[i] = 0;
-        comm->connectRecv[i] = 0;
+        comm->connectSend[i] = 0UL;
+        comm->connectRecv[i] = 0UL;
       }
       comm->unlaunchedPlansHead = nullptr;
       // Reclaim abandoned kernel plan memory. Note ncclWork structs were already
@@ -168,8 +168,8 @@ struct ncclComm {
   ncclCollNet_t* ncclCollNet;
   void* bootstrap;
   // Bitmasks for ncclTransportP2pSetup
-  uint32_t* connectSend;
-  uint32_t* connectRecv;
+  uint64_t* connectSend;
+  uint64_t* connectRecv;

   int rank;    // my rank in the communicator
   int nRanks;  // number of GPUs in communicator
@@ -8,6 +8,8 @@
 #define NCCL_CUDAWRAP_H_

 #include <cuda.h>
+#include <cuda_runtime.h>
+#include "checks.h"

 #if CUDART_VERSION >= 11030
 #include <cudaTypedefs.h>
@@ -83,6 +85,18 @@ DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020);
 DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);


-ncclResult_t cudaLibraryInit(void);
+ncclResult_t ncclCudaLibraryInit(void);

+extern int ncclCudaDriverVersionCache;
+
+inline ncclResult_t ncclCudaDriverVersion(int* driver) {
+  int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
+  if (version == -1) {
+    CUDACHECK(cudaDriverGetVersion(&version));
+    __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
+  }
+  *driver = version;
+  return ncclSuccess;
+}
+
 #endif
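Note: the cache added above is a deliberately benign race: every thread that misses writes the same value, so relaxed atomics are sufficient and no fence or lock is needed. The same pattern in std::atomic form, with hypothetical names:

    #include <atomic>

    static std::atomic<int> gVersionCache{-1};

    int cachedVersion(int (*query)()) {
      int v = gVersionCache.load(std::memory_order_relaxed);
      if (v == -1) {
        v = query();  // idempotent; concurrent callers may each run it once
        gVersionCache.store(v, std::memory_order_relaxed);
      }
      return v;
    }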
@@ -33,6 +33,7 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
+ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
@@ -107,6 +107,75 @@ typedef enum nvmlGpuP2PCapsIndex_enum
   NVML_P2P_CAPS_INDEX_UNKNOWN
 } nvmlGpuP2PCapsIndex_t;

+/**
+ * Represents the type for sample value returned
+ */
+typedef enum nvmlValueType_enum
+{
+  NVML_VALUE_TYPE_DOUBLE = 0,
+  NVML_VALUE_TYPE_UNSIGNED_INT = 1,
+  NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
+  NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
+  NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
+
+  // Keep this last
+  NVML_VALUE_TYPE_COUNT
+}nvmlValueType_t;
+
+
+/**
+ * Union to represent different types of Value
+ */
+typedef union nvmlValue_st
+{
+  double dVal;               //!< If the value is double
+  unsigned int uiVal;        //!< If the value is unsigned int
+  unsigned long ulVal;       //!< If the value is unsigned long
+  unsigned long long ullVal; //!< If the value is unsigned long long
+  signed long long sllVal;   //!< If the value is signed long long
+}nvmlValue_t;
+
+/**
+ * Field Identifiers.
+ *
+ * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
+ */
+
+/* NVLink Speed */
+#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links
+#define NVML_FI_DEV_NVLINK_LINK_COUNT        91 //!< Number of NVLinks present on the device
+
+/**
+ * Remote device NVLink ID
+ *
+ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
+ */
+#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID
+
+/**
+ * NVSwitch: connected NVLink count
+ */
+#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch
+
+#define NVML_FI_DEV_NVLINK_GET_SPEED   164
+#define NVML_FI_DEV_NVLINK_GET_STATE   165
+#define NVML_FI_DEV_NVLINK_GET_VERSION 166
+#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
+
+/**
+ * Information for a Field Value Sample
+ */
+typedef struct nvmlFieldValue_st
+{
+  unsigned int fieldId;      //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
+  unsigned int scopeId;      //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId.
+  long long timestamp;       //!< CPU Timestamp of this value in microseconds since 1970
+  long long latencyUsec;     //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call.
+  nvmlValueType_t valueType; //!< Type of the value stored in value
+  nvmlReturn_t nvmlReturn;   //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS
+  nvmlValue_t value;         //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
+} nvmlFieldValue_t;
+
 /* End of nvml.h */
 #endif // NCCL_NVML_DIRECT

@@ -135,4 +204,6 @@ ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
 ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
 ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
 ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
+ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
+
 #endif // End include guard
@@ -22,7 +22,7 @@ struct ncclCudaGraph {
 #endif
 };

-inline struct ncclCudaGraph ncclCudaGraphNull() {
+inline struct ncclCudaGraph ncclCudaGraphNone() {
   struct ncclCudaGraph tmp;
 #if CUDART_VERSION >= 11030
   tmp.graph = nullptr;
@@ -50,7 +50,6 @@ inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) {
 ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream);
 ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg);

-
 /* ncclStrongStream: An abstraction over CUDA streams that do not lose their
  * identity while being captured. Regular streams have the deficiency that the
  * captured form of a stream in one graph launch has no relation to the
@@ -88,7 +87,7 @@ ncclResult_t ncclStrongStreamAcquire(
 // Acquire-fence the strong stream assuming no graph is capturing. This permits
 // the caller to enqueue directly to the `ss->stream` member using native CUDA
 // calls. Strong stream must be released via:
-//   ncclStrongStreamRelease(ncclCudaGraphNull(), graphRefs, ss);
+//   ncclStrongStreamRelease(ncclCudaGraphNone(), ss);
 ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);

 // Release-fence of the strong stream.
src/init.cc (12 changed lines)
@@ -416,7 +416,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {

   NCCLCHECK(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.stream));
   CUDACHECK(cudaStreamSynchronize(comm->deviceStream.stream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
   return ncclSuccess;
 }

@@ -955,13 +955,13 @@ collnet_cleanup:
     for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
       NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
       if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
-        comm->connectSend[peer] |= (1<<channelId);
+        comm->connectSend[peer] |= (1UL<<channelId);
       }
     }
     for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
       NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
       if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
-        comm->connectRecv[peer] |= (1<<channelId);
+        comm->connectRecv[peer] |= (1UL<<channelId);
       }
     }
   }
@@ -1172,7 +1172,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   NVTX3_FUNC_RANGE_IN(nccl_domain);

   // Load the CUDA driver and dlsym hooks (can fail on old drivers)
-  (void) cudaLibraryInit();
+  (void)ncclCudaLibraryInit();

   int cudaDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
@@ -1187,7 +1187,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   int totalnDev;
   int *gpuFlags = NULL;
   // Load the CUDA driver and dlsym hooks (can fail on old drivers)
-  (void) cudaLibraryInit();
+  (void)ncclCudaLibraryInit();

   NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
   if (ndev < 0) {
@@ -1279,7 +1279,7 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
   }
   if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv;

-  (void) cudaLibraryInit();
+  (void)ncclCudaLibraryInit();
   CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit);
   NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);

@@ -38,7 +38,7 @@ DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
 #define CUDA_DRIVER_MIN_VERSION 11030

 static void *cudaLib;
-static int cudaDriverVersion;
+int ncclCudaDriverVersionCache = -1;

 #if CUDART_VERSION >= 11030
 /*
@@ -107,16 +107,17 @@ static void initOnceFunc() {
     goto error;
   }

-  res = pfn_cuDriverGetVersion(&cudaDriverVersion);
+  int driverVersion;
+  res = pfn_cuDriverGetVersion(&driverVersion);
   if (res != 0) {
     WARN("cuDriverGetVersion failed with %d", res);
     goto error;
   }

-  INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion);
+  INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion);

-  if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) {
-    // WARN("CUDA Driver version found is %d. Minimum requirement is %d", cudaDriverVersion, CUDA_DRIVER_MIN_VERSION);
+  if (driverVersion < CUDA_DRIVER_MIN_VERSION) {
+    // WARN("CUDA Driver version found is %d. Minimum requirement is %d", driverVersion, CUDA_DRIVER_MIN_VERSION);
     // Silently ignore version check mismatch for backwards compatibility
     goto error;
   }
@@ -148,7 +149,7 @@ error:
   return;
 }

-ncclResult_t cudaLibraryInit() {
+ncclResult_t ncclCudaLibraryInit() {
   pthread_once(&initOnceControl, initOnceFunc);
   return initResult;
 }
@@ -38,6 +38,7 @@ namespace {
   NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult))
   NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
   NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
+  NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))

   std::mutex lock; // NVML has had some thread safety bugs
   bool initialized = false;
@@ -80,7 +81,8 @@ ncclResult_t ncclNvmlEnsureInitialized() {
     {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
     {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
     {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
-    {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"}
+    {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
+    {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}
   };
   for(Symbol sym: symbols) {
     *sym.ppfn = dlsym(libhandle, sym.name);
@@ -260,3 +262,10 @@ ncclResult_t ncclNvmlDeviceGetP2PStatus(
   }
   return ncclSuccess;
 }
+
+ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values) {
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  std::lock_guard<std::mutex> locked(lock);
+  NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
+  return ncclSuccess;
+}
@@ -5,6 +5,7 @@
  ************************************************************************/

 #include "strongstream.h"
+#include "cudawrap.h"
 #include "checks.h"
 #include "param.h"

@@ -14,10 +15,8 @@ ncclResult_t ncclCudaGetCapturingGraph(
     struct ncclCudaGraph* graph, cudaStream_t stream
   ) {
 #if CUDART_VERSION >= 11030
-  thread_local int driver = -1;
-  if (driver == -1) {
-    CUDACHECK(cudaDriverGetVersion(&driver));
-  }
+  int driver;
+  NCCLCHECK(ncclCudaDriverVersion(&driver));
   if (driver < 11030) {
     cudaStreamCaptureStatus status;
     unsigned long long gid;
@@ -192,11 +191,11 @@ ncclResult_t ncclStrongStreamWaitStream(
       CUDACHECK(cudaEventRecord(b->event, b->stream));
     }
     CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
-    a->eventIsLagging = 1;
   } else {
     cudaGraphNode_t pair[2] = {a->node, b->node};
     CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
   }
+  a->eventIsLagging = 1;
 #else
   CUDACHECK(cudaEventRecord(b->event, b->stream));
   CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
@@ -232,9 +231,8 @@ ncclResult_t ncclStrongStreamWaitStream(
     }
     cudaGraphNode_t pair[2] = {a->node, tie};
     CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
+    a->eventIsLagging = 1;
   }
-  // a->eventIsLagging doesn't change since we are just updating the
-  // dependencies of a->node.
 }
 #else
 CUDACHECK(cudaEventRecord(a->event, b));
@@ -1055,8 +1055,8 @@ void* ncclProxyService(void* _args) {
   int asyncOpCount = 0;
   while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) {
     /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
-    if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500) < 0) {
-      WARN("[Proxy Service] Poll failed with error %d", error);
+    if (poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500) < 0) {
+      WARN("[Proxy Service] Poll failed: %s\n", strerror(errno));
       return NULL;
     }
     if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
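Note: the deleted condition had an operator-precedence bug: in `if (int error = poll(...) < 0)` the `<` binds before `=`, so error received the boolean comparison result (0 or 1), never the poll() return value or errno. A minimal reproduction:

    #include <cstdio>

    static int failingCall() { return -1; }  // stand-in for poll() failing

    int main() {
      // error is initialized to (failingCall() < 0), i.e. 1 -- not -1.
      if (int error = failingCall() < 0) {
        std::printf("error = %d\n", error);  // prints "error = 1"
      }
      return 0;
    }

Hence the fix: test poll() directly and report strerror(errno).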
@@ -42,7 +42,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
   TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
   struct ncclChannel* channel = &comm->channels[channelId];
-  uint32_t mask = 1 << channelId;
+  uint64_t mask = 1UL << channel->id;
   for (int i=0; i<nrecv; i++) {
     int peer = peerRecv[i];
     if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
@@ -77,15 +77,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
     int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
     int sendPeer = (comm->rank + i) % comm->nRanks;
-    uint32_t recvMask = comm->connectRecv[recvPeer];
-    uint32_t sendMask = comm->connectSend[sendPeer];
+    uint64_t recvMask = comm->connectRecv[recvPeer];
+    uint64_t sendMask = comm->connectSend[sendPeer];

     struct ncclConnect* recvData = data;
     int sendChannels = 0, recvChannels = 0;
     int type;
     TIME_START(0);
     for (int c=0; c<MAXCHANNELS; c++) {
-      if (recvMask & (1<<c)) {
+      if (recvMask & (1UL<<c)) {
         NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
         if (type > highestType) highestType = type;
       }
@@ -94,7 +94,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     TIME_START(1);
     struct ncclConnect* sendData = recvData+recvChannels;
     for (int c=0; c<MAXCHANNELS; c++) {
-      if (sendMask & (1<<c)) {
+      if (sendMask & (1UL<<c)) {
         NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
         if (type > highestType) highestType = type;
       }
@@ -119,7 +119,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*

     TIME_START(3);
     for (int c=0; c<MAXCHANNELS; c++) {
-      if (sendMask & (1<<c)) {
+      if (sendMask & (1UL<<c)) {
         struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
         NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
         conn->connected = 1;
@@ -129,7 +129,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     TIME_STOP(3);
     TIME_START(4);
     for (int c=0; c<MAXCHANNELS; c++) {
-      if (recvMask & (1<<c)) {
+      if (recvMask & (1UL<<c)) {
         struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
         NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
         conn->connected = 1;
@@ -137,7 +137,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
       }
     }
     TIME_STOP(4);
-    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
+    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
   }
   CUDACHECK(cudaStreamSynchronize(transportSetupStream));
   CUDACHECK(cudaStreamDestroy(transportSetupStream));
@@ -121,6 +121,7 @@ struct recvResources {
   int netDev;
   int useGdr;
   int useDmaBuf;
+  int needFlush;
   uint64_t* gdcSync;
   uint64_t* gdcFlush;
   void* gdrDesc;
@@ -139,6 +140,7 @@ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
 struct setupReq {
   int netDev;
   int useGdr;
+  int needFlush;
 };


@@ -151,6 +153,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  // Determine whether we need to flush the GDR buffer on recv or not
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));

   NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
@@ -392,6 +396,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc

   resources->netDev = req->netDev;
   resources->useGdr = req->useGdr;
+  resources->needFlush = req->needFlush;
   ncclNetProperties_t props;
   NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
   /* DMA-BUF support */
@@ -754,7 +759,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
           TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
           sub->received += args->sliceSteps;
           sub->requests[buffSlot] = NULL;
-          if (1 && reqFifo[group][buffSlot].size > 0 && resources->useGdr) {
+          if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) {
             // GDRCOPY support
             if (resources->gdcFlush) {
 #if defined (__x86_64__)
@@ -118,6 +118,7 @@ struct recvResources {
   int netDev;
   int useGdr;
   int useDmaBuf;
+  int needFlush;
   int maxRecvs;
   uint64_t* gdcSync;
   uint64_t* gdcFlush;
@@ -152,6 +153,7 @@ struct setupReq {
   int shared;
   int netDev;
   int useGdr;
+  int needFlush;
   int channelId;
   int connIndex;
 };
@@ -205,6 +207,9 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));

+  // Determine whether we need to flush the GDR buffer on recv or not
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
+
   // We don't support PXN on receive yet
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));

@@ -470,6 +475,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   resources->netDev = req->netDev;
   resources->shared = connection->shared = req->shared;
   resources->useGdr = req->useGdr;
+  resources->needFlush = req->needFlush;
   resources->channelId = req->channelId;
   resources->connIndex = req->connIndex;
   ncclNetProperties_t props;
@@ -1033,7 +1039,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
       for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
      NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
       if (done) {
-        int useGdr = 0;
+        int needFlush = 0;
         int totalSize = 0;
         for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
         for (int i=0; i<subGroup->groupSize; i++) {
@@ -1042,11 +1048,11 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
           for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
           if (step < sub->nsteps) {
             struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
-            if (resources->useGdr) useGdr = 1;
+            if (resources->useGdr) needFlush |= resources->needFlush;
           }
         }
         subGroup->requests[step%NCCL_STEPS] = NULL;
-        if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && useGdr) {
+        if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && needFlush) {
           // GDRCOPY support
           struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
           if (resources->gdcFlush) {