Minimize the performance impact of the device kernel profiling support when the profiler plugin is not loaded. Reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Fix the exchange of enhanced connection establishment (ECE) options to address potential slowdowns on networks utilizing RoCE. Test whether cuMem host allocations work and, if not, disable them. Such allocations, enabled by default since NCCL 2.24 when the CUDA driver version is at least 12.6, rely on NUMA support, which is not available by default under Docker; we recommend invoking Docker with "--cap-add SYS_NICE" to enable it. Fix an initialization error when running with NCCL_NET_GDR_C2C=1 on multiple MNNVL domains with non-uniform network configurations across nodes. Fix the printing of sub-seconds in the debug log when using a custom NCCL_DEBUG_TIMESTAMP_FORMAT setting.
292 lines
9.8 KiB
C++
292 lines
9.8 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "alloc.h"
|
|
#include "nccl.h"
|
|
#include "debug.h"
|
|
#include "param.h"
|
|
#include "cudawrap.h"
|
|
|
|
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage.
// Default -2 means auto-detect platform support (see ncclCuMemEnable()).
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
// NCCL_CUMEM_HOST_ENABLE toggles cuMem-based host memory allocations.
// Default -1 means decide based on the CUDA driver version (see ncclCuMemHostEnable()).
NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", -1);
// Handle type used for cuMemCreate()
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

// Cached result of ncclIsCuMemSupported(); populated during initOnceFunc().
static int ncclCuMemSupported = 0;
|
|
|
|
// Determine whether CUMEM & VMM RDMA is supported on this platform.
// Returns 1 when the CUDA driver is recent enough (>= 12.0), the cuMemCreate
// entry point was resolved, and the current device reports VMM support;
// returns 0 otherwise (including on any intermediate CUDA error).
int ncclIsCuMemSupported() {
#if CUDART_VERSION < 11030
  return 0;
#else
  CUdevice currentDev;
  int cudaDev;
  int cudaDriverVersion;
  int flag = 0;
  ncclResult_t ret = ncclSuccess;
  CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error);
  if (cudaDriverVersion < 12000) return 0; // Need CUDA_VISIBLE_DEVICES support
  CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error);
  if (CUPFN(cuMemCreate) == NULL) return 0;
  // Fixed: "&currentDev" had been corrupted to the mojibake "¤tDev"
  // ("&curren" interpreted as the HTML entity for '¤').
  CUCHECKGOTO(cuDeviceGet(&currentDev, cudaDev), ret, error);
  // Query device to see if CUMEM VMM support is available
  CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error);
  if (!flag) return 0;
error:
  // Reached on success as well: ret is still ncclSuccess there, so this returns 1.
  return (ret == ncclSuccess);
#endif
}
|
|
|
|
int ncclCuMemEnable() {
|
|
// NCCL_CUMEM_ENABLE=-2 means auto-detect CUMEM support
|
|
int param = ncclParamCuMemEnable();
|
|
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
|
|
}
|
|
|
|
// Cached result of ncclCuMemHostEnable(); -1 means not yet determined.
static int ncclCumemHostEnable = -1;

// Returns 1 if cuMem-based host memory allocations should be used.
// Resolution order: NCCL_CUMEM_HOST_ENABLE if set, otherwise enabled by
// default when the CUDA driver is >= 12.6. When enabled, performs a trial
// one-granule allocation to verify host allocations actually work (container
// runtimes can block the NUMA syscalls they rely on) and falls back to 0 if
// the trial fails. The decision is cached across calls.
int ncclCuMemHostEnable() {
  if (ncclCumemHostEnable != -1)
    return ncclCumemHostEnable;
#if CUDART_VERSION < 12020
  ncclCumemHostEnable = 0;
  return ncclCumemHostEnable;
#else
  ncclResult_t ret = ncclSuccess;
  int cudaDriverVersion;
  int paramValue = -1;
  CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error);
  if (cudaDriverVersion < 12020) {
    ncclCumemHostEnable = 0;
  }
  else {
    paramValue = ncclParamCuMemHostEnable();
    if (paramValue != -1)
      ncclCumemHostEnable = paramValue;
    else
      ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0;
    if (ncclCumemHostEnable) {
      // Verify that host allocations actually work. Docker in particular is known to disable "get_mempolicy",
      // causing such allocations to fail (this can be fixed by invoking Docker with "--cap-add SYS_NICE").
      int cudaDev;
      CUdevice currentDev;
      int cpuNumaNodeId = -1;
      CUmemAllocationProp prop = {};
      size_t granularity = 0;
      size_t size;
      CUmemGenericAllocationHandle handle;
      CUDACHECK(cudaGetDevice(&cudaDev));
      // Fixed: "&currentDev" had been corrupted to the mojibake "¤tDev"
      // ("&curren" interpreted as the HTML entity for '¤').
      CUCHECK(cuDeviceGet(&currentDev, cudaDev));
      CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
      // The attribute can report a negative id; clamp to NUMA node 0.
      if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
      prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
      prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
      prop.requestedHandleTypes = ncclCuMemHandleType;
      prop.location.id = cpuNumaNodeId;
      CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
      // Round the 1-byte trial allocation up to the minimum granularity.
      size = 1;
      ALIGN_SIZE(size, granularity);
      if (CUPFN(cuMemCreate(&handle, size, &prop, 0)) != CUDA_SUCCESS) {
        INFO(NCCL_INIT, "cuMem host allocations do not appear to be working; falling back to a /dev/shm/ based "
             "implementation. This could be due to the container runtime disabling NUMA support. "
             "To disable this warning, set NCCL_CUMEM_HOST_ENABLE=0");
        ncclCumemHostEnable = 0;
      } else {
        CUCHECK(cuMemRelease(handle));
      }
    }
  }
  return ncclCumemHostEnable;
error:
  return (ret == ncclSuccess);
#endif
}
|
|
|
|
// Declares a driver-API function pointer (pfn_<symbol>) that is resolved at
// runtime by cudaPfnFuncLoader() below; nullptr until successfully loaded.
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr

#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN(cuDeviceGet);
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
DECLARE_CUDA_PFN(cuGetErrorString);
DECLARE_CUDA_PFN(cuGetErrorName);
/* enqueue.cc */
DECLARE_CUDA_PFN(cuMemGetAddressRange);
DECLARE_CUDA_PFN(cuLaunchKernel);
#if CUDA_VERSION >= 11080
DECLARE_CUDA_PFN(cuLaunchKernelEx);
#endif
/* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate);
DECLARE_CUDA_PFN(cuCtxDestroy);
DECLARE_CUDA_PFN(cuCtxGetCurrent);
DECLARE_CUDA_PFN(cuCtxSetCurrent);
DECLARE_CUDA_PFN(cuCtxGetDevice);
/* cuMem API support */
DECLARE_CUDA_PFN(cuMemAddressReserve);
DECLARE_CUDA_PFN(cuMemAddressFree);
DECLARE_CUDA_PFN(cuMemCreate);
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity);
DECLARE_CUDA_PFN(cuMemExportToShareableHandle);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle);
DECLARE_CUDA_PFN(cuMemMap);
DECLARE_CUDA_PFN(cuMemRelease);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN(cuMemSetAccess);
DECLARE_CUDA_PFN(cuMemUnmap);
DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle);
/* ncclMemAlloc/Free */
DECLARE_CUDA_PFN(cuPointerGetAttribute);
#if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN(cuMulticastAddDevice);
DECLARE_CUDA_PFN(cuMulticastBindMem);
DECLARE_CUDA_PFN(cuMulticastBindAddr);
DECLARE_CUDA_PFN(cuMulticastCreate);
DECLARE_CUDA_PFN(cuMulticastGetGranularity);
DECLARE_CUDA_PFN(cuMulticastUnbind);
#endif
#endif

// Minimum CUDA driver version (11.3) required by the pfn loading scheme above.
#define CUDA_DRIVER_MIN_VERSION 11030

// NOTE(review): presumably a lazily-filled cache of cudaDriverGetVersion()
// (-1 = not queried yet); it is not written anywhere in this file — confirm
// against the rest of the project.
int ncclCudaDriverVersionCache = -1;
// Mirrors the CUDA_LAUNCH_BLOCKING environment variable; set in initOnceFunc().
bool ncclCudaLaunchBlocking = false;
|
|
|
|
#if CUDART_VERSION >= 11030

#if CUDART_VERSION >= 12000
// Resolve a driver symbol into pfn_<symbol> via the CUDA 12 entry-point API,
// which reports a distinct driverStatus out-parameter. 'ignore' selects
// whether a failed lookup is fatal (0: WARN and return ncclSystemError) or
// tolerated (1: leave the pfn as nullptr and continue). Expects a local
// cudaError_t 'res' in the enclosing scope.
#define LOAD_SYM(symbol, ignore) do { \
  cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
  res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
  if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
    if (!ignore) { \
      WARN("Retrieve %s failed with %d status %d", #symbol, res, driverStatus); \
      return ncclSystemError; } \
  } } while(0)
#else
// Pre-CUDA-12 variant: cudaGetDriverEntryPoint has no driverStatus out-parameter.
#define LOAD_SYM(symbol, ignore) do { \
  res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \
  if (res != cudaSuccess) { \
    if (!ignore) { \
      WARN("Retrieve %s failed with %d", #symbol, res); \
      return ncclSystemError; } \
  } } while(0)
#endif
|
|
|
|
/*
  Load the CUDA symbols
*/
// Resolves every driver-API entry point NCCL uses into its pfn_* pointer.
// Second LOAD_SYM argument: 0 = required (failure aborts with
// ncclSystemError), 1 = optional (failure leaves the pointer nullptr and the
// corresponding feature disabled at runtime).
static ncclResult_t cudaPfnFuncLoader(void) {

  cudaError_t res;

  // Error-reporting and device-query symbols are mandatory.
  LOAD_SYM(cuGetErrorString, 0);
  LOAD_SYM(cuGetErrorName, 0);
  LOAD_SYM(cuDeviceGet, 0);
  LOAD_SYM(cuDeviceGetAttribute, 0);
  LOAD_SYM(cuMemGetAddressRange, 1);
  LOAD_SYM(cuCtxCreate, 1);
  LOAD_SYM(cuCtxDestroy, 1);
  LOAD_SYM(cuCtxGetCurrent, 1);
  LOAD_SYM(cuCtxSetCurrent, 1);
  LOAD_SYM(cuCtxGetDevice, 1);
  LOAD_SYM(cuLaunchKernel, 1);
#if CUDA_VERSION >= 11080
  LOAD_SYM(cuLaunchKernelEx, 1);
#endif
  /* cuMem API support */
  LOAD_SYM(cuMemAddressReserve, 1);
  LOAD_SYM(cuMemAddressFree, 1);
  LOAD_SYM(cuMemCreate, 1);
  LOAD_SYM(cuMemGetAllocationGranularity, 1);
  LOAD_SYM(cuMemExportToShareableHandle, 1);
  LOAD_SYM(cuMemImportFromShareableHandle, 1);
  LOAD_SYM(cuMemMap, 1);
  LOAD_SYM(cuMemRelease, 1);
  LOAD_SYM(cuMemRetainAllocationHandle, 1);
  LOAD_SYM(cuMemSetAccess, 1);
  LOAD_SYM(cuMemUnmap, 1);
  LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1);
  /* ncclMemAlloc/Free */
  LOAD_SYM(cuPointerGetAttribute, 1);
#if CUDA_VERSION >= 11070
  LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
  /* NVSwitch Multicast support */
  LOAD_SYM(cuMulticastAddDevice, 1);
  LOAD_SYM(cuMulticastBindMem, 1);
  LOAD_SYM(cuMulticastBindAddr, 1);
  LOAD_SYM(cuMulticastCreate, 1);
  LOAD_SYM(cuMulticastGetGranularity, 1);
  LOAD_SYM(cuMulticastUnbind, 1);
#endif
  return ncclSuccess;
}
#endif
|
|
|
|
// Guarantees initOnceFunc() runs exactly once across all threads calling
// ncclCudaLibraryInit().
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
// Outcome of initOnceFunc(); returned to every caller of ncclCudaLibraryInit().
static ncclResult_t initResult;
|
|
|
|
// One-time CUDA library initialization: reads CUDA_LAUNCH_BLOCKING, verifies
// the driver version, loads the driver-API entry points, and probes cuMem
// support. Stores its outcome in initResult; run only via pthread_once.
static void initOnceFunc() {
  do {
    const char* val = ncclGetEnv("CUDA_LAUNCH_BLOCKING");
    // True for any non-empty value except the single character "0".
    ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
  } while (0);

  ncclResult_t ret = ncclSuccess;
  int cudaDev;
  int driverVersion;
  CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); // Initialize the driver

  CUDACHECKGOTO(cudaDriverGetVersion(&driverVersion), ret, error);
  INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion);

  if (driverVersion < CUDA_DRIVER_MIN_VERSION) {
    // WARN("CUDA Driver version found is %d. Minimum requirement is %d", driverVersion, CUDA_DRIVER_MIN_VERSION);
    // Silently ignore version check mismatch for backwards compatibility
    goto error;
  }

#if CUDART_VERSION >= 11030
  if (cudaPfnFuncLoader()) {
    WARN("CUDA some PFN functions not found in the library");
    goto error;
  }
#endif

  // Determine whether we support the cuMem APIs or not
  ncclCuMemSupported = ncclIsCuMemSupported();

  /* To use cuMem* for host memory allocation, we need to create context on each visible device.
   * This is a workaround needed in CUDA 12.2 and CUDA 12.3 which is fixed in 12.4. */
  if (ncclCuMemSupported && ncclCuMemHostEnable() && 12020 <= driverVersion && driverVersion <= 12030) {
    int deviceCnt, saveDevice;
    // Best-effort: return codes deliberately unchecked for this workaround loop.
    cudaGetDevice(&saveDevice);
    cudaGetDeviceCount(&deviceCnt);
    for (int i = 0; i < deviceCnt; ++i) {
      cudaSetDevice(i);
      // cudaFree(NULL) forces lazy context creation on the selected device.
      cudaFree(NULL);
    }
    cudaSetDevice(saveDevice);
  }
  initResult = ret;
  return;
error:
  initResult = ncclSystemError;
  return;
}
|
|
|
|
// Public entry point: performs the one-time CUDA library initialization
// (thread-safe via pthread_once) and returns the cached result, so repeated
// calls are cheap and consistent.
ncclResult_t ncclCudaLibraryInit() {
  pthread_once(&initOnceControl, initOnceFunc);
  return initResult;
}
|