Optimize CUDA graph launch; avoid launching a CPU callback for intra-node operations.
Simplify kernel common code to improve the latency of send/recv operations.
Strengthen CUDA streams semantics.
Change NET API to v6, to add dmabuf support.
Add ncclGetLastError() function.
Add ncclRemoteError code and use it for remote network errors.
Support the use of a different NCCL_NET parameter per communicator.
Add support for SHM and P2P transfers using cudaMemcpy.
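The error-reporting additions above can be illustrated with a short, hedged sketch (not part of argcheck.cc below): a caller checks a collective's result and, on failure, retrieves the per-communicator message with ncclGetLastError(); ncclRemoteError is one of the ncclResult_t values such a call may now return. The helper name and the buffer, communicator, and stream variables are illustrative assumptions, set up elsewhere by the application.

#include <stdio.h>
#include <nccl.h>

// Usage sketch for the new error-reporting API (hypothetical helper, not in NCCL):
// run an allreduce and, on failure, print both the error code string and the
// last error message recorded for the communicator.
static void allreduceWithErrorReport(const float* sendbuff, float* recvbuff, size_t count,
                                     ncclComm_t comm, cudaStream_t stream) {
  ncclResult_t res = ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
  if (res != ncclSuccess) {
    // ncclRemoteError is among the codes a collective may now return when a
    // remote peer or the network reports a failure.
    const char* last = ncclGetLastError(comm);
    fprintf(stderr, "AllReduce failed: %s (last error: %s)\n",
            ncclGetErrorString(res), last ? last : "none");
  }
}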
/*************************************************************************
 * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "argcheck.h"
#include "comm.h"

// Check that 'pointer' is a valid CUDA pointer accessible from the device, and
// that device memory was allocated on the same device the communicator uses.
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
  if (err != cudaSuccess || attr.devicePointer == NULL) {
    WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer);
    return ncclInvalidArgument;
  }
#if CUDART_VERSION >= 10000
  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#else
  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#endif
    WARN("%s : %s allocated on device %d mismatches with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}

|
// Basic NULL check for host pointer arguments passed to the public API.
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
  if (ptr == NULL) {
    WARN("%s : %s argument is NULL", opname, ptrname);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}

|
// Validate the arguments of a collective or point-to-point call: root rank, datatype,
// reduction operation (including communicator-local user-defined ops), and, when
// pointer checking is enabled for the communicator, the CUDA device pointers.
ncclResult_t ArgsCheck(struct ncclInfo* info) {
  // First, the easy ones
  if (info->root < 0 || info->root >= info->comm->nRanks) {
    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks-1);
    return ncclInvalidArgument;
  }
  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
    WARN("%s : invalid type %d", info->opName, info->datatype);
    return ncclInvalidArgument;
  }
  // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
  NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));

  if (info->op < 0 || ncclMaxRedOp < info->op) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
    return ncclInvalidArgument;
  }
  int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps);
  if (ncclNumOps <= info->op &&
      (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) {
    WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op);
    return ncclInvalidArgument;
  }

  if (info->comm->checkPointers) {
    if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
      if (info->count > 0)
        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
    } else {
      // Check CUDA device pointers
      if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
      }
      if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
      }
    }
  }
  return ncclSuccess;
}