Add support for CUDA 12.0, drop Kepler (sm_35). Support for H100 features. Make socket code more robust and protected. Solves #555. Improve performance on large CUDA graphs, reducing dependencies. Reduce inter-socket bandwidth on AMD CPUs to favor better paths. Various fixes to ncclCommAbort. Make service thread polling resistant to EINTR. Compile with profiling API by default. Extend NVTX instrumentation with call arguments.
34 lines
1.4 KiB
C++
34 lines
1.4 KiB
C++
/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#include "enqueue.h"
|
|
#include "collectives.h"
|
|
#include "nccl.h"
|
|
|
|
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
|
struct NvtxParamsReduce {
|
|
size_t bytes;
|
|
int root;
|
|
ncclRedOp_t op;
|
|
};
|
|
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
|
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
|
offsetof(NvtxParamsReduce, op)}
|
|
};
|
|
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
|
|
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
|
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
|
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
|
return ncclEnqueueCheck(&info);
|
|
}
|