From 648e9fbb58813cda91e785201f5c631f1ed4ddaa Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Mon, 5 Dec 2016 18:06:24 -0800 Subject: [PATCH] Adding missing file --- src/common_coll.h | 118 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 src/common_coll.h diff --git a/src/common_coll.h b/src/common_coll.h new file mode 100644 index 0000000..63896d7 --- /dev/null +++ b/src/common_coll.h @@ -0,0 +1,118 @@ +/************************************************************************* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_COLL_H_ +#define COMMON_COLL_H_ + +#include "core.h" + +static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, pointer); + if (err != cudaSuccess || attr.devicePointer == NULL) { + WARN("%s : %s is not a valid pointer\n", opname, ptrname); + return ncclInvalidDevicePointer; + } + if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { + WARN("%s : %s allocated on device %d mismatchs with NCCL device %d \n", opname, ptrname, attr.device, comm->cudaDev); + return ncclInvalidDevicePointer; + } + return ncclSuccess; +} + +static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { + if (ptr == NULL) { + WARN("%s : %s argument is NULL", opname, ptrname); + return ncclInvalidArgument; + } + return ncclSuccess; +} + +static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, int count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) { + NCCLCHECK(PtrCheck(comm, opname, "comm")); + // First, the easy ones + if (root < 0 || root >= comm->nRanks) { + WARN("%s : invalid root %d (root should be in the 0..%d range)\n", opname, root, comm->nRanks); + return ncclInvalidRank; + } + if (type < 0 || type >= nccl_NUM_TYPES) { + WARN("%s : invalid type %d\n", opname, type); + return ncclInvalidType; + } + if (op < 0 || op >= nccl_NUM_OPS) { + WARN("%s : invalid reduction operation %d\n", opname, op); + return ncclInvalidOperation; + } + if (count < 0) { + WARN("%s : invalid count %d\n", opname, count); + return ncclInvalidArgument; + } + + // Check pointers + NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname)) + if (strcmp(opname, "Reduce") == 0 && comm->rank != root) { + // No need to check recvbuff pointer for non-root reduce + return ncclSuccess; + } + NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname)) + return ncclSuccess; +} + +// Kernel launch +template +struct KernelArgs { + // general parameters + int nRanks; + int root; + int buffSize; + int N; + int opIndex; + volatile int * __restrict__ opCounter; + int * __restrict__ doneCount; + bool pushrecv; + + // some pre-computed sizes + int SliceSize; + int SliceOffset; + int ChunkSize; + int NumChunks; + + // local and remote input, output, and buffer + const T * __restrict__ ThisInput; + T * __restrict__ ThisOutput; + + DevRing* ring; + int nRings; +}; + +template +void ArgsSetup(KernelArgs *args, const void* sendbuff, void* recvbuff, + const int root, const int count, ncclComm *comm) { + args->nRanks = comm->nRanks; + args->root = root; + args->buffSize = comm->buffSizePerRing; + args->N = count; + args->opIndex = comm->opSched; + args->opCounter = comm->opCounter; + args->doneCount = &comm->devMem->doneCount; + args->ThisInput = (const T*)sendbuff; + args->ThisOutput = (T*)recvbuff; + args->ring = comm->devRing; + args->pushrecv = comm->globalMemSpace; + args->nRings = comm->nRings; +} + +#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \ + args, stream) do { \ + dim3 grid(args.nRings, 1, 1); \ + dim3 block(THREADS+1, 1, 1); \ + void* argptrs[] = {&args}; \ + CUDACHECK(cudaLaunchKernel( \ + (void*)K, \ + grid, block, argptrs, 0, stream), ncclUnhandledCudaError); \ +} while (0) + +#endif