/*************************************************************************
 * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#ifndef CORE_H_
|
|
#define CORE_H_
|
|
|
|
|
|
#include "nccl.h"

#include <cstdio>
#include <cstdlib> // abort(), used by the WARN macro below

#include <cuda_runtime.h>
|
|
|
|
#define MAXRANKS 32
|
|
#define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21)
|
|
#define NCCL_MEM_PAD_ALIGN 65536
|
|
|
|
|
|
// One shared communication block: a fixed-size control header followed by a
// data buffer. The anonymous union pads the control fields out to
// NCCL_MEM_PAD_ALIGN bytes, so `buff` always sits at the same fixed offset
// (NCCL_MEM_PAD_ALIGN) regardless of how many control fields precede it.
struct ncclMem {
  union { // Pad this block so that devBuff is correctly aligned
    struct {
      int flags[2];     // Handshake flag words; NOTE(review): the role of
                        // each slot is defined by the code using this struct,
                        // not visible in this header.
      void* recvPtrs;   // Slot through which a receive-buffer pointer is
                        // exchanged (presumably written by the peer — see
                        // the users of this field).
      int opCounter;    // Used to determine when remote Communicators are ready.
                        // Only used in host memory.
    };
    char pad[NCCL_MEM_PAD_ALIGN]; // Forces the header to exactly
                                  // NCCL_MEM_PAD_ALIGN bytes.
  };
  // devBuff will be bigger ; we only use its offset/address.
  char buff[1];   // Flexible-array-style tail: the real allocation extends
                  // past sizeof(ncclMem); buff[0] marks where payload starts.
};
|
|
|
|
// Device-side view of this rank's position in the ring, templated on the
// element type T. alignas(long long) guarantees 8-byte alignment of the
// whole struct. All pointer members are `volatile`-qualified because the
// words they reference are written by other devices/processes outside this
// thread's view; `__restrict__` promises the compiler they do not alias.
// Field roles below are inferred from the names — the producing/consuming
// code is not visible in this header.
template <typename T>
struct alignas(long long) DevRing {
  // Synchronization words shared with the previous / next rank in the ring.
  volatile int* __restrict__ prevOpCounter;    // op counter owned by prev rank
  volatile int* __restrict__ nextOpCounter;    // op counter owned by next rank
  volatile int* __restrict__ sendFlagToNext;   // flag we raise toward next
  volatile int* __restrict__ sendFlagToPrev;   // flag we raise toward prev
  volatile int* __restrict__ recvFlagFromNext; // flag next raises toward us
  volatile int* __restrict__ recvFlagFromPrev; // flag prev raises toward us

  // Pointer slots through which buffer addresses are exchanged with the
  // neighbours; here the pointed-to T* is itself the volatile datum.
  T* volatile * __restrict__ recvPtrFromNext;
  T* volatile * __restrict__ sendPtrToPrev;
  T* __restrict__ recvBuffer;   // receive-side staging buffer
  T* __restrict__ sendBuffer;   // send-side staging buffer

  int userRank[MAXRANKS];   // Ring position -> user-visible rank mapping
                            // (compare ncclComm's ring/rank tables).
};
|
|
|
|
// Per-peer connection record: the ncclMem blocks used to exchange data and
// flags with one other rank (possibly ourselves).
struct NodeRef {
  ncclMem* remote;  // TODO: Verify if these
  ncclMem* local;   // are still needed.
  enum {DEVICE, HOST} type;   // Memory path used for this link
                              // (NOTE(review): inferred from the name —
                              // confirm against the connection setup code).
  ncclMem* devCleanup;  // Used only when remote comm uses same process & GPU
  ncclMem* hostCleanup; // Used whenever target is in different process
  int* opCounter;   // TODO: see if this can be removed too.
};
|
|
|
|
|
|
// Communicator state for one rank (one GPU). The struct is allocated with
// extra space past the end so that the trailing `ptrs` array can hold one
// NodeRef per peer (see the comment on `ptrs`).
struct ncclComm {
  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
  int cudaDev; // my cuda device index

  // Device and Host allocated chunks. Stored here to correctly free() memory.
  ncclMem* devMem;
  ncclMem* hostMem;
  int hostMemState;  // Allocation/registration state of hostMem
                     // (NOTE(review): state values not visible in this header).
  int opSched;       // Scheduling operation index
  int* opCounter;    // Counter of completed operations

  cudaStream_t prevStream; // cache last used stream
  cudaEvent_t doneEvent;   // orders operations in different streams

  // Maps an internal nccl index to user-specified rank order. This is necessary
  // since we need to know how the user expects data to be ordered across
  // devices. Ordered from current device.
  int* userFromRing;

  // copy of the above stored on each device
  int* devUserFromRing;

  // Ring order
  int* ncclFromRing; // TODO: REMOVE IF NOT NEEDED BEYOND CORE.CU

  // Size of temp buffer in bytes.
  size_t buffSize;

  // Whether we have remote access to the recvbuff pointers passed from remote
  // GPUs. In single process mode this can be used as long as QPI links are
  // not present. In multi-process, we never push to a remote recvbuff.
  int globalMemSpace;

  // Device copy of the communicator
  struct ncclComm *devComm; // TODO: Remove this if not useful

  // Device-side ring view
  DevRing<char>* devRing;

  // Device-to-device communication structures to access remote or local device
  // memory. Actual allocation larger than 1.
  NodeRef ptrs[1];
};
|
|
|
|
|
|
// Debug verbosity levels, in increasing order of output. The WARN/INFO
// macros below compare ncclDebugLevel against these with >=, so each level
// also enables everything below it; at ABORT, WARN additionally terminates
// the process via abort().
typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;

// Global debug level shared by the whole library (defined and initialized
// elsewhere — not visible in this header).
extern DebugLevel ncclDebugLevel;
|
|
|
|
// Print a printf-style warning with file/line to stdout when the debug
// level is at least WARN; at ABORT level, terminate the process after
// printing. `WARN` inside the expansion refers to the DebugLevel
// enumerator: a function-like macro is not expanded when its name appears
// without an argument list, so there is no recursion here. Comments are
// kept outside the #define because a // comment would swallow the
// backslash line continuation.
#define WARN(...) do {                                       \
  if (ncclDebugLevel >= WARN) {                              \
    printf("WARN %s:%d ", __FILE__, __LINE__);               \
    printf(__VA_ARGS__);                                     \
    printf("\n");                                            \
    fflush(stdout);                                          \
    if (ncclDebugLevel >= ABORT) abort();                    \
  }                                                          \
} while(0)
|
|
|
|
// Print a printf-style informational message to stdout when the debug
// level is at least INFO. As with WARN above, `INFO` in the expansion
// refers to the DebugLevel enumerator, not to this macro.
#define INFO(...) do {                                       \
  if (ncclDebugLevel >= INFO) {                              \
    printf("INFO "); printf(__VA_ARGS__); printf("\n");      \
    fflush(stdout);                                          \
  }                                                          \
} while(0)
|
|
|
|
// Check CUDA calls: evaluates `cmd` once; on failure, logs the CUDA error
// string via WARN and returns `retcode` from the enclosing function.
// The do/while wrapper makes the macro behave as a single statement.
// Fix: the format string previously ended in "\n", but WARN already emits
// a trailing printf("\n"), which produced a spurious blank line per error.
#define CUDACHECK(cmd, retcode) do {                         \
    cudaError_t e = cmd;                                     \
    if( e != cudaSuccess ) {                                 \
        WARN("Cuda failure '%s'", cudaGetErrorString(e));    \
        return retcode;                                      \
    }                                                        \
} while(false)
|
|
|
|
// Propagate errors up: evaluates `call` (an expression yielding
// ncclResult_t) once; on failure, returns the error code from the
// enclosing function.
// Fix: removed the stray semicolon after `while (0)`. With the semicolon,
// `NCCLCHECK(x);` expanded to TWO statements, defeating the do/while(0)
// idiom and breaking composition with if/else:
//   if (cond) NCCLCHECK(foo()); else bar();  // was a syntax error
#define NCCLCHECK(call) do { \
  ncclResult_t res = call;   \
  if (res != ncclSuccess) {  \
    return res;              \
  }                          \
} while (0)
|
|
|
|
#ifdef PROFAPI
// Declares an exported API entry point `func` with C linkage and default
// ELF visibility. With PROFAPI defined, two symbols are emitted:
//   - `p<func>`: a strong alias bound to `func`'s definition, so tools can
//     always reach the real implementation;
//   - `func` itself: declared weak, so a profiling/interposition library
//     can override it at link or load time.
// Uses the GNU named-variadic-macro extension (`args...`) and GCC
// __attribute__ syntax; requires a GNU-compatible compiler.
#define NCCL_API(ret, func, args...)        \
    __attribute__ ((visibility("default"))) \
    __attribute__ ((alias(#func)))          \
    ret p##func (args);                     \
    extern "C"                              \
    __attribute__ ((visibility("default"))) \
    __attribute__ ((weak))                  \
    ret func(args)
#else
// Without PROFAPI: a plain exported entry point with C linkage and default
// ELF visibility (no profiling alias).
#define NCCL_API(ret, func, args...)        \
    extern "C"                              \
    __attribute__ ((visibility("default"))) \
    ret func(args)
#endif // end PROFAPI
|
|
|
|
|
|
#endif // end include guard
|
|
|