Add profiling API
commit e3dbc6110e
parent 1d6715fe20
src/nccl.h | 27 changed lines (+26, -1)
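The change mirrors every public nccl* entry point with a pnccl* declaration of identical signature, in the spirit of MPI's PMPI profiling interface: a tool can interpose on the nccl* symbol and forward to the pnccl* name to reach the underlying implementation. The commented-out, not-yet-implemented collectives (ncclGather, ncclScatter, ncclAllToAll) receive matching commented-out pnccl* declarations, and the first hunk also corrects the license-file name from LICENCE.txt to LICENSE.txt.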
--- a/src/nccl.h
+++ b/src/nccl.h
@@ -1,7 +1,7 @@
 /*************************************************************************
  * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/

 #ifndef NCCL_H_
@@ -48,6 +48,7 @@ typedef enum { ncclSuccess = 0,
  * ncclCommInitAll. uniqueId will be created in such a way that it is
  * guaranteed to be unique accross the host. */
 ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);

 /* Creates a new communicator (multi process version).
  * rank must be between 0 and ndev-1 and unique within a communicator clique.
@@ -56,6 +57,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
  * ncclCommInitRank implicitly syncronizes with other ranks, so INIT OF EACH RANK MUST
  * BE CALLED IN A SEPARATE HOST THREADS to avoid deadlock. */
 ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);

 /* Creates a clique of communicators.
  * This is a convenience function to create a single-process communicator clique.
@@ -64,21 +66,27 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, i
  * If devlist is NULL, the first ndev CUDA devices are used.
  * Order of devlist defines user-order of processors within the communicator. */
 ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);

 /* Frees resources associated with communicator object. */
 void ncclCommDestroy(ncclComm_t comm);
+void pncclCommDestroy(ncclComm_t comm);

 /* Returns nice error message. */
 const char* ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);

 /* Sets count to number of devices in the communicator clique. */
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

 /* Returns cuda device number associated with communicator. */
 ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

 /* Returns user-ordered "rank" assocaiated with communicator. */
 ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);

 /* Reduction opperation selector */
 typedef enum { ncclSum = 0,
@@ -107,6 +115,8 @@ typedef enum { ncclChar = 0,
  */
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

 /* Reduces data arrays of length count in sendbuff using op operation, and leaves
  * identical copies of result on each GPUs recvbuff.
@@ -114,6 +124,8 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclData
  * Must be called separately for each communicator in communicator clique. */
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);

 /* Reduces data in sendbuff using op operation and leaves reduced result scattered
  * over the devices so that recvbuff on the i-th GPU will contain the i-th block of
@@ -124,6 +136,9 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
     int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
     cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);

 /* Copies count values from root to all other devices.
  * Root specifies the source device in user-order
@@ -131,6 +146,8 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
  * Must be called separately for each communicator in communicator clique. */
 ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);


 /* Each device gathers count values from other GPUs.
@@ -141,6 +158,8 @@ ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
  * Must be called separately for each communicator in communicator clique. */
 ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
     void* recvbuff, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
+    void* recvbuff, ncclComm_t comm, cudaStream_t stream);


 /* The following collective operations are not implemented yet */
@@ -152,6 +171,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * All GPUs, including root, perform copies into recvbuff.
 //ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
 //    void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
+//    void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);

 ///* Root device scatters count values to each devices.
 // * sendbuff may be NULL on all devices except a single root
@@ -161,6 +182,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * Called separately for each device in the ncclComm. */
 //ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
 //    int count, int root, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
+//    int count, int root, ncclComm_t comm, cudaStream_t stream);
 //
 ///* All GPUs scatter blocks of count elements to other devices.
 // * Must be called separately for each device in the ncclComm.
@@ -169,6 +192,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * Called separately for each device in the ncclComm. */
 //ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
 //    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
+//    void* recvbuff, ncclComm_t comm, cudaStream_t stream);

 #ifdef __cplusplus
 } // end extern "C"
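
As an illustration of how a profiling layer would consume these declarations, here is a minimal sketch of an interposing wrapper for ncclAllReduce. It assumes the usual PMPI-style arrangement in which the pnccl* symbol resolves to the real implementation; the wrapper, its timing scheme, and the log format are illustrative and not part of this commit.

#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>
#include "nccl.h"

/* Hypothetical profiling shim: intercepts the public symbol and forwards
 * to the pnccl* entry point added by this commit. Note the timer brackets
 * only the host-side enqueue; the collective itself runs asynchronously
 * on the given stream. */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
  struct timespec t0, t1;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  ncclResult_t ret = pncclAllReduce(sendbuff, recvbuff, count,
      datatype, op, comm, stream);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L + (t1.tv_nsec - t0.tv_nsec);
  fprintf(stderr, "ncclAllReduce: count=%d, enqueue took %ld ns (result=%s)\n",
      count, ns, ncclGetErrorString(ret));
  return ret;
}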