Add profiling API

2016-09-22 11:56:51 -07:00 · 2016-09-22 11:56:51 -07:00 · e3dbc6110e
commit e3dbc6110e
parent 1d6715fe20
1 changed files with 44 additions and 19 deletions
--- a/src/nccl.h
+++ b/src/nccl.h
@ -1,7 +1,7 @@
 /*************************************************************************
 * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
 ************************************************************************/
 #ifndef NCCL_H_
@ -48,6 +48,7 @@ typedef enum { ncclSuccess                 =  0,
 * ncclCommInitAll. uniqueId will be created in such a way that it is
 * guaranteed to be unique across the host. */
 ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
 /* pncclGetUniqueId: p-prefixed twin of ncclGetUniqueId with an identical
 * signature (ncclUniqueId* in, ncclResult_t out). Declared as part of the
 * profiling API; presumably the entry point a profiling layer forwards to -
 * TODO confirm against the implementation. */
 ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
 /* Creates a new communicator (multi process version).
 * rank must be between 0 and ndev-1 and unique within a communicator clique.
@ -56,6 +57,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
 * ncclCommInitRank implicitly synchronizes with other ranks, so INIT OF EACH RANK MUST
 * BE CALLED IN A SEPARATE HOST THREAD to avoid deadlock. */
 ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
 /* pncclCommInitRank: p-prefixed twin of ncclCommInitRank; takes the same
 * (comm, ndev, commId, rank) arguments and returns ncclResult_t. Part of the
 * profiling API; presumably an interposition target - TODO confirm. */
 ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
 /* Creates a clique of communicators.
 * This is a convenience function to create a single-process communicator clique.
@ -64,21 +66,27 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, i
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
 ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 /* pncclCommInitAll: p-prefixed twin of ncclCommInitAll; same
 * (comm, ndev, devlist) arguments and ncclResult_t return. Part of the
 * profiling API; presumably an interposition target - TODO confirm. */
 ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 /* Frees resources associated with communicator object. */
 void  ncclCommDestroy(ncclComm_t comm);
 /* pncclCommDestroy: p-prefixed twin of ncclCommDestroy. Like the primary
 * entry point it returns void, so no status is reported to the caller.
 * Part of the profiling API; presumably an interposition target - TODO confirm. */
 void pncclCommDestroy(ncclComm_t comm);
 /* Returns nice error message. */
 const char*  ncclGetErrorString(ncclResult_t result);
 /* pncclGetErrorString: p-prefixed twin of ncclGetErrorString; takes an
 * ncclResult_t and returns a const char*. Part of the profiling API;
 * presumably an interposition target - TODO confirm. */
 const char* pncclGetErrorString(ncclResult_t result);
 /* Sets count to number of devices in the communicator clique. */
 ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
 /* pncclCommCount: p-prefixed twin of ncclCommCount; the value is delivered
 * through the count out-pointer and the return is an ncclResult_t status.
 * Part of the profiling API; presumably an interposition target - TODO confirm. */
 ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
 /* Returns cuda device number associated with communicator. */
 ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
 /* pncclCommCuDevice: p-prefixed twin of ncclCommCuDevice; the device number
 * is delivered through the device out-pointer and the return is an
 * ncclResult_t status. Part of the profiling API; presumably an
 * interposition target - TODO confirm. */
 ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
 /* Returns user-ordered "rank" associated with communicator. */
 ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
 /* pncclCommUserRank: p-prefixed twin of ncclCommUserRank; the rank is
 * delivered through the rank out-pointer and the return is an ncclResult_t
 * status. Part of the profiling API; presumably an interposition target -
 * TODO confirm. */
 ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
 /* Reduction operation selector */
 typedef enum { ncclSum        = 0,
@ -107,6 +115,8 @@ typedef enum { ncclChar       = 0,
 */
 /* NOTE(review): the output buffer parameter is spelled "recvbuf" in both
 * declarations below, while the other collectives in this header spell it
 * "recvbuff". Harmless in a C prototype, but worth unifying. */
 ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 /* pncclReduce: p-prefixed twin of ncclReduce with an identical parameter
 * list and ncclResult_t return. Part of the profiling API; presumably an
 * interposition target - TODO confirm. */
 ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 /* Reduces data arrays of length count in sendbuff using op operation, and leaves
 * identical copies of result on each GPU's recvbuff.
@ -114,6 +124,8 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclData
 * Must be called separately for each communicator in communicator clique. */
 ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
 /* pncclAllReduce: p-prefixed twin of ncclAllReduce with an identical
 * parameter list and ncclResult_t return. Part of the profiling API;
 * presumably an interposition target - TODO confirm. */
 ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
 /* Reduces data in sendbuff using op operation and leaves reduced result scattered
 * over the devices so that recvbuff on the i-th GPU will contain the i-th block of
@ -124,6 +136,9 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
 ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
    int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
 /* pncclReduceScatter: p-prefixed twin of ncclReduceScatter with an identical
 * parameter list (note the size argument is named recvcount) and
 * ncclResult_t return. Part of the profiling API; presumably an
 * interposition target - TODO confirm. */
 ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
 /* Copies count values from root to all other devices.
 * Root specifies the source device in user-order
@ -131,6 +146,8 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
 * Must be called separately for each communicator in communicator clique. */
 ncclResult_t  ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
 /* pncclBcast: p-prefixed twin of ncclBcast; same single non-const buff
 * argument plus (count, datatype, root, comm, stream), returning
 * ncclResult_t. Part of the profiling API; presumably an interposition
 * target - TODO confirm. */
 ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
 /* Each device gathers count values from other GPUs.
@ -141,6 +158,8 @@ ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
 * Must be called separately for each communicator in communicator clique. */
 ncclResult_t  ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
 /* pncclAllGather: p-prefixed twin of ncclAllGather with an identical
 * parameter list (recvbuff follows datatype here, unlike the reduce-style
 * calls) and ncclResult_t return. Part of the profiling API; presumably an
 * interposition target - TODO confirm. */
 ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
 /* The following collective operations are not implemented yet */
@ -152,6 +171,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * All GPUs, including root, perform copies into recvbuff.
 //ncclResult_t  ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
 //    void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
 //ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
 //                        void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
 ///* Root device scatters count values to each device.
 // * sendbuff may be NULL on all devices except a single root
@ -161,6 +182,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * Called separately for each device in the ncclComm. */
 //ncclResult_t  ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
 //    int count, int root, ncclComm_t comm, cudaStream_t stream);
 //ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
 //    int count, int root, ncclComm_t comm, cudaStream_t stream);
 //
 ///* All GPUs scatter blocks of count elements to other devices.
 // * Must be called separately for each device in the ncclComm.
@ -169,6 +192,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * Called separately for each device in the ncclComm. */
 //ncclResult_t  ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
 //    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
 //ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
 //    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
 #ifdef __cplusplus
 } // end extern "C"