Add profiling API

This commit is contained in:
Sylvain Jeaugey 2016-09-22 11:56:51 -07:00
parent 1d6715fe20
commit e3dbc6110e

View File

@ -1,7 +1,7 @@
/************************************************************************* /*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* *
* See LICENCE.txt for license information * See LICENSE.txt for license information
************************************************************************/ ************************************************************************/
#ifndef NCCL_H_ #ifndef NCCL_H_
@ -47,7 +47,8 @@ typedef enum { ncclSuccess = 0,
/* Generates a unique Id with each call. Used to generate commId for /* Generates a unique Id with each call. Used to generate commId for
* ncclCommInitAll. uniqueId will be created in such a way that it is * ncclCommInitAll. uniqueId will be created in such a way that it is
* guaranteed to be unique across the host. */ * guaranteed to be unique across the host. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/* Creates a new communicator (multi process version). /* Creates a new communicator (multi process version).
* rank must be between 0 and ndev-1 and unique within a communicator clique. * rank must be between 0 and ndev-1 and unique within a communicator clique.
@ -55,7 +56,8 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
* The communicator is created on the current CUDA device. * The communicator is created on the current CUDA device.
* ncclCommInitRank implicitly synchronizes with other ranks, so INIT OF EACH RANK MUST * ncclCommInitRank implicitly synchronizes with other ranks, so INIT OF EACH RANK MUST
* BE CALLED IN A SEPARATE HOST THREAD to avoid deadlock. */ * BE CALLED IN A SEPARATE HOST THREAD to avoid deadlock. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank); ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
/* Creates a clique of communicators. /* Creates a clique of communicators.
* This is a convenience function to create a single-process communicator clique. * This is a convenience function to create a single-process communicator clique.
@ -63,22 +65,28 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, i
* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
* If devlist is NULL, the first ndev CUDA devices are used. * If devlist is NULL, the first ndev CUDA devices are used.
* Order of devlist defines user-order of processors within the communicator. */ * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/* Frees resources associated with communicator object. */ /* Frees resources associated with communicator object. */
void ncclCommDestroy(ncclComm_t comm); void ncclCommDestroy(ncclComm_t comm);
void pncclCommDestroy(ncclComm_t comm);
/* Returns nice error message. */ /* Returns nice error message. */
const char* ncclGetErrorString(ncclResult_t result); const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
/* Sets count to number of devices in the communicator clique. */ /* Sets count to number of devices in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/* Returns cuda device number associated with communicator. */ /* Returns cuda device number associated with communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/* Returns user-ordered "rank" associated with communicator. */ /* Returns user-ordered "rank" associated with communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/* Reduction operation selector */ /* Reduction operation selector */
typedef enum { ncclSum = 0, typedef enum { ncclSum = 0,
@ -105,14 +113,18 @@ typedef enum { ncclChar = 0,
* the same device. * the same device.
* Must be called separately for each communicator in communicator clique. * Must be called separately for each communicator in communicator clique.
*/ */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
/* Reduces data arrays of length count in sendbuff using op operation, and leaves /* Reduces data arrays of length count in sendbuff using op operation, and leaves
* identical copies of result on each GPU's recvbuff. * identical copies of result on each GPU's recvbuff.
* Sendbuff and recvbuff are assumed to reside on the same device. * Sendbuff and recvbuff are assumed to reside on the same device.
* Must be called separately for each communicator in communicator clique. */ * Must be called separately for each communicator in communicator clique. */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count, ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/* Reduces data in sendbuff using op operation and leaves reduced result scattered /* Reduces data in sendbuff using op operation and leaves reduced result scattered
@ -121,7 +133,10 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
* sendbuff has size at least ndev*recvcount elements, where ndev is number of * sendbuff has size at least ndev*recvcount elements, where ndev is number of
* communicators in communicator clique * communicators in communicator clique
* Must be called separately for each communicator in communicator clique.*/ * Must be called separately for each communicator in communicator clique.*/
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream); cudaStream_t stream);
@ -129,7 +144,9 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
* Root specifies the source device in user-order * Root specifies the source device in user-order
* (see ncclCommInit). * (see ncclCommInit).
* Must be called separately for each communicator in communicator clique. */ * Must be called separately for each communicator in communicator clique. */
ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root, ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream); ncclComm_t comm, cudaStream_t stream);
@ -139,7 +156,9 @@ ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
* in communicator clique. * in communicator clique.
* Sendbuff and recvbuff are assumed to reside on same device. * Sendbuff and recvbuff are assumed to reside on same device.
* Must be called separately for each communicator in communicator clique. */ * Must be called separately for each communicator in communicator clique. */
ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
void* recvbuff, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
void* recvbuff, ncclComm_t comm, cudaStream_t stream); void* recvbuff, ncclComm_t comm, cudaStream_t stream);
@ -150,7 +169,9 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
// * On the root device, sendbuff and recvbuff are assumed to reside on the same device. // * On the root device, sendbuff and recvbuff are assumed to reside on the same device.
// * Must be called separately for each communicator in communicator clique. */ // * Must be called separately for each communicator in communicator clique. */
// * All GPUs, including root, perform copies into recvbuff. // * All GPUs, including root, perform copies into recvbuff.
//ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype, //ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream); // void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
///* Root device scatters count values to each device. ///* Root device scatters count values to each device.
@ -159,16 +180,20 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
// * recvbuff allocated on each gpu, including root, size=count. // * recvbuff allocated on each gpu, including root, size=count.
// * Result is ordered by comm's logical device order. // * Result is ordered by comm's logical device order.
// * Called separately for each device in the ncclComm. */ // * Called separately for each device in the ncclComm. */
//ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff, //ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
// int count, int root, ncclComm_t comm, cudaStream_t stream); // int count, int root, ncclComm_t comm, cudaStream_t stream);
//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
// int count, int root, ncclComm_t comm, cudaStream_t stream);
// //
///* All GPUs scatter blocks of count elements to other devices. ///* All GPUs scatter blocks of count elements to other devices.
// * Must be called separately for each device in the ncclComm. // * Must be called separately for each device in the ncclComm.
// * sendbuff and recvbuff assumed to reside on same device and // * sendbuff and recvbuff assumed to reside on same device and
// * have size at least nGPUs*count. // * have size at least nGPUs*count.
// * Called separately for each device in the ncclComm. */ // * Called separately for each device in the ncclComm. */
//ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype, //ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
// void* recvbuff, ncclComm_t comm, cudaStream_t stream); // void* recvbuff, ncclComm_t comm, cudaStream_t stream);
//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
// void* recvbuff, ncclComm_t comm, cudaStream_t stream);
#ifdef __cplusplus #ifdef __cplusplus
} // end extern "C" } // end extern "C"