Update ext-profiler example

Sync ext-profiler example with 2.26.2.
This commit is contained in:
Giuseppe Congiu 2025-04-09 09:02:40 -07:00 committed by Sylvain Jeaugey
parent f44ac759fe
commit 145e67e707
13 changed files with 621 additions and 78 deletions

View File

@ -49,9 +49,9 @@ of newer ones.
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v2)
# API (v3)
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.
```
typedef struct {
@ -70,7 +70,7 @@ typedef struct {
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
@ -82,13 +82,13 @@ typedef struct {
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v2_t;
} ncclProfiler_v3_t;
```
## Error codes
@ -156,7 +156,6 @@ typedef struct {
size_t count; // data count
int root; // root rank
const char* datatype; // string containing the name of the datatype
size_t trafficBytes; // number of transfer bytes
uint8_t nMaxChannels; // max number of channels for this collective
uint8_t nWarps; // number of GPU warps for this collective
const char* algo; // string containing name of the algorithm for this collective
@ -185,12 +184,22 @@ typedef struct {
struct { // proxyStep events metadata
int step; // individual step in `ncclProxyOp`
} proxyStep;
struct {
uint8_t channelId; // id of the channel used by the kernel
} kernelCh;
struct {
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
void* data; // pointer to network plugin defined event
} netPlugin;
};
} ncclProfilerEventDescr_v2_t;
} ncclProfilerEventDescr_v3_t;
```
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
`ncclProfileNetPlugin`.
#### stopEvent
@ -236,7 +245,7 @@ typedef enum {
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
} ncclProfilerEventState_v2_t;
} ncclProfilerEventState_v3_t;
```
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
network requests for the GPU kernel. This includes everything else that the proxy thread might be
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
processes work items for the enqueued NCCL operations.
`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
network defined event definition using the plugin id in the event descriptor. The plugin identifier
is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
unused and available for future extensions.
A network IB plugin can use this infrastructure to define a QP event as:
```C
#define NCCL_PROFILER_NET_IB_VER 1
enum {
ncclProfileQp = (1 << 0),
};
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct {
int device; // network device id
uint64_t wr_id; // work request id
int opcode; // ibv opcode
int qpNum; // QP number
size_t length; // work request data length
} qp;
};
} ncclProfilerNetIbDescr_v1_t;
```
The network event infrastructure is network agnostic. A different network socket plugin can
use it to define a socket event as:
```C
#define NCCL_PROFILER_NET_SOCKET_VER 1
enum {
ncclProfileSocket = (1 << 0),
};
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct {
int fd;
int op;
size_t length;
} sock;
};
} ncclProfilerNetSockDescr_v1_t;
```
The network plugin creates an event (descriptor) and passes it to the profiler callback,
along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
event descriptor, attaches the network plugin defined event as external data, and calls
the profiler `startEvent` function.
```C
ncclResult_t isend(..., void* phandle, ...) {
...
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
ncclProfilerNetIbDescr_v1_t eDescr = { };
eDescr.type = ncclProfileQp;
eDescr.qp = { ... };
ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
...
}
```
State transitions for the events described can also come with event attribute updates. For this
reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.
@ -264,7 +356,7 @@ typedef union {
struct { // attributes to update for ncclProfileProxyCtrl
int appendedProxyOps; // number of appended proxy ops thus far
} proxyCtrl;
} ncclProfilerEventStateArgs_v2_t;
} ncclProfilerEventStateArgs_v3_t;
```
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
@ -279,14 +371,22 @@ Group event
+- Collective event
| |
| +- ProxyOp event
| |
| +- ProxyStep event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
|
+- Point-to-point event
|
+- ProxyOp event
|
+- ProxyStep event
| |
| +- ProxyStep event
| |
| +- NetPlugin event
|
+- KernelCh event
ProxyCtrl event
```
@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p
dereference a pointer from another address space the event descriptor includes the PID of the originator.
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
parent event.
# Known Limitations
In intra-node communication, or whenever a rank has no network activity (so that proxy events are
unavailable), the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
collective. However, this time only represents the launch time of the collective and not the actual
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
the proxy is busy serving network requests, the kernel profiling probes can be delayed, causing loss of
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.

View File

@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so
default: $(PLUGIN_SO)
$(PLUGIN_SO): plugin.c event.c print_event.c
$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
clean:
rm -f $(PLUGIN_SO)

View File

@ -33,10 +33,42 @@
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
#define MAX_COMM_CLIQUES (32 * 8)
#define MAX_EVENTS_PER_REQ (8)
struct proxyOp;
struct proxyStep;
// Profiler-side record of a network plugin event. Instances live in the
// `net` array of a proxyStep and are filled in by the startEvent handler.
struct netPlugin {
uint8_t type; // event type tag; startEvent sets it to ncclProfileNetPlugin
int pluginType; // plugin type bits of the plugin id (NCCL_PROFILER_NET_TYPE_IB, ...)
int pluginVer; // plugin event version bits of the plugin id
uint8_t pluginEvent; // plugin-defined event kind (ncclProfileQp or ncclProfileSocket)
union {
struct { // IB QP attributes, copied from ncclProfilerNetIbDescr_v1_t
int device; // network device id
int qpNum; // QP number
int opcode; // ibv opcode
uint64_t wr_id; // work request id
size_t length; // work request data length
} qp;
struct { // socket attributes, copied from ncclProfilerNetSockDescr_v1_t
int fd;
int op;
size_t length;
} sock;
};
double startTs; // start timestamp, taken as gettime() - startTime
double stopTs; // stop timestamp, taken as gettime() - startTime
struct proxyStep* parent; // NOTE(review): presumably the owning proxyStep; the visible startEvent path never assigns it — confirm
};
// Per-channel kernel event. One slot per channel is embedded in the `kernel`
// array of a collective or p2p event.
struct kernelCh {
uint8_t type; // event type tag; startEvent sets it to ncclProfileKernelCh
uint8_t channelId; // channel id copied from the event descriptor
struct taskEventBase* parent; // parent task event (collective or p2p)
double startTs; // start timestamp, taken as gettime() - startTime
double stopTs; // stop timestamp, taken as gettime() - startTime
};
struct proxyStep {
uint8_t type; // type of event: network transfer
@ -46,6 +78,8 @@ struct proxyStep {
double startTs;
double stopTs;
struct proxyOp* parent;
struct netPlugin net[MAX_EVENTS_PER_REQ];
int nNetEvents;
};
struct proxyOp {
@ -101,7 +135,6 @@ struct collective {
void const* sendBuff;
void* recvBuff;
size_t count;
size_t trafficBytes;
int root;
const char* datatype;
uint8_t nMaxChannels;
@ -111,6 +144,7 @@ struct collective {
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
int nProxyOps[MAX_CHANNELS];
struct kernelCh kernel[MAX_CHANNELS];
};
struct p2p {
@ -121,6 +155,7 @@ struct p2p {
const char* datatype;
int peer;
struct proxyOp op[MAX_CHANNELS];
struct kernelCh kernel[MAX_CHANNELS];
};
struct group {

View File

@ -0,0 +1,34 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_IB_V1_H_
#define NET_IB_V1_H_
// Version of the IB plugin event descriptor defined in this header
#define NCCL_PROFILER_NET_IB_VER 1
// Event kinds an IB network plugin reports through the descriptor `type` field
enum {
ncclProfileQp = (1 << 0),
};
// IB network plugin event descriptor, version 1.
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct { // attributes read when type == ncclProfileQp
int device; // network device id
uint64_t wr_id; // work request id
int opcode; // ibv opcode
int qpNum; // QP number
size_t length; // work request data length
} qp;
};
} ncclProfilerNetIbDescr_v1_t;
#endif

View File

@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_SOCKET_V1_H_
#define NET_SOCKET_V1_H_
// Version of the socket plugin event descriptor defined in this header
#define NCCL_PROFILER_NET_SOCKET_VER 1
// Event kinds a socket network plugin reports through the descriptor `type` field
enum {
ncclProfileSocket = (1 << 0),
};
// Socket network plugin event descriptor, version 1.
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct { // attributes read when type == ncclProfileSocket
int fd; // presumably the socket file descriptor — TODO confirm with the socket plugin
int op; // plugin-defined operation code — TODO confirm semantics
size_t length; // presumably the data length of the operation — TODO confirm
} sock;
};
} ncclProfilerNetSockDescr_v1_t;
#endif

View File

@ -4,8 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#ifndef PROFILER_H_
#define PROFILER_H_
#include <stdint.h>
#include <stdlib.h>
@ -13,7 +13,54 @@
#include "common.h"
#include "err.h"
// Event types. Each type occupies its own bit so that several types can be
// combined in a single bitmask.
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileKernelCh = (1 << 6), // kernel channel event type
ncclProfileNetPlugin = (1 << 7), // network plugin-defined events
};
// Event state transitions passed to recordEventState. The same enum is shared
// by every profiler API version through the versioned typedefs below.
typedef enum {
ncclProfilerProxyOpSendPosted,
ncclProfilerProxyOpSendRemFifoWait,
ncclProfilerProxyOpSendTransmitted,
ncclProfilerProxyOpSendDone,
ncclProfilerProxyOpRecvPosted,
ncclProfilerProxyOpRecvReceived,
ncclProfilerProxyOpRecvTransmitted,
ncclProfilerProxyOpRecvDone,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait,
ncclProfilerProxyStepSendWait,
ncclProfilerProxyStepRecvWait,
ncclProfilerProxyStepRecvFlushWait,
ncclProfilerProxyStepRecvGPUWait,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle,
ncclProfilerProxyCtrlActive,
ncclProfilerProxyCtrlSleep,
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
} ncclProfilerEventState_t;
// Versioned names all resolve to the single enum above
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
#include "profiler_v3.h"
#include "profiler_v2.h"
#include "profiler_v1.h"
#include "profiler_net.h"
// Unversioned names resolve to the v3 definitions
typedef ncclProfiler_v3_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
#endif // end include guard

View File

@ -0,0 +1,22 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_NET_H_
#define PROFILER_NET_H_
// Plugin id layout: the 16 LSB carry the plugin event version, the next 16
// bits carry the plugin type. Both masks are built from an unsigned int
// (assumes a 32-bit unsigned int — confirm for the target platform).
#define NCCL_PROFILER_NET_VER_BITS (16)
// Selects the version bits of a plugin id
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
// Selects the type bits of a plugin id
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
// Network plugin types, already shifted into the type field of the plugin id
typedef enum {
NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS),
NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
} ncclProfilerNetType;
#include "net_ib_v1.h"
#include "net_socket_v1.h"
#endif

View File

@ -4,8 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_V1_H_
#define NCCL_PROFILER_V1_H_
#ifndef PROFILER_V1_H_
#define PROFILER_V1_H_
#include <stdint.h>
@ -59,8 +59,16 @@ typedef struct {
};
} ncclProfilerEventDescr_v1_t;
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
// Optional event attribute updates that accompany a state transition (v1 API)
typedef union {
struct { // NOTE(review): presumably attributes to update for ncclProfileProxyOp — confirm
size_t transSize;
int steps;
} proxyOp;
struct { // attributes to update for ncclProfileProxyCtrl
int appendedProxyOps; // number of appended proxy ops thus far
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
typedef struct {
const char* name;

View File

@ -4,20 +4,11 @@
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_V2_H_
#define NCCL_PROFILER_V2_H_
#ifndef PROFILER_V2_H_
#define PROFILER_V2_H_
#include <stdint.h>
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
};
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
@ -65,32 +56,6 @@ typedef struct {
};
} ncclProfilerEventDescr_v2_t;
typedef enum {
ncclProfilerProxyOpSendPosted,
ncclProfilerProxyOpSendRemFifoWait,
ncclProfilerProxyOpSendTransmitted,
ncclProfilerProxyOpSendDone,
ncclProfilerProxyOpRecvPosted,
ncclProfilerProxyOpRecvReceived,
ncclProfilerProxyOpRecvTransmitted,
ncclProfilerProxyOpRecvDone,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait,
ncclProfilerProxyStepSendWait,
ncclProfilerProxyStepRecvWait,
ncclProfilerProxyStepRecvFlushWait,
ncclProfilerProxyStepRecvGPUWait,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle,
ncclProfilerProxyCtrlActive,
ncclProfilerProxyCtrlSleep,
ncclProfilerProxyCtrlWakeup,
ncclProfilerProxyCtrlAppend,
ncclProfilerProxyCtrlAppendEnd,
} ncclProfilerEventState_v2_t;
typedef union {
struct {
size_t transSize;
@ -138,9 +103,4 @@ typedef struct {
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v2_t;
typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v2_t ncclProfiler_t;
#endif

View File

@ -0,0 +1,119 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V3_H_
#define PROFILER_V3_H_
#include <stdint.h>
// Event descriptor handed to startEvent (v3 API). `type` selects which member
// of the anonymous union carries the event metadata.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
struct { // collective events metadata
const char* name;
uint64_t commHash;
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count; // data count
int root; // root rank
const char* datatype; // string containing the name of the datatype
uint8_t nMaxChannels; // max number of channels for this collective
uint8_t nWarps; // number of GPU warps for this collective
const char* algo; // string containing name of the algorithm for this collective
const char* proto;
} coll;
struct { // point-to-point events metadata
const char* name;
uint64_t commHash;
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
} p2p;
struct { // proxyOp events metadata
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
struct { // proxyStep events metadata
int step; // individual step in `ncclProxyOp`
} proxyStep;
struct { // kernelCh events metadata
uint8_t channelId; // id of the channel used by the kernel
} kernelCh;
struct { // netPlugin events metadata
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
void* data; // pointer to network plugin defined event
} netPlugin;
};
} ncclProfilerEventDescr_v3_t;
// Optional event attribute updates that accompany a state transition (v3 API)
typedef union {
struct { // NOTE(review): presumably attributes to update for ncclProfileProxyOp — confirm
size_t transSize;
int steps;
} proxyOp;
struct { // attributes to update for ncclProfileProxyCtrl
int appendedProxyOps; // number of appended proxy ops thus far
} proxyCtrl;
} ncclProfilerEventStateArgs_v3_t;
// Profiler plugin interface (v3 API): the plugin name plus its entry points
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v3_t;
// Unversioned names resolve to the v3 definitions
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t;
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v3_t ncclProfiler_t;
#endif

View File

@ -58,6 +58,7 @@ __hidden double gettime(void) {
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pid_t pid;
static int* eActivationMaskPtr;
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
pthread_mutex_lock(&lock);
@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
// first thread initializes event mask, environment and detach pool
const char* str;
str = getenv("NCCL_PROFILE_EVENT_MASK");
__atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
}
pthread_mutex_unlock(&lock);
// store pointer to activation mask globally
eActivationMaskPtr = eActivationMask;
// pre-allocate memory for event object pools in dedicated profiler context
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
if (base->type == ncclProfileColl) {
struct collective* c = (struct collective *)base;
// reset event proxyOps & proxySteps
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
// release collective events in the group and return them to the collective pool
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
event->count = eDescr->coll.count;
event->root = eDescr->coll.root;
event->datatype = eDescr->coll.datatype;
event->trafficBytes = eDescr->coll.trafficBytes;
event->nMaxChannels = eDescr->coll.nMaxChannels;
event->nWarps = eDescr->coll.nWarps;
event->algo = eDescr->coll.algo;
@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "ProxyOpStart");
}
} else if (eDescr->type == ncclProfileProxyStep) {
} else if (eDescr->type == ncclProfileProxyStep) {
// the parent might be null if we run out of events
struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
event->isSend = parent->isSend;
event->parent = parent;
event->startTs = gettime() - startTime;
event->nNetEvents = 0;
*eHandle = event;
debugEvent(event, "ProxyStepStart");
} else if (eDescr->type == ncclProfileKernelCh) {
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
if (eventBase == NULL) return ncclSuccess;
if (eventBase->type == ncclProfileColl) {
struct collective* parent = (struct collective *)eDescr->parentObj;
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
event->type = ncclProfileKernelCh;
event->channelId = eDescr->kernelCh.channelId;
event->parent = eventBase;
event->startTs = gettime() - startTime;
*eHandle = event;
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "KernelChStart");
} else { // ncclProfileP2p
struct p2p* parent = (struct p2p *)eDescr->parentObj;
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
event->type = ncclProfileKernelCh;
event->channelId = eDescr->kernelCh.channelId;
event->parent = eventBase;
event->startTs = gettime() - startTime;
*eHandle = event;
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "KernelChStart");
}
} else if (eDescr->type == ncclProfileNetPlugin) {
struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
int64_t pluginId = eDescr->netPlugin.id;
int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
if (type == NCCL_PROFILER_NET_TYPE_IB) {
if (ver == 1) {
ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
event->type = ncclProfileNetPlugin;
event->pluginType = type;
event->pluginVer = ver;
if (descr->type == ncclProfileQp) {
event->pluginEvent = ncclProfileQp;
event->qp.device = descr->qp.device;
event->qp.wr_id = descr->qp.wr_id;
event->qp.opcode = descr->qp.opcode;
event->qp.qpNum = descr->qp.qpNum;
event->qp.length = descr->qp.length;
}
event->startTs = gettime() - startTime;
*eHandle = event;
debugEvent(event, "NetPluginStart");
}
} else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
if (ver == 1) {
ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
event->type = ncclProfileNetPlugin;
event->pluginType = type;
event->pluginVer = ver;
if (descr->type == ncclProfileSocket) {
event->pluginEvent = ncclProfileSocket;
event->sock.fd = descr->sock.fd;
event->sock.op = descr->sock.op;
event->sock.length = descr->sock.length;
}
event->startTs = gettime() - startTime;
*eHandle = event;
debugEvent(event, "NetPluginStart");
}
}
}
return ncclSuccess;
}
@ -445,6 +515,15 @@ void updateEvent(void* handle) {
struct proxyCtrl* event = (struct proxyCtrl *)handle;
event->stopTs = gettime() - startTime;
debugEvent(event, "ProxyCtrlStop");
} else if (type == ncclProfileKernelCh) {
struct kernelCh* event = (struct kernelCh *)handle;
event->stopTs = gettime() - startTime;
updateEvent(event->parent);
debugEvent(event, "KernelChStop");
} else if (type == ncclProfileNetPlugin) {
struct netPlugin* event = (struct netPlugin *)handle;
event->stopTs = gettime() - startTime;
debugEvent(event, "NetPluginStop");
}
}
@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
return ncclSuccess;
}
ncclProfiler_t ncclProfiler_v2 = {
ncclProfiler_t ncclProfiler_v3 = {
"Example-profiler",
exampleProfilerInit,
exampleProfilerStartEvent,
@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = {
exampleProfilerRecordEventState,
exampleProfilerFinalize,
};
// Turn profiling on at runtime: write eActivationMask into the activation
// mask that was handed to the init function. Does nothing until the plugin
// has been initialized. Always returns ncclSuccess.
int exampleProfilerStart(int eActivationMask) {
  const int ready = __atomic_load_n(&initialized, __ATOMIC_RELAXED);
  if (!ready) return ncclSuccess;

  __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
  return ncclSuccess;
}
// Turn profiling off at runtime: clear the activation mask that was handed
// to the init function. Does nothing until the plugin has been initialized.
// Always returns ncclSuccess.
int exampleProfilerStop(void) {
  const int ready = __atomic_load_n(&initialized, __ATOMIC_RELAXED);
  if (!ready) return ncclSuccess;

  __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
  return ncclSuccess;
}

View File

@ -0,0 +1,13 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PLUGIN_H_
#define PLUGIN_H_
int exampleProfilerStart(int eActivationMask);
int exampleProfilerStop(void);
#endif

View File

@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
}
static __thread int proxyStepId;
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
if (event->isSend) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
} else {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
}
}
__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
if (event->isSend) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
} else {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
}
}
static __thread int kernelId;
// Write the begin ("b") trace record of a kernel channel event to fh.
// Slots whose type is not ncclProfileKernelCh are skipped.
__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
  if (event->type == ncclProfileKernelCh) {
    fprintf(fh,
            "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n",
            "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId);
  }
}
// Write the end ("e") trace record of a kernel channel event to fh.
// Slots whose type is not ncclProfileKernelCh are skipped.
__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
  if (event->type == ncclProfileKernelCh) {
    fprintf(fh,
            "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
            "KernelCh", kernelId, getpid(), 1, event->stopTs);
  }
}
static __thread int proxyCtrlId;
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
const char* str;
@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
str, proxyCtrlId++, getpid(), 1, event->stopTs);
}
// Per-thread values written into the "id" field of Qp and Sock records.
static __thread int ibQpId, sockId;

// Write a network plugin event to fh as a begin ("ph": "b") record followed
// by an end ("ph": "e") record.
// Input
//  - fh   : output stream the records are appended to
//  - event: network plugin event to print
// Only two combinations produce output, both requiring pluginVer == 1:
//  - pluginType NCCL_PROFILER_NET_TYPE_IB with pluginEvent ncclProfileQp
//  - pluginType NCCL_PROFILER_NET_TYPE_SOCK with pluginEvent ncclProfileSocket
// Any other event is ignored. The matching id counter is advanced after the
// end record has been written.
__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
  // Both plugin types are only handled for version 1.
  if (event->pluginVer != 1) return;

  if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) {
    const char* label = "Qp";
    if (event->pluginEvent != ncclProfileQp) return;

    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n",
            label, ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length);
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
            label, ibQpId++, getpid(), 1, event->stopTs);
    return;
  }

  if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) {
    const char* label = "Sock";
    if (event->pluginEvent != ncclProfileSocket) return;

    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n",
            label, sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length);
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
            label, sockId++, getpid(), 1, event->stopTs);
  }
}
//#define DEBUG_EVENTS
void debugEvent(void* eHandle, const char* tag) {
#ifdef DEBUG_EVENTS
@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) {
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
fprintf(fh, " parent = %p\n", event->base.parent);
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
for (int j = 0; j < MAX_OPS; j++) {
for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
}
fprintf(fh, " startTs = %f\n", event->base.startTs);
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
fprintf(fh, "}\n");
@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) {
fprintf(fh, " startTs = %f\n", event->startTs);
fprintf(fh, " stopTs = %f\n", event->stopTs);
fprintf(fh, "}\n");
} else if (type == ncclProfileKernelCh) {
struct kernelCh* event = (struct kernelCh *)eHandle;
fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag);
fprintf(fh, " parent = %p\n", event->parent);
fprintf(fh, " channel = %d\n", event->channelId);
} else if (type == ncclProfileNetPlugin) {
struct netPlugin* event = (struct netPlugin *)eHandle;
fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag);
fprintf(fh, " pluginType = %d\n", event->pluginType);
fprintf(fh, " pluginVer = %d\n", event->pluginVer);
fprintf(fh, " pluginEvent = %d\n", event->pluginEvent);
fprintf(fh, " startTs = %f\n", event->startTs);
fprintf(fh, " stopTs = %f\n", event->stopTs);
fprintf(fh, "}\n");
}
fclose(fh);
#endif
@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) {
struct collective* c = (struct collective *)handle;
printCollEventHeader(fh, c);
for (int i = 0; i < MAX_CHANNELS; i++) {
printKernelChEventHeader(fh, &c->kernel[i]);
for (int j = 0; j < c->nProxyOps[i]; j++) {
printEvent(fh, &c->send[i][j]);
printEvent(fh, &c->recv[i][j]);
}
printKernelChEventTrailer(fh, &c->kernel[i]);
}
printCollEventTrailer(fh, c);
} else if (type == ncclProfileP2p) {
struct p2p* p = (struct p2p *)handle;
printP2pEventHeader(fh, p);
for (int i = 0; i < MAX_CHANNELS; i++) {
printKernelChEventHeader(fh, &p->kernel[i]);
printEvent(fh, &p->op[i]);
printKernelChEventTrailer(fh, &p->kernel[i]);
}
printP2pEventTrailer(fh, p);
} else if (type == ncclProfileProxyOp) {
@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) {
printProxyOpEventTrailer(fh, p);
} else if (type == ncclProfileProxyStep) {
struct proxyStep* p = (struct proxyStep *)handle;
printProxyStepEvent(fh, p);
printProxyStepEventHeader(fh, p);
for (int q = 0; q < p->nNetEvents; q++) {
printNetPluginEvent(fh, &p->net[q]);
}
printProxyStepEventTrailer(fh, p);
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* p = (struct proxyCtrl *)handle;
printProxyCtrlEvent(fh, p);