Update ext-profiler example
Sync ext-profiler example with 2.26.2.
This commit is contained in:
parent
f44ac759fe
commit
145e67e707
@ -49,9 +49,9 @@ of newer ones.
|
|||||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||||
from old API versions. It also provides error codes in `err.h`.
|
from old API versions. It also provides error codes in `err.h`.
|
||||||
|
|
||||||
# API (v2)
|
# API (v3)
|
||||||
|
|
||||||
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
|
Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.
|
||||||
|
|
||||||
```
|
```
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@ -70,7 +70,7 @@ typedef struct {
|
|||||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||||
// Output
|
// Output
|
||||||
// - eHandle: return event handle for supplied event descriptor object
|
// - eHandle: return event handle for supplied event descriptor object
|
||||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||||
|
|
||||||
// stopEvent - stop/finalize an event inside an event set
|
// stopEvent - stop/finalize an event inside an event set
|
||||||
// Input
|
// Input
|
||||||
@ -82,13 +82,13 @@ typedef struct {
|
|||||||
// - eHandle : handle to event object created through startEvent
|
// - eHandle : handle to event object created through startEvent
|
||||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||||
// - eState : event state transition
|
// - eState : event state transition
|
||||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||||
|
|
||||||
// finalize - finalize the profiler plugin
|
// finalize - finalize the profiler plugin
|
||||||
// Input
|
// Input
|
||||||
// - context: opaque profiler context object
|
// - context: opaque profiler context object
|
||||||
ncclResult_t (*finalize)(void* context);
|
ncclResult_t (*finalize)(void* context);
|
||||||
} ncclProfiler_v2_t;
|
} ncclProfiler_v3_t;
|
||||||
```
|
```
|
||||||
|
|
||||||
## Error codes
|
## Error codes
|
||||||
@ -156,7 +156,6 @@ typedef struct {
|
|||||||
size_t count; // data count
|
size_t count; // data count
|
||||||
int root; // root rank
|
int root; // root rank
|
||||||
const char* datatype; // string containing the name of the datatype
|
const char* datatype; // string containing the name of the datatype
|
||||||
size_t trafficBytes; // number of transfer bytes
|
|
||||||
uint8_t nMaxChannels; // max number of channels for this collective
|
uint8_t nMaxChannels; // max number of channels for this collective
|
||||||
uint8_t nWarps; // number of GPU warps for this collective
|
uint8_t nWarps; // number of GPU warps for this collective
|
||||||
const char* algo; // string containing name of the algorithm for this collective
|
const char* algo; // string containing name of the algorithm for this collective
|
||||||
@ -185,12 +184,22 @@ typedef struct {
|
|||||||
struct { // proxyStep events metadata
|
struct { // proxyStep events metadata
|
||||||
int step; // individual step in `ncclProxyOp`
|
int step; // individual step in `ncclProxyOp`
|
||||||
} proxyStep;
|
} proxyStep;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
uint8_t channelId; // id of the channel used by the kernel
|
||||||
|
} kernelCh;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||||
|
void* data; // pointer to network plugin defined event
|
||||||
|
} netPlugin;
|
||||||
};
|
};
|
||||||
} ncclProfilerEventDescr_v2_t;
|
} ncclProfilerEventDescr_v3_t;
|
||||||
```
|
```
|
||||||
|
|
||||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
|
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
|
||||||
|
`ncclProfileNetPlugin`.
|
||||||
|
|
||||||
#### stopEvent
|
#### stopEvent
|
||||||
|
|
||||||
@ -236,7 +245,7 @@ typedef enum {
|
|||||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||||
} ncclProfilerEventState_v2_t;
|
} ncclProfilerEventState_v3_t;
|
||||||
```
|
```
|
||||||
|
|
||||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||||
@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
|
|||||||
network requests for the GPU kernel. This includes everything else that the proxy thread might be
|
network requests for the GPU kernel. This includes everything else that the proxy thread might be
|
||||||
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
|
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
|
||||||
|
|
||||||
|
`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
|
||||||
|
processes work items for the enqueued NCCL operations.
|
||||||
|
|
||||||
|
`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
|
||||||
|
their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
|
||||||
|
the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
|
||||||
|
network defined event definition using the plugin id in the event descriptor. The plugin identifier
|
||||||
|
is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
|
||||||
|
16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
|
||||||
|
unused and available for future extensions.
|
||||||
|
|
||||||
|
A network IB plugin can use this infrastructure to define a QP event as:
|
||||||
|
|
||||||
|
```C
|
||||||
|
#define NCCL_PROFILER_NET_IB_VER 1
|
||||||
|
|
||||||
|
enum {
|
||||||
|
ncclProfileQp = (1 << 0),
|
||||||
|
};
|
||||||
|
|
||||||
|
// The data structure version is encoded in the plugin identifier bitmask and
|
||||||
|
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||||
|
// identifier in the event descriptor before calling the profiler startEvent
|
||||||
|
// function. The profiler should inspect the plugin id to find out the source
|
||||||
|
// plugin as well as the version of the event struct
|
||||||
|
typedef struct {
|
||||||
|
uint8_t type; // event type (plugin defined)
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
int device; // network device id
|
||||||
|
uint64_t wr_id; // work request id
|
||||||
|
int opcode; // ibv opcode
|
||||||
|
int qpNum; // QP number
|
||||||
|
size_t length; // work request data length
|
||||||
|
} qp;
|
||||||
|
};
|
||||||
|
} ncclProfilerNetIbDescr_v1_t;
|
||||||
|
```
|
||||||
|
|
||||||
|
The network event infrastructure is network agnostic. A different network socket plugin can
|
||||||
|
use it to define a socket event as:
|
||||||
|
|
||||||
|
```C
|
||||||
|
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||||
|
|
||||||
|
enum {
|
||||||
|
ncclProfileSocket = (1 << 0),
|
||||||
|
};
|
||||||
|
|
||||||
|
// The data structure version is encoded in the plugin identifier bitmask and
|
||||||
|
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||||
|
// identifier in the event descriptor before calling the profiler startEvent
|
||||||
|
// function. The profiler should inspect the plugin id to find out the source
|
||||||
|
// plugin as well as the version of the event struct
|
||||||
|
typedef struct {
|
||||||
|
uint8_t type; // event type (plugin defined)
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
int fd;
|
||||||
|
int op;
|
||||||
|
size_t length;
|
||||||
|
} sock;
|
||||||
|
};
|
||||||
|
} ncclProfilerNetSockDescr_v1_t;
|
||||||
|
```
|
||||||
|
|
||||||
|
The network plugin creates an event (descriptor) and passes it to the profiler callback,
|
||||||
|
along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
|
||||||
|
event descriptor, attaches the network plugin defined event as external data, and calls
|
||||||
|
the profiler `startEvent` function.
|
||||||
|
|
||||||
|
```C
|
||||||
|
ncclResult_t isend(..., void* phandle, ...) {
|
||||||
|
...
|
||||||
|
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||||
|
ncclProfilerNetIbDescr_v1_t eDescr = { };
|
||||||
|
eDescr.type = ncclProfileQp;
|
||||||
|
eDescr.qp = { ... };
|
||||||
|
ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
State transitions for the events described can also come with event attribute updates. For this
|
State transitions for the events described can also come with event attribute updates. For this
|
||||||
reason the profiler defines the `ncclProfilerEventStateArgs_t` union, reported below.
|
reason the profiler defines the `ncclProfilerEventStateArgs_t` union, reported below.
|
||||||
|
|
||||||
@ -264,7 +356,7 @@ typedef union {
|
|||||||
struct { // attributes to update for ncclProfileProxyCtrl
|
struct { // attributes to update for ncclProfileProxyCtrl
|
||||||
int appendedProxyOps; // number of appended proxy ops thus far
|
int appendedProxyOps; // number of appended proxy ops thus far
|
||||||
} proxyCtrl;
|
} proxyCtrl;
|
||||||
} ncclProfilerEventStateArgs_v2_t;
|
} ncclProfilerEventStateArgs_v3_t;
|
||||||
```
|
```
|
||||||
|
|
||||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||||
@ -279,14 +371,22 @@ Group event
|
|||||||
+- Collective event
|
+- Collective event
|
||||||
| |
|
| |
|
||||||
| +- ProxyOp event
|
| +- ProxyOp event
|
||||||
|
| | |
|
||||||
|
| | +- ProxyStep event
|
||||||
|
| | |
|
||||||
|
| | +- NetPlugin event
|
||||||
| |
|
| |
|
||||||
| +- ProxyStep event
|
| +- KernelCh event
|
||||||
|
|
|
|
||||||
+- Point-to-point event
|
+- Point-to-point event
|
||||||
|
|
|
|
||||||
+- ProxyOp event
|
+- ProxyOp event
|
||||||
|
| |
|
||||||
|
| +- ProxyStep event
|
||||||
|
| |
|
||||||
|
| +- NetPlugin event
|
||||||
|
|
|
|
||||||
+- ProxyStep event
|
+- KernelCh event
|
||||||
|
|
||||||
ProxyCtrl event
|
ProxyCtrl event
|
||||||
```
|
```
|
||||||
@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p
|
|||||||
dereference a pointer from another address space the event descriptor includes the PID of the originator.
|
dereference a pointer from another address space the event descriptor includes the PID of the originator.
|
||||||
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
|
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
|
||||||
parent event.
|
parent event.
|
||||||
|
|
||||||
|
# Known Limitations
|
||||||
|
|
||||||
|
In intra-node communication, or whenever a rank does not have any network activity, proxy events
|
||||||
|
are unavailable and the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
|
||||||
|
enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
|
||||||
|
collective. However, this time only represents the launch time of the collective and not the actual
|
||||||
|
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
|
||||||
|
|
||||||
|
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
|
||||||
|
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
|
||||||
|
the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of
|
||||||
|
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
|
||||||
|
delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.
|
||||||
|
@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so
|
|||||||
default: $(PLUGIN_SO)
|
default: $(PLUGIN_SO)
|
||||||
|
|
||||||
$(PLUGIN_SO): plugin.c event.c print_event.c
|
$(PLUGIN_SO): plugin.c event.c print_event.c
|
||||||
$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f $(PLUGIN_SO)
|
rm -f $(PLUGIN_SO)
|
||||||
|
@ -33,10 +33,42 @@
|
|||||||
|
|
||||||
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
|
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
|
||||||
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
|
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
|
||||||
|
#define MAX_EVENTS_PER_REQ (8)
|
||||||
#define MAX_COMM_CLIQUES (32 * 8)
|
|
||||||
|
|
||||||
struct proxyOp;
|
struct proxyOp;
|
||||||
|
struct proxyStep;
|
||||||
|
|
||||||
|
struct netPlugin { // example-profiler record for one network-plugin event; stored in proxyStep.net[]
|
||||||
|
uint8_t type; // event type tag -- presumably ncclProfileNetPlugin; TODO confirm in plugin.c
|
||||||
|
int pluginType; // presumably the type part of the plugin id (NCCL_PROFILER_NET_TYPE_*) -- TODO confirm
|
||||||
|
int pluginVer; // presumably the version part of the plugin id -- TODO confirm
|
||||||
|
uint8_t pluginEvent; // presumably the plugin-defined event type (e.g. ncclProfileQp) -- TODO confirm
|
||||||
|
union { // per-plugin payload; which member is valid is not visible in this header
|
||||||
|
struct { // same fields as the qp member of ncclProfilerNetIbDescr_v1_t
|
||||||
|
int device; // network device id
|
||||||
|
int qpNum; // QP number
|
||||||
|
int opcode; // ibv opcode
|
||||||
|
uint64_t wr_id; // work request id
|
||||||
|
size_t length; // work request data length
|
||||||
|
} qp;
|
||||||
|
struct { // same fields as the sock member of ncclProfilerNetSockDescr_v1_t
|
||||||
|
int fd;
|
||||||
|
int op;
|
||||||
|
size_t length;
|
||||||
|
} sock;
|
||||||
|
};
|
||||||
|
double startTs; // start timestamp (units not visible here)
|
||||||
|
double stopTs; // stop timestamp (units not visible here)
|
||||||
|
struct proxyStep* parent; // presumably the proxyStep whose net[] array holds this event -- TODO confirm
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kernelCh { // example-profiler record for one kernel channel event; embedded per channel in collective and p2p
|
||||||
|
uint8_t type; // event type tag -- presumably ncclProfileKernelCh; TODO confirm in plugin.c
|
||||||
|
uint8_t channelId; // id of the channel used by the kernel
|
||||||
|
struct taskEventBase* parent; // presumably the owning collective/p2p task event -- TODO confirm
|
||||||
|
double startTs; // start timestamp (units not visible here)
|
||||||
|
double stopTs; // stop timestamp (units not visible here)
|
||||||
|
};
|
||||||
|
|
||||||
struct proxyStep {
|
struct proxyStep {
|
||||||
uint8_t type; // type of event: network transfer
|
uint8_t type; // type of event: network transfer
|
||||||
@ -46,6 +78,8 @@ struct proxyStep {
|
|||||||
double startTs;
|
double startTs;
|
||||||
double stopTs;
|
double stopTs;
|
||||||
struct proxyOp* parent;
|
struct proxyOp* parent;
|
||||||
|
struct netPlugin net[MAX_EVENTS_PER_REQ];
|
||||||
|
int nNetEvents;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct proxyOp {
|
struct proxyOp {
|
||||||
@ -101,7 +135,6 @@ struct collective {
|
|||||||
void const* sendBuff;
|
void const* sendBuff;
|
||||||
void* recvBuff;
|
void* recvBuff;
|
||||||
size_t count;
|
size_t count;
|
||||||
size_t trafficBytes;
|
|
||||||
int root;
|
int root;
|
||||||
const char* datatype;
|
const char* datatype;
|
||||||
uint8_t nMaxChannels;
|
uint8_t nMaxChannels;
|
||||||
@ -111,6 +144,7 @@ struct collective {
|
|||||||
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
|
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
|
||||||
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
|
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
|
||||||
int nProxyOps[MAX_CHANNELS];
|
int nProxyOps[MAX_CHANNELS];
|
||||||
|
struct kernelCh kernel[MAX_CHANNELS];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct p2p {
|
struct p2p {
|
||||||
@ -121,6 +155,7 @@ struct p2p {
|
|||||||
const char* datatype;
|
const char* datatype;
|
||||||
int peer;
|
int peer;
|
||||||
struct proxyOp op[MAX_CHANNELS];
|
struct proxyOp op[MAX_CHANNELS];
|
||||||
|
struct kernelCh kernel[MAX_CHANNELS];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct group {
|
struct group {
|
||||||
|
34
ext-profiler/example/nccl/net_ib_v1.h
Normal file
34
ext-profiler/example/nccl/net_ib_v1.h
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NET_IB_V1_H_
|
||||||
|
#define NET_IB_V1_H_
|
||||||
|
|
||||||
|
#define NCCL_PROFILER_NET_IB_VER 1
|
||||||
|
|
||||||
|
enum { // event types defined by the IB network plugin (stored in ncclProfilerNetIbDescr_v1_t.type)
|
||||||
|
ncclProfileQp = (1 << 0), // QP event
|
||||||
|
};
|
||||||
|
|
||||||
|
// The data structure version is encoded in the plugin identifier bitmask and
|
||||||
|
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||||
|
// identifier in the event descriptor before calling the profiler startEvent
|
||||||
|
// function. The profiler should inspect the plugin id to find out the source
|
||||||
|
// plugin as well as the version of the event struct
|
||||||
|
typedef struct { // version 1 of the IB plugin event descriptor
|
||||||
|
uint8_t type; // event type (plugin defined)
|
||||||
|
union { // payload; only the qp member exists in this version
|
||||||
|
struct { // attributes of a ncclProfileQp event
|
||||||
|
int device; // network device id
|
||||||
|
uint64_t wr_id; // work request id
|
||||||
|
int opcode; // ibv opcode
|
||||||
|
int qpNum; // QP number
|
||||||
|
size_t length; // work request data length
|
||||||
|
} qp;
|
||||||
|
};
|
||||||
|
} ncclProfilerNetIbDescr_v1_t;
|
||||||
|
|
||||||
|
#endif
|
32
ext-profiler/example/nccl/net_socket_v1.h
Normal file
32
ext-profiler/example/nccl/net_socket_v1.h
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NET_SOCKET_V1_H_
|
||||||
|
#define NET_SOCKET_V1_H_
|
||||||
|
|
||||||
|
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||||
|
|
||||||
|
enum { // event types defined by the socket network plugin (stored in ncclProfilerNetSockDescr_v1_t.type)
|
||||||
|
ncclProfileSocket = (1 << 0), // socket event
|
||||||
|
};
|
||||||
|
|
||||||
|
// The data structure version is encoded in the plugin identifier bitmask and
|
||||||
|
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||||
|
// identifier in the event descriptor before calling the profiler startEvent
|
||||||
|
// function. The profiler should inspect the plugin id to find out the source
|
||||||
|
// plugin as well as the version of the event struct
|
||||||
|
typedef struct { // version 1 of the socket plugin event descriptor
|
||||||
|
uint8_t type; // event type (plugin defined)
|
||||||
|
union { // payload; only the sock member exists in this version
|
||||||
|
struct { // attributes of a ncclProfileSocket event
|
||||||
|
int fd; // presumably the socket file descriptor -- TODO confirm
|
||||||
|
int op; // plugin-defined operation code; meaning not visible in this header
|
||||||
|
size_t length; // presumably the data length of the operation -- TODO confirm
|
||||||
|
} sock;
|
||||||
|
};
|
||||||
|
} ncclProfilerNetSockDescr_v1_t;
|
||||||
|
|
||||||
|
#endif
|
@ -4,8 +4,8 @@
|
|||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#ifndef NCCL_PROFILER_H_
|
#ifndef PROFILER_H_
|
||||||
#define NCCL_PROFILER_H_
|
#define PROFILER_H_
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -13,7 +13,54 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "err.h"
|
#include "err.h"
|
||||||
|
|
||||||
|
enum { // event types, one bit each; presumably OR-ed into the init() eActivationMask -- TODO confirm
|
||||||
|
ncclProfileGroup = (1 << 0), // group event type
|
||||||
|
ncclProfileColl = (1 << 1), // host collective call event type
|
||||||
|
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||||
|
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||||
|
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||||
|
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||||
|
ncclProfileKernelCh = (1 << 6), // kernel channel event type
|
||||||
|
ncclProfileNetPlugin = (1 << 7), // network plugin-defined events
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef enum { // event state transitions reported through recordEventState; one enum shared by API v1, v2 and v3
|
||||||
|
ncclProfilerProxyOpSendPosted,
|
||||||
|
ncclProfilerProxyOpSendRemFifoWait,
|
||||||
|
ncclProfilerProxyOpSendTransmitted,
|
||||||
|
ncclProfilerProxyOpSendDone,
|
||||||
|
ncclProfilerProxyOpRecvPosted,
|
||||||
|
ncclProfilerProxyOpRecvReceived,
|
||||||
|
ncclProfilerProxyOpRecvTransmitted,
|
||||||
|
ncclProfilerProxyOpRecvDone,
|
||||||
|
||||||
|
|
|
||||||
|
/* Legacy proxy profiler states */
|
||||||
|
ncclProfilerProxyStepSendGPUWait,
|
||||||
|
ncclProfilerProxyStepSendWait,
|
||||||
|
ncclProfilerProxyStepRecvWait,
|
||||||
|
ncclProfilerProxyStepRecvFlushWait,
|
||||||
|
ncclProfilerProxyStepRecvGPUWait,
|
||||||
|
|
||||||
|
/* Legacy proxy control states */
|
||||||
|
ncclProfilerProxyCtrlIdle,
|
||||||
|
ncclProfilerProxyCtrlActive,
|
||||||
|
ncclProfilerProxyCtrlSleep,
|
||||||
|
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||||
|
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||||
|
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||||
|
} ncclProfilerEventState_t;
|
||||||
|
|
||||||
|
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; // every API version aliases the same state enum
|
||||||
|
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||||
|
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; // must be declared before profiler_v3.h is included, which uses it
|
||||||
|
|
||||||
|
#include "profiler_v3.h"
|
||||||
#include "profiler_v2.h"
|
#include "profiler_v2.h"
|
||||||
#include "profiler_v1.h"
|
#include "profiler_v1.h"
|
||||||
|
#include "profiler_net.h"
|
||||||
|
|
||||||
|
typedef ncclProfiler_v3_t ncclProfiler_t; // unversioned names resolve to the v3 API
|
||||||
|
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||||
|
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; // NOTE(review): profiler_v3.h declares these same three aliases; harmless in C11/C++, an error in C99
|
||||||
|
|
||||||
#endif // end include guard
|
#endif // end include guard
|
||||||
|
22
ext-profiler/example/nccl/profiler_net.h
Normal file
22
ext-profiler/example/nccl/profiler_net.h
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef PROFILER_NET_H_
|
||||||
|
#define PROFILER_NET_H_
|
||||||
|
|
||||||
|
#define NCCL_PROFILER_NET_VER_BITS (16) // number of low-order plugin-id bits that carry the plugin event version
|
||||||
|
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) // 0x0000FFFF when unsigned int is 32 bits: selects the version bits
|
||||||
|
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) // 0xFFFF0000 when unsigned int is 32 bits: selects the plugin type bits
|
||||||
|
|
||||||
|
typedef enum { // network plugin types; values sit above the NCCL_PROFILER_NET_VER_BITS version bits of the plugin id
|
||||||
|
NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), // IB plugin (descriptor in net_ib_v1.h)
|
||||||
|
NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), // socket plugin (descriptor in net_socket_v1.h)
|
||||||
|
} ncclProfilerNetType;
|
||||||
|
|
||||||
|
#include "net_ib_v1.h"
|
||||||
|
#include "net_socket_v1.h"
|
||||||
|
|
||||||
|
#endif
|
@ -4,8 +4,8 @@
|
|||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#ifndef NCCL_PROFILER_V1_H_
|
#ifndef PROFILER_V1_H_
|
||||||
#define NCCL_PROFILER_V1_H_
|
#define PROFILER_V1_H_
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
@ -59,8 +59,16 @@ typedef struct {
|
|||||||
};
|
};
|
||||||
} ncclProfilerEventDescr_v1_t;
|
} ncclProfilerEventDescr_v1_t;
|
||||||
|
|
||||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
|
typedef union {
|
||||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
|
struct {
|
||||||
|
size_t transSize;
|
||||||
|
int steps;
|
||||||
|
} proxyOp;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
int appendedProxyOps;
|
||||||
|
} proxyCtrl;
|
||||||
|
} ncclProfilerEventStateArgs_v1_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const char* name;
|
const char* name;
|
||||||
|
@ -4,20 +4,11 @@
|
|||||||
* See LICENSE.txt for license information
|
* See LICENSE.txt for license information
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#ifndef NCCL_PROFILER_V2_H_
|
#ifndef PROFILER_V2_H_
|
||||||
#define NCCL_PROFILER_V2_H_
|
#define PROFILER_V2_H_
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
enum {
|
|
||||||
ncclProfileGroup = (1 << 0), // group event type
|
|
||||||
ncclProfileColl = (1 << 1), // host collective call event type
|
|
||||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
|
||||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
|
||||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
|
||||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||||
@ -65,32 +56,6 @@ typedef struct {
|
|||||||
};
|
};
|
||||||
} ncclProfilerEventDescr_v2_t;
|
} ncclProfilerEventDescr_v2_t;
|
||||||
|
|
||||||
typedef enum {
|
|
||||||
ncclProfilerProxyOpSendPosted,
|
|
||||||
ncclProfilerProxyOpSendRemFifoWait,
|
|
||||||
ncclProfilerProxyOpSendTransmitted,
|
|
||||||
ncclProfilerProxyOpSendDone,
|
|
||||||
ncclProfilerProxyOpRecvPosted,
|
|
||||||
ncclProfilerProxyOpRecvReceived,
|
|
||||||
ncclProfilerProxyOpRecvTransmitted,
|
|
||||||
ncclProfilerProxyOpRecvDone,
|
|
||||||
|
|
||||||
/* Legacy proxy profiler states */
|
|
||||||
ncclProfilerProxyStepSendGPUWait,
|
|
||||||
ncclProfilerProxyStepSendWait,
|
|
||||||
ncclProfilerProxyStepRecvWait,
|
|
||||||
ncclProfilerProxyStepRecvFlushWait,
|
|
||||||
ncclProfilerProxyStepRecvGPUWait,
|
|
||||||
|
|
||||||
/* Legacy proxy control states */
|
|
||||||
ncclProfilerProxyCtrlIdle,
|
|
||||||
ncclProfilerProxyCtrlActive,
|
|
||||||
ncclProfilerProxyCtrlSleep,
|
|
||||||
ncclProfilerProxyCtrlWakeup,
|
|
||||||
ncclProfilerProxyCtrlAppend,
|
|
||||||
ncclProfilerProxyCtrlAppendEnd,
|
|
||||||
} ncclProfilerEventState_v2_t;
|
|
||||||
|
|
||||||
typedef union {
|
typedef union {
|
||||||
struct {
|
struct {
|
||||||
size_t transSize;
|
size_t transSize;
|
||||||
@ -138,9 +103,4 @@ typedef struct {
|
|||||||
ncclResult_t (*finalize)(void* context);
|
ncclResult_t (*finalize)(void* context);
|
||||||
} ncclProfiler_v2_t;
|
} ncclProfiler_v2_t;
|
||||||
|
|
||||||
typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
|
|
||||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
|
|
||||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
|
|
||||||
typedef ncclProfiler_v2_t ncclProfiler_t;
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
119
ext-profiler/example/nccl/profiler_v3.h
Normal file
119
ext-profiler/example/nccl/profiler_v3.h
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef PROFILER_V3_H_
|
||||||
|
#define PROFILER_V3_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef struct { // v3 event descriptor passed to startEvent; the union member in use is selected by `type`
|
||||||
|
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||||
|
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||||
|
int rank; // originating rank
|
||||||
|
union {
|
||||||
|
struct { // collective events metadata
|
||||||
|
const char* name;
|
||||||
|
uint64_t commHash;
|
||||||
|
uint64_t seqNumber;
|
||||||
|
const char* func;
|
||||||
|
void const* sendBuff;
|
||||||
|
void* recvBuff;
|
||||||
|
size_t count; // data count
|
||||||
|
int root; // root rank
|
||||||
|
const char* datatype; // string containing the name of the datatype
|
||||||
|
uint8_t nMaxChannels; // max number of channels for this collective
|
||||||
|
uint8_t nWarps; // number of GPU warps for this collective
|
||||||
|
const char* algo; // string containing name of the algorithm for this collective
|
||||||
|
const char* proto; // presumably the name of the protocol for this collective -- TODO confirm
|
||||||
|
} coll;
|
||||||
|
|
|
||||||
|
struct { // point-to-point events metadata
|
||||||
|
const char* name;
|
||||||
|
uint64_t commHash;
|
||||||
|
const char* func;
|
||||||
|
void* buff;
|
||||||
|
const char* datatype;
|
||||||
|
size_t count;
|
||||||
|
int peer;
|
||||||
|
} p2p;
|
||||||
|
|
|
||||||
|
struct { // proxyOp events metadata
|
||||||
|
pid_t pid; // pid of the originating process; NOTE(review): pid_t is not provided by <stdint.h>, the only header included here
|
||||||
|
uint8_t channelId; // channel id for this proxy operation
|
||||||
|
int peer; // remote rank for send/recv
|
||||||
|
int nSteps; // number of steps for this proxy operation
|
||||||
|
int chunkSize; // amount of data transferred by this proxy operation
|
||||||
|
int isSend; // presumably non-zero for a send operation -- TODO confirm
|
||||||
|
} proxyOp;
|
||||||
|
|
|
||||||
|
struct { // proxyStep events metadata
|
||||||
|
int step; // individual step in `ncclProxyOp`
|
||||||
|
} proxyStep;
|
||||||
|
|
|
||||||
|
struct { // kernel channel events metadata
|
||||||
|
uint8_t channelId; // id of the channel used by the kernel
|
||||||
|
} kernelCh;
|
||||||
|
|
|
||||||
|
struct { // network plugin events metadata
|
||||||
|
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||||
|
void* data; // pointer to network plugin defined event
|
||||||
|
} netPlugin;
|
||||||
|
};
|
||||||
|
} ncclProfilerEventDescr_v3_t;
|
||||||
|
|
||||||
|
typedef union { // optional attribute updates passed to recordEventState together with a state transition
|
||||||
|
struct { // presumably attributes to update for ncclProfileProxyOp -- TODO confirm
|
||||||
|
size_t transSize;
|
||||||
|
int steps;
|
||||||
|
} proxyOp;
|
||||||
|
|
|
||||||
|
struct { // attributes to update for ncclProfileProxyCtrl
|
||||||
|
int appendedProxyOps; // number of appended proxy ops thus far
|
||||||
|
} proxyCtrl;
|
||||||
|
} ncclProfilerEventStateArgs_v3_t;
|
||||||
|
|
||||||
|
typedef struct { // v3 profiler plugin interface: a name plus five entry points
|
||||||
|
const char* name; // plugin name string
|
||||||
|
|
||||||
|
|
|
||||||
|
// init - initialize the profiler plugin
|
||||||
|
// Input
|
||||||
|
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||||
|
// Output
|
||||||
|
// - eActivationMask: bitmask of active events set by the plugin
|
||||||
|
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||||
|
|
|
||||||
|
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||||
|
// Input
|
||||||
|
// - context: opaque profiler context object
|
||||||
|
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||||
|
// Output
|
||||||
|
// - eHandle: return event handle for supplied event descriptor object
|
||||||
|
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||||
|
|
|
||||||
|
// stopEvent - stop/finalize an event inside an event set
|
||||||
|
// Input
|
||||||
|
// - eHandle: handle to event object
|
||||||
|
ncclResult_t (*stopEvent)(void* eHandle);
|
||||||
|
|
|
||||||
|
// recordEventState - record event state transitions and event attribute updates
|
||||||
|
// Input
|
||||||
|
// - eHandle : handle to event object created through startEvent
|
||||||
|
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||||
|
// - eState : event state transition
|
||||||
|
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||||
|
|
|
||||||
|
// finalize - finalize the profiler plugin
|
||||||
|
// Input
|
||||||
|
// - context: opaque profiler context object
|
||||||
|
ncclResult_t (*finalize)(void* context);
|
||||||
|
} ncclProfiler_v3_t;
|
||||||
|
|
||||||
|
// Unversioned aliases: each generic profiler type name resolves to its v3 definition.
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||||
|
typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t;
|
||||||
|
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||||
|
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||||
|
|
||||||
|
#endif
|
@ -58,6 +58,7 @@ __hidden double gettime(void) {
|
|||||||
|
|
||||||
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
static pid_t pid;
|
static pid_t pid;
|
||||||
|
static int* eActivationMaskPtr;
|
||||||
|
|
||||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
|
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
|
||||||
pthread_mutex_lock(&lock);
|
pthread_mutex_lock(&lock);
|
||||||
@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
|||||||
// first thread initializes event mask, environment and detach pool
|
// first thread initializes event mask, environment and detach pool
|
||||||
const char* str;
|
const char* str;
|
||||||
str = getenv("NCCL_PROFILE_EVENT_MASK");
|
str = getenv("NCCL_PROFILE_EVENT_MASK");
|
||||||
__atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
|
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
|
||||||
|
|
||||||
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
|
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
|
||||||
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
|
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
|
||||||
@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
|||||||
}
|
}
|
||||||
pthread_mutex_unlock(&lock);
|
pthread_mutex_unlock(&lock);
|
||||||
|
|
||||||
|
// store pointer to activation mask globally
|
||||||
|
eActivationMaskPtr = eActivationMask;
|
||||||
|
|
||||||
// pre-allocate memory for event object pools in dedicated profiler context
|
// pre-allocate memory for event object pools in dedicated profiler context
|
||||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||||
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
||||||
@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
|||||||
if (base->type == ncclProfileColl) {
|
if (base->type == ncclProfileColl) {
|
||||||
struct collective* c = (struct collective *)base;
|
struct collective* c = (struct collective *)base;
|
||||||
// reset event proxyOps & proxySteps
|
// reset event proxyOps & proxySteps
|
||||||
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
|
|
||||||
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
|
|
||||||
memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
|
memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
|
||||||
// release collective events in the group and return them to the collective pool
|
// release collective events in the group and return them to the collective pool
|
||||||
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
|
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
|
||||||
@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
|||||||
event->count = eDescr->coll.count;
|
event->count = eDescr->coll.count;
|
||||||
event->root = eDescr->coll.root;
|
event->root = eDescr->coll.root;
|
||||||
event->datatype = eDescr->coll.datatype;
|
event->datatype = eDescr->coll.datatype;
|
||||||
event->trafficBytes = eDescr->coll.trafficBytes;
|
|
||||||
event->nMaxChannels = eDescr->coll.nMaxChannels;
|
event->nMaxChannels = eDescr->coll.nMaxChannels;
|
||||||
event->nWarps = eDescr->coll.nWarps;
|
event->nWarps = eDescr->coll.nWarps;
|
||||||
event->algo = eDescr->coll.algo;
|
event->algo = eDescr->coll.algo;
|
||||||
@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
|||||||
event->isSend = parent->isSend;
|
event->isSend = parent->isSend;
|
||||||
event->parent = parent;
|
event->parent = parent;
|
||||||
event->startTs = gettime() - startTime;
|
event->startTs = gettime() - startTime;
|
||||||
|
event->nNetEvents = 0;
|
||||||
*eHandle = event;
|
*eHandle = event;
|
||||||
debugEvent(event, "ProxyStepStart");
|
debugEvent(event, "ProxyStepStart");
|
||||||
|
} else if (eDescr->type == ncclProfileKernelCh) {
|
||||||
|
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
|
||||||
|
if (eventBase == NULL) return ncclSuccess;
|
||||||
|
if (eventBase->type == ncclProfileColl) {
|
||||||
|
struct collective* parent = (struct collective *)eDescr->parentObj;
|
||||||
|
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||||
|
event->type = ncclProfileKernelCh;
|
||||||
|
event->channelId = eDescr->kernelCh.channelId;
|
||||||
|
event->parent = eventBase;
|
||||||
|
event->startTs = gettime() - startTime;
|
||||||
|
*eHandle = event;
|
||||||
|
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||||
|
debugEvent(event, "KernelChStart");
|
||||||
|
} else { // ncclProfileP2p
|
||||||
|
struct p2p* parent = (struct p2p *)eDescr->parentObj;
|
||||||
|
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||||
|
event->type = ncclProfileKernelCh;
|
||||||
|
event->channelId = eDescr->kernelCh.channelId;
|
||||||
|
event->parent = eventBase;
|
||||||
|
event->startTs = gettime() - startTime;
|
||||||
|
*eHandle = event;
|
||||||
|
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||||
|
debugEvent(event, "KernelChStart");
|
||||||
|
}
|
||||||
|
} else if (eDescr->type == ncclProfileNetPlugin) {
|
||||||
|
struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
|
||||||
|
if (parent == NULL) return ncclSuccess;
|
||||||
|
|
||||||
|
int64_t pluginId = eDescr->netPlugin.id;
|
||||||
|
int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
|
||||||
|
int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
|
||||||
|
if (type == NCCL_PROFILER_NET_TYPE_IB) {
|
||||||
|
if (ver == 1) {
|
||||||
|
ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
|
||||||
|
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
|
||||||
|
event->type = ncclProfileNetPlugin;
|
||||||
|
event->pluginType = type;
|
||||||
|
event->pluginVer = ver;
|
||||||
|
if (descr->type == ncclProfileQp) {
|
||||||
|
event->pluginEvent = ncclProfileQp;
|
||||||
|
event->qp.device = descr->qp.device;
|
||||||
|
event->qp.wr_id = descr->qp.wr_id;
|
||||||
|
event->qp.opcode = descr->qp.opcode;
|
||||||
|
event->qp.qpNum = descr->qp.qpNum;
|
||||||
|
event->qp.length = descr->qp.length;
|
||||||
|
}
|
||||||
|
event->startTs = gettime() - startTime;
|
||||||
|
*eHandle = event;
|
||||||
|
debugEvent(event, "NetPluginStart");
|
||||||
|
}
|
||||||
|
} else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
|
||||||
|
if (ver == 1) {
|
||||||
|
ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
|
||||||
|
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
|
||||||
|
event->type = ncclProfileNetPlugin;
|
||||||
|
event->pluginType = type;
|
||||||
|
event->pluginVer = ver;
|
||||||
|
if (descr->type == ncclProfileSocket) {
|
||||||
|
event->pluginEvent = ncclProfileSocket;
|
||||||
|
event->sock.fd = descr->sock.fd;
|
||||||
|
event->sock.op = descr->sock.op;
|
||||||
|
event->sock.length = descr->sock.length;
|
||||||
|
}
|
||||||
|
event->startTs = gettime() - startTime;
|
||||||
|
*eHandle = event;
|
||||||
|
debugEvent(event, "NetPluginStart");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
@ -445,6 +515,15 @@ void updateEvent(void* handle) {
|
|||||||
struct proxyCtrl* event = (struct proxyCtrl *)handle;
|
struct proxyCtrl* event = (struct proxyCtrl *)handle;
|
||||||
event->stopTs = gettime() - startTime;
|
event->stopTs = gettime() - startTime;
|
||||||
debugEvent(event, "ProxyCtrlStop");
|
debugEvent(event, "ProxyCtrlStop");
|
||||||
|
} else if (type == ncclProfileKernelCh) {
|
||||||
|
struct kernelCh* event = (struct kernelCh *)handle;
|
||||||
|
event->stopTs = gettime() - startTime;
|
||||||
|
updateEvent(event->parent);
|
||||||
|
debugEvent(event, "KernelChStop");
|
||||||
|
} else if (type == ncclProfileNetPlugin) {
|
||||||
|
struct netPlugin* event = (struct netPlugin *)handle;
|
||||||
|
event->stopTs = gettime() - startTime;
|
||||||
|
debugEvent(event, "NetPluginStop");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
ncclProfiler_t ncclProfiler_v2 = {
|
ncclProfiler_t ncclProfiler_v3 = {
|
||||||
"Example-profiler",
|
"Example-profiler",
|
||||||
exampleProfilerInit,
|
exampleProfilerInit,
|
||||||
exampleProfilerStartEvent,
|
exampleProfilerStartEvent,
|
||||||
@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = {
|
|||||||
exampleProfilerRecordEventState,
|
exampleProfilerRecordEventState,
|
||||||
exampleProfilerFinalize,
|
exampleProfilerFinalize,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
int exampleProfilerStart(int eActivationMask) {
|
||||||
|
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||||
|
__atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
int exampleProfilerStop(void) {
|
||||||
|
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||||
|
__atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
13
ext-profiler/example/plugin.h
Normal file
13
ext-profiler/example/plugin.h
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef PLUGIN_H_
|
||||||
|
#define PLUGIN_H_
|
||||||
|
|
||||||
|
// Store eActivationMask into the plugin's event activation mask.
// Has no effect if the plugin has not been initialized; returns ncclSuccess.
int exampleProfilerStart(int eActivationMask);
|
||||||
|
// Store 0 into the plugin's event activation mask.
// Has no effect if the plugin has not been initialized; returns ncclSuccess.
int exampleProfilerStop(void);
|
||||||
|
|
||||||
|
#endif
|
@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static __thread int proxyStepId;
|
static __thread int proxyStepId;
|
||||||
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
|
||||||
if (event->isSend) {
|
if (event->isSend) {
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||||
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||||
@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
|||||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
|
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||||
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
|
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
|
||||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
|
||||||
} else {
|
} else {
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||||
@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
|||||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
|
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
|
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
|
||||||
|
if (event->isSend) {
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||||
|
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||||
|
} else {
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
|
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
|
||||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||||
@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __thread int kernelId;
|
||||||
|
__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
|
||||||
|
if (event->type != ncclProfileKernelCh) return;
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n",
|
||||||
|
"KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId);
|
||||||
|
}
|
||||||
|
|
||||||
|
__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
|
||||||
|
if (event->type != ncclProfileKernelCh) return;
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||||
|
"KernelCh", kernelId, getpid(), 1, event->stopTs);
|
||||||
|
}
|
||||||
|
|
||||||
static __thread int proxyCtrlId;
|
static __thread int proxyCtrlId;
|
||||||
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||||
const char* str;
|
const char* str;
|
||||||
@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
|||||||
str, proxyCtrlId++, getpid(), 1, event->stopTs);
|
str, proxyCtrlId++, getpid(), 1, event->stopTs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __thread int ibQpId, sockId;
|
||||||
|
__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
|
||||||
|
if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) {
|
||||||
|
if (event->pluginVer == 1) {
|
||||||
|
if (event->pluginEvent == ncclProfileQp) {
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n",
|
||||||
|
"Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length);
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||||
|
"Qp", ibQpId++, getpid(), 1, event->stopTs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) {
|
||||||
|
if (event->pluginVer == 1) {
|
||||||
|
if (event->pluginEvent == ncclProfileSocket) {
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n",
|
||||||
|
"Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length);
|
||||||
|
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||||
|
"Sock", sockId++, getpid(), 1, event->stopTs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//#define DEBUG_EVENTS
|
//#define DEBUG_EVENTS
|
||||||
void debugEvent(void* eHandle, const char* tag) {
|
void debugEvent(void* eHandle, const char* tag) {
|
||||||
#ifdef DEBUG_EVENTS
|
#ifdef DEBUG_EVENTS
|
||||||
@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) {
|
|||||||
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
|
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
|
||||||
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
|
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
|
||||||
fprintf(fh, " parent = %p\n", event->base.parent);
|
fprintf(fh, " parent = %p\n", event->base.parent);
|
||||||
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
for (int j = 0; j < MAX_OPS; j++) {
|
||||||
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
||||||
|
for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
||||||
|
}
|
||||||
fprintf(fh, " startTs = %f\n", event->base.startTs);
|
fprintf(fh, " startTs = %f\n", event->base.startTs);
|
||||||
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
|
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
|
||||||
fprintf(fh, "}\n");
|
fprintf(fh, "}\n");
|
||||||
@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) {
|
|||||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||||
fprintf(fh, "}\n");
|
fprintf(fh, "}\n");
|
||||||
|
} else if (type == ncclProfileKernelCh) {
|
||||||
|
struct kernelCh* event = (struct kernelCh *)eHandle;
|
||||||
|
fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag);
|
||||||
|
fprintf(fh, " parent = %p\n", event->parent);
|
||||||
|
fprintf(fh, " channel = %d\n", event->channelId);
|
||||||
|
} else if (type == ncclProfileNetPlugin) {
|
||||||
|
struct netPlugin* event = (struct netPlugin *)eHandle;
|
||||||
|
fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag);
|
||||||
|
fprintf(fh, " pluginType = %d\n", event->pluginType);
|
||||||
|
fprintf(fh, " pluginVer = %d\n", event->pluginVer);
|
||||||
|
fprintf(fh, " pluginEvent = %d\n", event->pluginEvent);
|
||||||
|
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||||
|
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||||
|
fprintf(fh, "}\n");
|
||||||
}
|
}
|
||||||
fclose(fh);
|
fclose(fh);
|
||||||
#endif
|
#endif
|
||||||
@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) {
|
|||||||
struct collective* c = (struct collective *)handle;
|
struct collective* c = (struct collective *)handle;
|
||||||
printCollEventHeader(fh, c);
|
printCollEventHeader(fh, c);
|
||||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||||
|
printKernelChEventHeader(fh, &c->kernel[i]);
|
||||||
for (int j = 0; j < c->nProxyOps[i]; j++) {
|
for (int j = 0; j < c->nProxyOps[i]; j++) {
|
||||||
printEvent(fh, &c->send[i][j]);
|
printEvent(fh, &c->send[i][j]);
|
||||||
printEvent(fh, &c->recv[i][j]);
|
printEvent(fh, &c->recv[i][j]);
|
||||||
}
|
}
|
||||||
|
printKernelChEventTrailer(fh, &c->kernel[i]);
|
||||||
}
|
}
|
||||||
printCollEventTrailer(fh, c);
|
printCollEventTrailer(fh, c);
|
||||||
} else if (type == ncclProfileP2p) {
|
} else if (type == ncclProfileP2p) {
|
||||||
struct p2p* p = (struct p2p *)handle;
|
struct p2p* p = (struct p2p *)handle;
|
||||||
printP2pEventHeader(fh, p);
|
printP2pEventHeader(fh, p);
|
||||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||||
|
printKernelChEventHeader(fh, &p->kernel[i]);
|
||||||
printEvent(fh, &p->op[i]);
|
printEvent(fh, &p->op[i]);
|
||||||
|
printKernelChEventTrailer(fh, &p->kernel[i]);
|
||||||
}
|
}
|
||||||
printP2pEventTrailer(fh, p);
|
printP2pEventTrailer(fh, p);
|
||||||
} else if (type == ncclProfileProxyOp) {
|
} else if (type == ncclProfileProxyOp) {
|
||||||
@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) {
|
|||||||
printProxyOpEventTrailer(fh, p);
|
printProxyOpEventTrailer(fh, p);
|
||||||
} else if (type == ncclProfileProxyStep) {
|
} else if (type == ncclProfileProxyStep) {
|
||||||
struct proxyStep* p = (struct proxyStep *)handle;
|
struct proxyStep* p = (struct proxyStep *)handle;
|
||||||
printProxyStepEvent(fh, p);
|
printProxyStepEventHeader(fh, p);
|
||||||
|
for (int q = 0; q < p->nNetEvents; q++) {
|
||||||
|
printNetPluginEvent(fh, &p->net[q]);
|
||||||
|
}
|
||||||
|
printProxyStepEventTrailer(fh, p);
|
||||||
} else if (type == ncclProfileProxyCtrl) {
|
} else if (type == ncclProfileProxyCtrl) {
|
||||||
struct proxyCtrl* p = (struct proxyCtrl *)handle;
|
struct proxyCtrl* p = (struct proxyCtrl *)handle;
|
||||||
printProxyCtrlEvent(fh, p);
|
printProxyCtrlEvent(fh, p);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user