Update ext-profiler example
Sync ext-profiler example with 2.26.2.
This commit is contained in:
parent
f44ac759fe
commit
145e67e707
@ -49,9 +49,9 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v2)
|
||||
# API (v3)
|
||||
|
||||
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
@ -70,7 +70,7 @@ typedef struct {
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
@ -82,13 +82,13 @@ typedef struct {
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
} ncclProfiler_v3_t;
|
||||
```
|
||||
|
||||
## Error codes
|
||||
@ -156,7 +156,6 @@ typedef struct {
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
size_t trafficBytes; // number of transfer bytes
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
@ -185,12 +184,22 @@ typedef struct {
|
||||
struct { // proxyStep events metadata
|
||||
int step; // individual step in `ncclProxyOp`
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId; // id of the channel used by the kernel
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
```
|
||||
|
||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
|
||||
`ncclProfileNetPlugin`.
|
||||
|
||||
#### stopEvent
|
||||
|
||||
@ -236,7 +245,7 @@ typedef enum {
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_v2_t;
|
||||
} ncclProfilerEventState_v3_t;
|
||||
```
|
||||
|
||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||
@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
|
||||
network requests for the GPU kernel. This includes everything else that the proxy thread might be
|
||||
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
|
||||
|
||||
`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
|
||||
processes work items for the enqueued NCCL operations.
|
||||
|
||||
`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
|
||||
their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
|
||||
the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
|
||||
network defined event definition using the plugin id in the event descriptor. The plugin identifier
|
||||
is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
|
||||
16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
|
||||
unused and available for future extensions.
|
||||
|
||||
A network IB plugin can use this infrastructure to define a QP event as:
|
||||
|
||||
```C
|
||||
#define NCCL_PROFILER_NET_IB_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileQp = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
uint64_t wr_id; // work request id
|
||||
int opcode; // ibv opcode
|
||||
int qpNum; // QP number
|
||||
size_t length; // work request data length
|
||||
} qp;
|
||||
};
|
||||
} ncclProfilerNetIbDescr_v1_t;
|
||||
```
|
||||
|
||||
The network event infrastructure is network agnostic. A different network socket plugin can
|
||||
use it to define a socket event as:
|
||||
|
||||
```C
|
||||
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileSocket = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int fd;
|
||||
int op;
|
||||
size_t length;
|
||||
} sock;
|
||||
};
|
||||
} ncclProfilerNetSockDescr_v1_t;
|
||||
```
|
||||
|
||||
The network plugin creates an event (descriptor) and passes it to the profiler callback,
|
||||
along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
|
||||
event descriptor, attaches the network plugin defined event as external data, and calls
|
||||
the profiler `startEvent` function.
|
||||
|
||||
```C
|
||||
ncclResult_t isend(..., void* phandle, ...) {
|
||||
...
|
||||
int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
ncclProfilerNetIbDescr_v1_t eDescr = { };
|
||||
eDescr.type = ncclProfileQp;
|
||||
eDescr.qp = { ... };
|
||||
ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
State transitions for the events described can also come with event attribute updates. For this
|
||||
reason the profiler defines the `ncclProfilerEventStateArgs_t` union, reported below.
|
||||
|
||||
@ -264,7 +356,7 @@ typedef union {
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v2_t;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
```
|
||||
|
||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||
@ -279,14 +371,22 @@ Group event
|
||||
+- Collective event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| | |
|
||||
| | +- ProxyStep event
|
||||
| | |
|
||||
| | +- NetPlugin event
|
||||
| |
|
||||
| +- KernelCh event
|
||||
|
|
||||
+- Point-to-point event
|
||||
|
|
||||
+- ProxyOp event
|
||||
|
|
||||
+- ProxyStep event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| |
|
||||
| +- NetPlugin event
|
||||
|
|
||||
+- KernelCh event
|
||||
|
||||
ProxyCtrl event
|
||||
```
|
||||
@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p
|
||||
dereference a pointer from another address space the event descriptor includes the PID of the originator.
|
||||
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
|
||||
parent event.
|
||||
|
||||
# Known Limitations
|
||||
|
||||
In intra-node communication, or whenever a rank has no network activity so that proxy events
|
||||
are unavailable, the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
|
||||
enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
|
||||
collective. However, this time only represents the launch time of the collective and not the actual
|
||||
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
|
||||
|
||||
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
|
||||
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
|
||||
the proxy is serving network requests, the kernel profiling probes can be delayed, causing loss of
|
||||
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
|
||||
delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.
|
||||
|
@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so
|
||||
default: $(PLUGIN_SO)
|
||||
|
||||
$(PLUGIN_SO): plugin.c event.c print_event.c
|
||||
$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||
$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||
|
||||
clean:
|
||||
rm -f $(PLUGIN_SO)
|
||||
|
@ -33,10 +33,42 @@
|
||||
|
||||
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
|
||||
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
|
||||
|
||||
#define MAX_COMM_CLIQUES (32 * 8)
|
||||
#define MAX_EVENTS_PER_REQ (8)
|
||||
|
||||
struct proxyOp;
|
||||
struct proxyStep;
|
||||
|
||||
// Profiler-side record of one network-plugin event. Instances live in the
// net[] array of the proxyStep event they are attached to.
struct netPlugin {
|
||||
uint8_t type; // event type tag; startEvent sets it to ncclProfileNetPlugin
|
||||
int pluginType; // plugin id masked with NCCL_PROFILER_NET_TYPE_MASK (e.g. NCCL_PROFILER_NET_TYPE_IB)
|
||||
int pluginVer; // plugin id masked with NCCL_PROFILER_NET_VER_MASK (event struct version)
|
||||
uint8_t pluginEvent; // plugin-defined event kind: ncclProfileQp or ncclProfileSocket
|
||||
// payload copied from the plugin's descriptor; which member is valid depends on pluginEvent
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
int qpNum; // QP number
|
||||
int opcode; // ibv opcode
|
||||
uint64_t wr_id; // work request id
|
||||
size_t length; // work request data length
|
||||
} qp; // filled from ncclProfilerNetIbDescr_v1_t for ncclProfileQp events
|
||||
struct {
|
||||
int fd; // copied from the socket descriptor's sock.fd
|
||||
int op; // copied from the socket descriptor's sock.op
|
||||
size_t length; // copied from the socket descriptor's sock.length
|
||||
} sock; // filled from ncclProfilerNetSockDescr_v1_t for ncclProfileSocket events
|
||||
};
|
||||
double startTs; // gettime() - startTime, taken when the event is started
|
||||
double stopTs; // gettime() - startTime, taken when the event is stopped
|
||||
struct proxyStep* parent; // NOTE(review): presumably the owning proxyStep; no assignment is visible in the startEvent code shown here - confirm
|
||||
};
|
||||
|
||||
// Profiler-side record of a kernel channel event. One slot per channel is
// embedded in the parent collective or p2p event (kernel[MAX_CHANNELS]).
struct kernelCh {
|
||||
uint8_t type; // event type tag; startEvent sets it to ncclProfileKernelCh
|
||||
uint8_t channelId; // channel id taken from the event descriptor (kernelCh.channelId)
|
||||
struct taskEventBase* parent; // collective or p2p event this kernel channel belongs to
|
||||
double startTs; // gettime() - startTime, taken when the event is started
|
||||
double stopTs; // gettime() - startTime, taken when the event is stopped
|
||||
};
|
||||
|
||||
struct proxyStep {
|
||||
uint8_t type; // type of event: network transfer
|
||||
@ -46,6 +78,8 @@ struct proxyStep {
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct proxyOp* parent;
|
||||
struct netPlugin net[MAX_EVENTS_PER_REQ];
|
||||
int nNetEvents;
|
||||
};
|
||||
|
||||
struct proxyOp {
|
||||
@ -101,7 +135,6 @@ struct collective {
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
size_t trafficBytes;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nMaxChannels;
|
||||
@ -111,6 +144,7 @@ struct collective {
|
||||
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
|
||||
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
|
||||
int nProxyOps[MAX_CHANNELS];
|
||||
struct kernelCh kernel[MAX_CHANNELS];
|
||||
};
|
||||
|
||||
struct p2p {
|
||||
@ -121,6 +155,7 @@ struct p2p {
|
||||
const char* datatype;
|
||||
int peer;
|
||||
struct proxyOp op[MAX_CHANNELS];
|
||||
struct kernelCh kernel[MAX_CHANNELS];
|
||||
};
|
||||
|
||||
struct group {
|
||||
|
34
ext-profiler/example/nccl/net_ib_v1.h
Normal file
34
ext-profiler/example/nccl/net_ib_v1.h
Normal file
@ -0,0 +1,34 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_IB_V1_H_
|
||||
#define NET_IB_V1_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_IB_VER 1
|
||||
|
||||
// Event kinds defined by the IB network plugin (one bit per kind).
enum {
|
||||
ncclProfileQp = (1 << 0), // QP event, described by the qp member of ncclProfilerNetIbDescr_v1_t
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
// Version 1 of the IB plugin event descriptor. The profiler receives it
// through the netPlugin.data pointer of the NCCL event descriptor.
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
// anonymous union: members are accessed directly, e.g. descr->qp.device
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
uint64_t wr_id; // work request id
|
||||
int opcode; // ibv opcode
|
||||
int qpNum; // QP number
|
||||
size_t length; // work request data length
|
||||
} qp; // valid when type == ncclProfileQp
|
||||
};
|
||||
} ncclProfilerNetIbDescr_v1_t;
|
||||
|
||||
#endif
|
32
ext-profiler/example/nccl/net_socket_v1.h
Normal file
32
ext-profiler/example/nccl/net_socket_v1.h
Normal file
@ -0,0 +1,32 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_SOCKET_V1_H_
|
||||
#define NET_SOCKET_V1_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||
|
||||
// Event kinds defined by the socket network plugin (one bit per kind).
enum {
|
||||
ncclProfileSocket = (1 << 0), // socket event, described by the sock member of ncclProfilerNetSockDescr_v1_t
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
// Version 1 of the socket plugin event descriptor. The profiler receives it
// through the netPlugin.data pointer of the NCCL event descriptor.
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
// anonymous union: members are accessed directly, e.g. descr->sock.fd
|
||||
union {
|
||||
struct {
|
||||
int fd; // presumably the socket file descriptor - TODO confirm with the socket plugin
|
||||
int op; // presumably the socket operation being performed - TODO confirm encoding
|
||||
size_t length; // presumably the data length of the operation - TODO confirm
|
||||
} sock; // valid when type == ncclProfileSocket
|
||||
};
|
||||
} ncclProfilerNetSockDescr_v1_t;
|
||||
|
||||
#endif
|
@ -4,8 +4,8 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
#ifndef PROFILER_H_
|
||||
#define PROFILER_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@ -13,7 +13,54 @@
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
// Event types, one bit each. The value is carried in the uint8_t `type` field
// of the event descriptor, so bit 7 is the last one available at that width.
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
ncclProfileKernelCh = (1 << 6), // kernel channel event type
|
||||
ncclProfileNetPlugin = (1 << 7), // network plugin-defined events
|
||||
};
|
||||
|
||||
// Event state transitions passed to recordEventState. A single enum is used
// for every API version; the _v1/_v2/_v3 names are typedefs of this type.
typedef enum {
|
||||
// proxy operation states, send side
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
// proxy operation states, receive side
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_t;
|
||||
|
||||
// Every API version uses the same state enum; the versioned names exist so the
// profiler_vX.h headers can refer to a version-specific type name.
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
|
||||
#include "profiler_v3.h"
|
||||
#include "profiler_v2.h"
|
||||
#include "profiler_v1.h"
|
||||
#include "profiler_net.h"
|
||||
|
||||
// Unversioned names bound to the v3 API types.
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
22
ext-profiler/example/nccl/profiler_net.h
Normal file
22
ext-profiler/example/nccl/profiler_net.h
Normal file
@ -0,0 +1,22 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_NET_H_
|
||||
#define PROFILER_NET_H_
|
||||
|
||||
// Layout of the network plugin id: the low NCCL_PROFILER_NET_VER_BITS bits
// carry the plugin event version, the bits above them carry the plugin type.
#define NCCL_PROFILER_NET_VER_BITS (16)
|
||||
// Selects the version bits (0x0000FFFF assuming a 32-bit unsigned int).
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
|
||||
// Selects the type bits (0xFFFF0000 assuming a 32-bit unsigned int).
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
|
||||
|
||||
// Network plugin types, already shifted above the version bits so they can be
// OR-ed with a plugin event version to form the plugin id.
typedef enum {
|
||||
NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), // IB network plugin
|
||||
NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), // socket network plugin
|
||||
} ncclProfilerNetType;
|
||||
|
||||
#include "net_ib_v1.h"
|
||||
#include "net_socket_v1.h"
|
||||
|
||||
#endif
|
@ -4,8 +4,8 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_V1_H_
|
||||
#define NCCL_PROFILER_V1_H_
|
||||
#ifndef PROFILER_V1_H_
|
||||
#define PROFILER_V1_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -59,8 +59,16 @@ typedef struct {
|
||||
};
|
||||
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
|
||||
// Optional attribute updates passed to recordEventState together with a state
// transition (v1 API). Which member applies depends on the event being updated.
typedef union {
|
||||
struct {
|
||||
size_t transSize; // NOTE(review): presumably the size transferred so far - confirm
|
||||
int steps; // NOTE(review): presumably the number of steps completed - confirm
|
||||
} proxyOp;
|
||||
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
@ -4,20 +4,11 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_V2_H_
|
||||
#define NCCL_PROFILER_V2_H_
|
||||
#ifndef PROFILER_V2_H_
|
||||
#define PROFILER_V2_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
@ -65,32 +56,6 @@ typedef struct {
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
} ncclProfilerEventState_v2_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
@ -138,9 +103,4 @@ typedef struct {
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v2_t ncclProfiler_t;
|
||||
|
||||
#endif
|
||||
|
119
ext-profiler/example/nccl/profiler_v3.h
Normal file
119
ext-profiler/example/nccl/profiler_v3.h
Normal file
@ -0,0 +1,119 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V3_H_
|
||||
#define PROFILER_V3_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// Event descriptor passed to startEvent (v3 API). `type` selects which member
// of the anonymous union carries the event metadata.
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct { // collective events metadata
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
const char* proto;
|
||||
} coll;
|
||||
|
||||
struct { // point-to-point events metadata
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct { // proxyOp events metadata
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct { // proxyStep events metadata
|
||||
int step; // individual step in `ncclProxyOp`
|
||||
} proxyStep;
|
||||
|
||||
struct { // kernelCh events metadata
|
||||
uint8_t channelId; // id of the channel used by the kernel
|
||||
} kernelCh;
|
||||
|
||||
struct { // netPlugin events metadata
|
||||
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
|
||||
// Optional attribute updates passed to recordEventState together with a state
// transition (v3 API). Which member applies depends on the event being updated.
typedef union {
|
||||
struct {
|
||||
size_t transSize; // NOTE(review): presumably the size transferred so far - confirm
|
||||
int steps; // NOTE(review): presumably the number of steps completed - confirm
|
||||
} proxyOp;
|
||||
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
|
||||
// Profiler plugin interface (v3 API): a name plus the five entry points.
typedef struct {
|
||||
const char* name; // plugin name string (the example plugin uses "Example-profiler")
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v3_t;
|
||||
|
||||
// Unversioned names bound to the v3 API types.
// NOTE(review): profiler.h declares ncclProfilerEventState_t itself and repeats
// three of these aliases after including this header. Identical typedef
// redeclarations are accepted by C11 and C++ but not by C99 - confirm intended.
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
|
||||
#endif
|
@ -58,6 +58,7 @@ __hidden double gettime(void) {
|
||||
|
||||
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pid_t pid;
|
||||
static int* eActivationMaskPtr;
|
||||
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
|
||||
pthread_mutex_lock(&lock);
|
||||
@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
||||
// first thread initializes event mask, environment and detach pool
|
||||
const char* str;
|
||||
str = getenv("NCCL_PROFILE_EVENT_MASK");
|
||||
__atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
|
||||
|
||||
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
|
||||
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
|
||||
@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
||||
}
|
||||
pthread_mutex_unlock(&lock);
|
||||
|
||||
// store pointer to activation mask globally
|
||||
eActivationMaskPtr = eActivationMask;
|
||||
|
||||
// pre-allocate memory for event object pools in dedicated profiler context
|
||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
||||
@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
if (base->type == ncclProfileColl) {
|
||||
struct collective* c = (struct collective *)base;
|
||||
// reset event proxyOps & proxySteps
|
||||
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
|
||||
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
|
||||
memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
|
||||
// release collective events in the group and return them to the collective pool
|
||||
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
|
||||
@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->count = eDescr->coll.count;
|
||||
event->root = eDescr->coll.root;
|
||||
event->datatype = eDescr->coll.datatype;
|
||||
event->trafficBytes = eDescr->coll.trafficBytes;
|
||||
event->nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
event->nWarps = eDescr->coll.nWarps;
|
||||
event->algo = eDescr->coll.algo;
|
||||
@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "ProxyOpStart");
|
||||
}
|
||||
} else if (eDescr->type == ncclProfileProxyStep) {
|
||||
} else if (eDescr->type == ncclProfileProxyStep) {
|
||||
// the parent might be null if we run out of events
|
||||
struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->isSend = parent->isSend;
|
||||
event->parent = parent;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->nNetEvents = 0;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "ProxyStepStart");
|
||||
} else if (eDescr->type == ncclProfileKernelCh) {
|
||||
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
|
||||
if (eventBase == NULL) return ncclSuccess;
|
||||
if (eventBase->type == ncclProfileColl) {
|
||||
struct collective* parent = (struct collective *)eDescr->parentObj;
|
||||
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||
event->type = ncclProfileKernelCh;
|
||||
event->channelId = eDescr->kernelCh.channelId;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "KernelChStart");
|
||||
} else { // ncclProfileP2p
|
||||
struct p2p* parent = (struct p2p *)eDescr->parentObj;
|
||||
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||
event->type = ncclProfileKernelCh;
|
||||
event->channelId = eDescr->kernelCh.channelId;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "KernelChStart");
|
||||
}
|
||||
} else if (eDescr->type == ncclProfileNetPlugin) {
|
||||
struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
int64_t pluginId = eDescr->netPlugin.id;
|
||||
int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
|
||||
int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
|
||||
if (type == NCCL_PROFILER_NET_TYPE_IB) {
|
||||
if (ver == 1) {
|
||||
ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
|
||||
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
|
||||
event->type = ncclProfileNetPlugin;
|
||||
event->pluginType = type;
|
||||
event->pluginVer = ver;
|
||||
if (descr->type == ncclProfileQp) {
|
||||
event->pluginEvent = ncclProfileQp;
|
||||
event->qp.device = descr->qp.device;
|
||||
event->qp.wr_id = descr->qp.wr_id;
|
||||
event->qp.opcode = descr->qp.opcode;
|
||||
event->qp.qpNum = descr->qp.qpNum;
|
||||
event->qp.length = descr->qp.length;
|
||||
}
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "NetPluginStart");
|
||||
}
|
||||
} else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
|
||||
if (ver == 1) {
|
||||
ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
|
||||
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
|
||||
event->type = ncclProfileNetPlugin;
|
||||
event->pluginType = type;
|
||||
event->pluginVer = ver;
|
||||
if (descr->type == ncclProfileSocket) {
|
||||
event->pluginEvent = ncclProfileSocket;
|
||||
event->sock.fd = descr->sock.fd;
|
||||
event->sock.op = descr->sock.op;
|
||||
event->sock.length = descr->sock.length;
|
||||
}
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "NetPluginStart");
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@ -445,6 +515,15 @@ void updateEvent(void* handle) {
|
||||
struct proxyCtrl* event = (struct proxyCtrl *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
debugEvent(event, "ProxyCtrlStop");
|
||||
} else if (type == ncclProfileKernelCh) {
|
||||
struct kernelCh* event = (struct kernelCh *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
updateEvent(event->parent);
|
||||
debugEvent(event, "KernelChStop");
|
||||
} else if (type == ncclProfileNetPlugin) {
|
||||
struct netPlugin* event = (struct netPlugin *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
debugEvent(event, "NetPluginStop");
|
||||
}
|
||||
}
|
||||
|
||||
@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclProfiler_t ncclProfiler_v2 = {
|
||||
ncclProfiler_t ncclProfiler_v3 = {
|
||||
"Example-profiler",
|
||||
exampleProfilerInit,
|
||||
exampleProfilerStartEvent,
|
||||
@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = {
|
||||
exampleProfilerRecordEventState,
|
||||
exampleProfilerFinalize,
|
||||
};
|
||||
|
||||
int exampleProfilerStart(int eActivationMask) {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||
__atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int exampleProfilerStop(void) {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||
__atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
13
ext-profiler/example/plugin.h
Normal file
13
ext-profiler/example/plugin.h
Normal file
@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PLUGIN_H_
#define PLUGIN_H_

// Give the prototypes C linkage when this header is pulled into a C++
// translation unit, so the names are not mangled at the call site.
// NOTE(review): assumes the definitions are built as C - confirm against the build.
#ifdef __cplusplus
extern "C" {
#endif

// Sets the profiler activation mask to eActivationMask.
// Has no effect if the plugin has not been initialized. Returns ncclSuccess.
int exampleProfilerStart(int eActivationMask);

// Resets the profiler activation mask to 0.
// Has no effect if the plugin has not been initialized. Returns ncclSuccess.
int exampleProfilerStop(void);

#ifdef __cplusplus
}
#endif

#endif // PLUGIN_H_
|
@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
|
||||
}
|
||||
|
||||
static __thread int proxyStepId;
|
||||
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
|
||||
if (event->isSend) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
|
||||
}
|
||||
}
|
||||
|
||||
__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
|
||||
if (event->isSend) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
}
|
||||
}
|
||||
|
||||
static __thread int kernelId;
|
||||
__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
|
||||
if (event->type != ncclProfileKernelCh) return;
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n",
|
||||
"KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId);
|
||||
}
|
||||
|
||||
__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
|
||||
if (event->type != ncclProfileKernelCh) return;
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"KernelCh", kernelId, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int proxyCtrlId;
|
||||
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||
const char* str;
|
||||
@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||
str, proxyCtrlId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int ibQpId, sockId;
|
||||
__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
|
||||
if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) {
|
||||
if (event->pluginVer == 1) {
|
||||
if (event->pluginEvent == ncclProfileQp) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n",
|
||||
"Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"Qp", ibQpId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
}
|
||||
} else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) {
|
||||
if (event->pluginVer == 1) {
|
||||
if (event->pluginEvent == ncclProfileSocket) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n",
|
||||
"Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"Sock", sockId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#define DEBUG_EVENTS
|
||||
void debugEvent(void* eHandle, const char* tag) {
|
||||
#ifdef DEBUG_EVENTS
|
||||
@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
|
||||
fprintf(fh, " parent = %p\n", event->base.parent);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
||||
for (int j = 0; j < MAX_OPS; j++) {
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
||||
}
|
||||
fprintf(fh, " startTs = %f\n", event->base.startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
} else if (type == ncclProfileKernelCh) {
|
||||
struct kernelCh* event = (struct kernelCh *)eHandle;
|
||||
fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " parent = %p\n", event->parent);
|
||||
fprintf(fh, " channel = %d\n", event->channelId);
|
||||
} else if (type == ncclProfileNetPlugin) {
|
||||
struct netPlugin* event = (struct netPlugin *)eHandle;
|
||||
fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " pluginType = %d\n", event->pluginType);
|
||||
fprintf(fh, " pluginVer = %d\n", event->pluginVer);
|
||||
fprintf(fh, " pluginEvent = %d\n", event->pluginEvent);
|
||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
}
|
||||
fclose(fh);
|
||||
#endif
|
||||
@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) {
|
||||
struct collective* c = (struct collective *)handle;
|
||||
printCollEventHeader(fh, c);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||
printKernelChEventHeader(fh, &c->kernel[i]);
|
||||
for (int j = 0; j < c->nProxyOps[i]; j++) {
|
||||
printEvent(fh, &c->send[i][j]);
|
||||
printEvent(fh, &c->recv[i][j]);
|
||||
}
|
||||
printKernelChEventTrailer(fh, &c->kernel[i]);
|
||||
}
|
||||
printCollEventTrailer(fh, c);
|
||||
} else if (type == ncclProfileP2p) {
|
||||
struct p2p* p = (struct p2p *)handle;
|
||||
printP2pEventHeader(fh, p);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||
printKernelChEventHeader(fh, &p->kernel[i]);
|
||||
printEvent(fh, &p->op[i]);
|
||||
printKernelChEventTrailer(fh, &p->kernel[i]);
|
||||
}
|
||||
printP2pEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyOp) {
|
||||
@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) {
|
||||
printProxyOpEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyStep) {
|
||||
struct proxyStep* p = (struct proxyStep *)handle;
|
||||
printProxyStepEvent(fh, p);
|
||||
printProxyStepEventHeader(fh, p);
|
||||
for (int q = 0; q < p->nNetEvents; q++) {
|
||||
printNetPluginEvent(fh, &p->net[q]);
|
||||
}
|
||||
printProxyStepEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyCtrl) {
|
||||
struct proxyCtrl* p = (struct proxyCtrl *)handle;
|
||||
printProxyCtrlEvent(fh, p);
|
||||
|
Loading…
x
Reference in New Issue
Block a user