From 145e67e70745c5f78f18334f82de29dbe59bde63 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Wed, 9 Apr 2025 09:02:40 -0700 Subject: [PATCH] Update ext-profiler example Sync ext-profiler example with 2.26.2. --- ext-profiler/README.md | 142 +++++++++++++++++++--- ext-profiler/example/Makefile | 2 +- ext-profiler/example/event.h | 41 ++++++- ext-profiler/example/nccl/net_ib_v1.h | 34 ++++++ ext-profiler/example/nccl/net_socket_v1.h | 32 +++++ ext-profiler/example/nccl/profiler.h | 51 +++++++- ext-profiler/example/nccl/profiler_net.h | 22 ++++ ext-profiler/example/nccl/profiler_v1.h | 16 ++- ext-profiler/example/nccl/profiler_v2.h | 44 +------ ext-profiler/example/nccl/profiler_v3.h | 119 ++++++++++++++++++ ext-profiler/example/plugin.c | 105 +++++++++++++++- ext-profiler/example/plugin.h | 13 ++ ext-profiler/example/print_event.c | 78 +++++++++++- 13 files changed, 621 insertions(+), 78 deletions(-) create mode 100644 ext-profiler/example/nccl/net_ib_v1.h create mode 100644 ext-profiler/example/nccl/net_socket_v1.h create mode 100644 ext-profiler/example/nccl/profiler_net.h create mode 100644 ext-profiler/example/nccl/profiler_v3.h create mode 100644 ext-profiler/example/plugin.h diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 7ef44b2..2a4018c 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v2) +# API (v3) -Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections. 
``` typedef struct { @@ -70,7 +70,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -82,13 +82,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; +} ncclProfiler_v3_t; ``` ## Error codes @@ -156,7 +156,6 @@ typedef struct { size_t count; // data count int root; // root rank const char* datatype; // string containing the name of the datatype - size_t trafficBytes; // number of transfer bytes uint8_t nMaxChannels; // max number of channels for this collective uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective @@ -185,12 +184,22 @@ typedef struct { struct { // proxyStep events metadata int step; // individual step in `ncclProxyOp` } proxyStep; + + struct { + uint8_t channelId; // id of the channel used by the kernel + } kernelCh; + + struct { + int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions) + void* data; // pointer to network plugin defined event + } netPlugin; }; -} 
ncclProfilerEventDescr_v2_t; +} ncclProfilerEventDescr_v3_t; ``` NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, -`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`. +`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and +`ncclProfileNetPlugin`. #### stopEvent @@ -236,7 +245,7 @@ typedef enum { ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end -} ncclProfilerEventState_v2_t; +} ncclProfilerEventState_v3_t; ``` `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing @@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events. network requests for the GPU kernel. This includes everything else that the proxy thread might be doing, including appending new `ncclProxyOp` objects to the list of work elements to process. +`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel +processes work items for the enqueued NCCL operations. + +`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define +their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and +the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the +network defined event definition using the plugin id in the event descriptor. The plugin identifier +is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next +16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are +unused and available for future extensions.
+ +A network IB plugin can use this infrastructure to define a QP event as: + +```C +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; +``` + +The network event infrastructure is network agnostic. A different network socket plugin can +use it to define a socket event as: + +```C +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; +``` + +The network plugin creates an event (descriptor) and passes it to the profiler callback, +along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin` +event descriptor, attaches the network plugin defined event as external data, and calls +the profiler `startEvent` function. + +```C +ncclResult_t isend(..., void* phandle, ...) { + ... 
+ int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + ncclProfilerNetIbDescr_v1_t eDescr = { }; + eDescr.type = ncclProfileQp; + eDescr.qp = { ... }; + ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr); + ... +} +``` + State transitions for the events described can also come with event attribute updates. For this reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below. @@ -264,7 +356,7 @@ typedef union { struct { // attributes to update for ncclProfileProxyCtrl int appendedProxyOps; // number of appended proxy ops thus far } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; +} ncclProfilerEventStateArgs_v3_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. @@ -279,14 +371,22 @@ Group event +- Collective event | | | +- ProxyOp event - | | - | +- ProxyStep event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event | +- Point-to-point event | +- ProxyOp event - | - +- ProxyStep event + | | + | +- ProxyStep event + | | + | +- NetPlugin event + | + +- KernelCh event ProxyCtrl event ``` @@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p dereference a pointer from another address space the event descriptor includes the PID of the originator. The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the parent event. + +# Known Limitations + +In intra-node communication, or whenever a rank does not have any network activity, proxy events +are unavailable and the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from +enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the +collective. However, this time only represents the launch time of the collective and not the actual +execution time.
To reconstruct the execution time more accurately proxy and kernel events are provided. + +Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress +thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If +the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of +accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is +delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events. diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index ee8e0cf..f5cc9f1 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so default: $(PLUGIN_SO) $(PLUGIN_SO): plugin.c event.c print_event.c - $(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ clean: rm -f $(PLUGIN_SO) diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 1486a22..0638f2d 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -33,10 +33,42 @@ #define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) #define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? 
NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) - -#define MAX_COMM_CLIQUES (32 * 8) +#define MAX_EVENTS_PER_REQ (8) struct proxyOp; +struct proxyStep; + +struct netPlugin { + uint8_t type; + int pluginType; + int pluginVer; + uint8_t pluginEvent; + union { + struct { + int device; + int qpNum; + int opcode; + uint64_t wr_id; + size_t length; + } qp; + struct { + int fd; + int op; + size_t length; + } sock; + }; + double startTs; + double stopTs; + struct proxyStep* parent; +}; + +struct kernelCh { + uint8_t type; + uint8_t channelId; + struct taskEventBase* parent; + double startTs; + double stopTs; +}; struct proxyStep { uint8_t type; // type of event: network transfer @@ -46,6 +78,8 @@ struct proxyStep { double startTs; double stopTs; struct proxyOp* parent; + struct netPlugin net[MAX_EVENTS_PER_REQ]; + int nNetEvents; }; struct proxyOp { @@ -101,7 +135,6 @@ struct collective { void const* sendBuff; void* recvBuff; size_t count; - size_t trafficBytes; int root; const char* datatype; uint8_t nMaxChannels; @@ -111,6 +144,7 @@ struct collective { struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events int nProxyOps[MAX_CHANNELS]; + struct kernelCh kernel[MAX_CHANNELS]; }; struct p2p { @@ -121,6 +155,7 @@ struct p2p { const char* datatype; int peer; struct proxyOp op[MAX_CHANNELS]; + struct kernelCh kernel[MAX_CHANNELS]; }; struct group { diff --git a/ext-profiler/example/nccl/net_ib_v1.h b/ext-profiler/example/nccl/net_ib_v1.h new file mode 100644 index 0000000..f142de5 --- /dev/null +++ b/ext-profiler/example/nccl/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/ext-profiler/example/nccl/net_socket_v1.h b/ext-profiler/example/nccl/net_socket_v1.h new file mode 100644 index 0000000..0cb664f --- /dev/null +++ b/ext-profiler/example/nccl/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. 
The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index 6680cfe..d02202d 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -4,8 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ +#ifndef PROFILER_H_ +#define PROFILER_H_ #include #include @@ -13,7 +13,54 @@ #include "common.h" #include "err.h" +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, +
ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; + +#include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" +#include "profiler_net.h" + +typedef ncclProfiler_v3_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_net.h b/ext-profiler/example/nccl/profiler_net.h new file mode 100644 index 0000000..2d087ca --- /dev/null +++ b/ext-profiler/example/nccl/profiler_net.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_NET_H_ +#define PROFILER_NET_H_ + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#include "net_ib_v1.h" +#include "net_socket_v1.h" + +#endif diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h index 7d34bed..e7d316d 100644 --- a/ext-profiler/example/nccl/profiler_v1.h +++ b/ext-profiler/example/nccl/profiler_v1.h @@ -4,8 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_V1_H_ -#define NCCL_PROFILER_V1_H_ +#ifndef PROFILER_V1_H_ +#define 
PROFILER_V1_H_ #include @@ -59,8 +59,16 @@ typedef struct { }; } ncclProfilerEventDescr_v1_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; diff --git a/ext-profiler/example/nccl/profiler_v2.h b/ext-profiler/example/nccl/profiler_v2.h index aab4ccf..4be600d 100644 --- a/ext-profiler/example/nccl/profiler_v2.h +++ b/ext-profiler/example/nccl/profiler_v2.h @@ -4,20 +4,11 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_V2_H_ -#define NCCL_PROFILER_V2_H_ +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ #include -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - typedef struct { uint8_t type; // event type descriptor: ncclProfileColl, ... 
void* parentObj; // pointer to the profiler parent object (for coll is the group) @@ -65,32 +56,6 @@ typedef struct { }; } ncclProfilerEventDescr_v2_t; -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - typedef union { struct { size_t transSize; @@ -138,9 +103,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v2_t; -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v3.h b/ext-profiler/example/nccl/profiler_v3.h new file mode 100644 index 0000000..c1f1b91 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v3.h @@ -0,0 +1,119 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event 
inside an event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v3_t ncclProfiler_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index 64d5d8b..08408db 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -58,6 +58,7 @@ __hidden double gettime(void) { static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; +static int* eActivationMaskPtr; __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) { pthread_mutex_lock(&lock); @@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) // first thread initializes event mask, environment and detach pool const char* str; str = getenv("NCCL_PROFILE_EVENT_MASK"); - __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED); + __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED); str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); groupPoolSize = str ?
atoi(str) : defaultGroupPoolSize; @@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) } pthread_mutex_unlock(&lock); + // store pointer to activation mask globally + eActivationMaskPtr = eActivationMask; + // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); @@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (base->type == ncclProfileColl) { struct collective* c = (struct collective *)base; // reset event proxyOps & proxySteps - memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); - memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); // release collective events in the group and return them to the collective pool __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); @@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->trafficBytes = eDescr->coll.trafficBytes; event->nMaxChannels = eDescr->coll.nMaxChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; @@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } - } else if (eDescr->type == ncclProfileProxyStep) { + } else if (eDescr->type == ncclProfileProxyStep) { // the parent might be null if we run out of events struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj; if (parent == NULL) return ncclSuccess; @@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = 
parent->isSend; event->parent = parent; event->startTs = gettime() - startTime; + event->nNetEvents = 0; *eHandle = event; debugEvent(event, "ProxyStepStart"); + } else if (eDescr->type == ncclProfileKernelCh) { + struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj; + if (eventBase == NULL) return ncclSuccess; + if (eventBase->type == ncclProfileColl) { + struct collective* parent = (struct collective *)eDescr->parentObj; + struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; + event->type = ncclProfileKernelCh; + event->channelId = eDescr->kernelCh.channelId; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "KernelChStart"); + } else { // ncclProfileP2p + struct p2p* parent = (struct p2p *)eDescr->parentObj; + struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; + event->type = ncclProfileKernelCh; + event->channelId = eDescr->kernelCh.channelId; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "KernelChStart"); + } + } else if (eDescr->type == ncclProfileNetPlugin) { + struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj; + if (parent == NULL) return ncclSuccess; + + int64_t pluginId = eDescr->netPlugin.id; + int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK; + int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK; + if (type == NCCL_PROFILER_NET_TYPE_IB) { + if (ver == 1) { + ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data; + struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED); + event->type = ncclProfileNetPlugin; + event->pluginType = type; + event->pluginVer = ver; + if (descr->type == ncclProfileQp) { + event->pluginEvent = ncclProfileQp; + 
event->qp.device = descr->qp.device; + event->qp.wr_id = descr->qp.wr_id; + event->qp.opcode = descr->qp.opcode; + event->qp.qpNum = descr->qp.qpNum; + event->qp.length = descr->qp.length; + } + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "NetPluginStart"); + } + } else if (type == NCCL_PROFILER_NET_TYPE_SOCK) { + if (ver == 1) { + ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data; + struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED); + event->type = ncclProfileNetPlugin; + event->pluginType = type; + event->pluginVer = ver; + if (descr->type == ncclProfileSocket) { + event->pluginEvent = ncclProfileSocket; + event->sock.fd = descr->sock.fd; + event->sock.op = descr->sock.op; + event->sock.length = descr->sock.length; + } + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "NetPluginStart"); + } + } } return ncclSuccess; } @@ -445,6 +515,15 @@ void updateEvent(void* handle) { struct proxyCtrl* event = (struct proxyCtrl *)handle; event->stopTs = gettime() - startTime; debugEvent(event, "ProxyCtrlStop"); + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)handle; + event->stopTs = gettime() - startTime; + updateEvent(event->parent); + debugEvent(event, "KernelChStop"); + } else if (type == ncclProfileNetPlugin) { + struct netPlugin* event = (struct netPlugin *)handle; + event->stopTs = gettime() - startTime; + debugEvent(event, "NetPluginStop"); } } @@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_t ncclProfiler_v2 = { +ncclProfiler_t ncclProfiler_v3 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, @@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = { exampleProfilerRecordEventState, exampleProfilerFinalize, }; + +int exampleProfilerStart(int eActivationMask) { 
+ if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { /* the mask is only written once the initialized flag is set */ + __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED); + } + return ncclSuccess; +} + +int exampleProfilerStop(void) { /* Writes 0 through eActivationMaskPtr if the initialized flag is set; returns ncclSuccess either way. */ + if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { + __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED); + } + return ncclSuccess; +} diff --git a/ext-profiler/example/plugin.h b/ext-profiler/example/plugin.h new file mode 100644 index 0000000..b4d0706 --- /dev/null +++ b/ext-profiler/example/plugin.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PLUGIN_H_ +#define PLUGIN_H_ + +int exampleProfilerStart(int eActivationMask); /* store eActivationMask as the activation mask; no effect until the plugin is initialized */ +int exampleProfilerStop(void); /* store 0 as the activation mask; no effect until the plugin is initialized */ + +#endif diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index f26a9ee..43f7190 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { } static __thread int proxyStepId; -__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { +__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) { /* Writes the leading trace records of a proxy step; printProxyStepEventTrailer writes the closing ones. */ if (event->isSend) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); @@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", 
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendWait", proxyStepId++, getpid(), 1, event->stopTs); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); @@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step); + } +} + +__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { /* Writes the closing trace records of a proxy step. */ + if (event->isSend) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "SendWait", proxyStepId++, getpid(), 1, event->stopTs); /* post-increment so the next proxy step printed by this thread gets a new id */ + } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", @@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { } } +static __thread int kernelId; /* per-thread id shared by the b and e records of one KernelCh event */ +__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) { /* Writes the opening (ph b) record of a kernel channel event. */ + if (event->type != ncclProfileKernelCh) return; /* nothing is printed for slots that do not hold a KernelCh event */ + fprintf(fh, "{\"name\": \"%s\", 
\"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n", + "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId); +} + +__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) { /* Writes the closing (ph e) record of a kernel channel event and retires its id. */ + if (event->type != ncclProfileKernelCh) return; + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "KernelCh", kernelId++, getpid(), 1, event->stopTs); /* advance the id on the closing record, as proxyCtrlId/ibQpId/sockId do, so successive KernelCh events do not all share id 0 */ +} + static __thread int proxyCtrlId; __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { const char* str; @@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { str, proxyCtrlId++, getpid(), 1, event->stopTs); } +static __thread int ibQpId, sockId; /* per-thread ids for Qp and Sock trace records */ +__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) { /* Writes a b/e record pair for a v1 IB Qp event or a v1 socket event; any other plugin type, version or event prints nothing. */ + if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) { + if (event->pluginVer == 1) { + if (event->pluginEvent == ncclProfileQp) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n", + "Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Qp", ibQpId++, getpid(), 1, event->stopTs); + } + } + } else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) { + if (event->pluginVer == 1) { + if (event->pluginEvent == ncclProfileSocket) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n", + "Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length); + fprintf(fh, "{\"name\": \"%s\", 
\"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Sock", sockId++, getpid(), 1, event->stopTs); + } + } + } +} + //#define DEBUG_EVENTS void debugEvent(void* eHandle, const char* tag) { #ifdef DEBUG_EVENTS @@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, "Collective event %p tag = %s {\n", event, tag); fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); fprintf(fh, " parent = %p\n", event->base.parent); - for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); - for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + for (int j = 0; j < MAX_OPS; j++) { /* send/recv are indexed [channel][op]; print each populated slot with both indices and its own address */ + for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d][%d] = %p\n", i, j, (void *)&event->send[i][j]); + for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d][%d] = %p\n", i, j, (void *)&event->recv[i][j]); + } fprintf(fh, " startTs = %f\n", event->base.startTs); fprintf(fh, " stopTs = %f\n", event->base.stopTs); fprintf(fh, "}\n"); @@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, " startTs = %f\n", event->startTs); fprintf(fh, " stopTs = %f\n", event->stopTs); fprintf(fh, "}\n"); + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)eHandle; + fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag); + fprintf(fh, " parent = %p\n", event->parent); + fprintf(fh, " channel = %d\n}\n", event->channelId); /* also emits the closing brace that ends this event's block, as the other branches do */ + } else if (type == ncclProfileNetPlugin) { + struct netPlugin* event = (struct netPlugin *)eHandle; + fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag); + fprintf(fh, " pluginType = %d\n", event->pluginType); + fprintf(fh, " pluginVer = %d\n", event->pluginVer); + fprintf(fh, " 
pluginEvent = %d\n", event->pluginEvent); + fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " stopTs = %f\n", event->stopTs); + fprintf(fh, "}\n"); } fclose(fh); #endif @@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) { struct collective* c = (struct collective *)handle; printCollEventHeader(fh, c); for (int i = 0; i < MAX_CHANNELS; i++) { + printKernelChEventHeader(fh, &c->kernel[i]); for (int j = 0; j < c->nProxyOps[i]; j++) { printEvent(fh, &c->send[i][j]); printEvent(fh, &c->recv[i][j]); } + printKernelChEventTrailer(fh, &c->kernel[i]); } printCollEventTrailer(fh, c); } else if (type == ncclProfileP2p) { struct p2p* p = (struct p2p *)handle; printP2pEventHeader(fh, p); for (int i = 0; i < MAX_CHANNELS; i++) { + printKernelChEventHeader(fh, &p->kernel[i]); printEvent(fh, &p->op[i]); + printKernelChEventTrailer(fh, &p->kernel[i]); } printP2pEventTrailer(fh, p); } else if (type == ncclProfileProxyOp) { @@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) { printProxyOpEventTrailer(fh, p); } else if (type == ncclProfileProxyStep) { struct proxyStep* p = (struct proxyStep *)handle; - printProxyStepEvent(fh, p); + printProxyStepEventHeader(fh, p); + for (int q = 0; q < p->nNetEvents; q++) { + printNetPluginEvent(fh, &p->net[q]); + } + printProxyStepEventTrailer(fh, p); } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* p = (struct proxyCtrl *)handle; printProxyCtrlEvent(fh, p);