nccl/ext-profiler/example/print_event.c
Sylvain Jeaugey 68b542363f 2.23.4-1
Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple
   unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for
   constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side
   ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already
   available.
 * Add an option to use ncclNet instead of sockets to perform
   bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving
   memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation
   asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors
   (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using
   many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it
   enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than
   the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's
   configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
   present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when linux doesn't
   know which NUMA node a device is attached to
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby
   disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not
   needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
2024-09-16 23:41:17 -07:00

278 lines
16 KiB
C

/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "profiler.h"
#include "event.h"
#include "print_event.h"
#define __hidden __attribute__ ((visibility("hidden")))
__hidden const char* ncclFuncToString(int func) {
switch(func) {
case 0:
return "ncclBroadcast";
case 1:
return "ncclReduce";
case 2:
return "ncclAllGather";
case 3:
return "ncclReduceScatter";
case 4:
return "ncclAllReduce";
case 5:
return "ncclSendRecv";
case 6:
return "ncclSend";
case 7:
return "ncclRecv";
}
return NULL;
}
__hidden const char* ncclAlgoToString(int algo) {
switch(algo) {
case 0:
return "Tree";
case 1:
return "Ring";
case 2:
return "CollnetDirect";
case 3:
return "CollnetChain";
case 4:
return "Nvls";
case 5:
return "NvlsTree";
}
}
__hidden const char* ncclProtoToString(int proto) {
switch(proto) {
case 0:
return "LL";
case 1:
return "LL128";
case 2:
return "Simple";
}
}
// FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
static __thread int groupId;
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
"Group", groupId, getpid(), 1, event->startTs, event->groupId);
}
__hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"Group", groupId++, getpid(), 1, event->stopTs);
}
static __thread int collId;
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels);
}
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs);
}
static __thread int p2pId;
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
}
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs);
}
static __thread int proxyOpId;
__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
if (event->isSend) {
int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted);
int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait);
int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted);
int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
"Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
} else {
int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted);
int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived);
int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted);
int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
"Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
}
}
__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
event->isSend ? "Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs);
}
static __thread int proxyStepId;
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
if (event->isSend) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
} else {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs);
}
}
static __thread int proxyCtrlId;
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
const char* str;
if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) {
str = "Idle";
} else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) {
str = "Sleep";
} else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) {
str = "Append";
}
if (event->state == ncclProfilerProxyCtrlAppendEnd) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
str, proxyCtrlId, getpid(), 1, event->startTs, event->appended);
} else {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
str, proxyCtrlId, getpid(), 1, event->startTs);
}
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
str, proxyCtrlId++, getpid(), 1, event->stopTs);
}
//#define DEBUG_EVENTS
void debugEvent(void* eHandle, const char* tag) {
#ifdef DEBUG_EVENTS
char filename[64] = { 0 };
sprintf(filename, "EventDebug-%d", getpid());
FILE* fh = fopen(filename, "a+");
uint8_t type = *(uint8_t *)eHandle;
if (type == ncclProfileGroup) {
struct group* event = (struct group *)eHandle;
fprintf(fh, "Group event %p tag = %s {\n", event, tag);
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED));
fprintf(fh, " startTs = %f\n", event->startTs);
fprintf(fh, " stopTs = %f\n", event->stopTs);
fprintf(fh, "}\n");
} else if (type == ncclProfileColl) {
struct collective* event = (struct collective *)eHandle;
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
fprintf(fh, " parent = %p\n", event->base.parent);
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
fprintf(fh, " startTs = %f\n", event->base.startTs);
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
fprintf(fh, "}\n");
} else if (type == ncclProfileP2p) {
struct p2p* event = (struct p2p *)eHandle;
fprintf(fh, "P2p event %p tag = %s {\n", event, tag);
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
fprintf(fh, " parent = %p\n", event->base.parent);
fprintf(fh, " op = %p\n", &event->op);
fprintf(fh, " startTs = %f\n", event->base.startTs);
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
fprintf(fh, "}\n");
} else if (type == ncclProfileProxyOp) {
struct proxyOp* event = (struct proxyOp *)eHandle;
fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag);
fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
fprintf(fh, " channel = %d\n", event->channelId);
fprintf(fh, " parent = %p\n", event->parent);
fprintf(fh, " rank = %d\n", event->rank);
fprintf(fh, " startTs = %f\n", event->startTs);
fprintf(fh, " stopTs = %f\n", event->stopTs);
fprintf(fh, "}\n");
} else if (type == ncclProfileProxyStep) {
struct proxyStep* event = (struct proxyStep *)eHandle;
fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag);
fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
fprintf(fh, " parent = %p\n", event->parent);
fprintf(fh, " startTs = %f\n", event->startTs);
fprintf(fh, " stopTs = %f\n", event->stopTs);
fprintf(fh, "}\n");
}
fclose(fh);
#endif
}
void printEvent(FILE* fh, void* handle) {
if (handle == NULL || fh == NULL) return;
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
struct group* g = (struct group *)handle;
printGroupEventHeader(fh, g);
struct taskEventBase* base = taskEventQueueHead(g);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printGroupEventTrailer(fh, g);
} else if (type == ncclProfileColl) {
struct collective* c = (struct collective *)handle;
printCollEventHeader(fh, c);
for (int i = 0; i < MAX_CHANNELS; i++) {
printEvent(fh, &c->send[i]);
printEvent(fh, &c->recv[i]);
}
printCollEventTrailer(fh, c);
} else if (type == ncclProfileP2p) {
struct p2p* p = (struct p2p *)handle;
printP2pEventHeader(fh, p);
printEvent(fh, &p->op);
printP2pEventTrailer(fh, p);
} else if (type == ncclProfileProxyOp) {
struct proxyOp* p = (struct proxyOp *)handle;
printProxyOpEventHeader(fh, p);
for (int i = 0; i < MAX_STEPS; i++) {
printEvent(fh, &p->step[i]);
}
printProxyOpEventTrailer(fh, p);
} else if (type == ncclProfileProxyStep) {
struct proxyStep* p = (struct proxyStep *)handle;
printProxyStepEvent(fh, p);
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* p = (struct proxyCtrl *)handle;
printProxyCtrlEvent(fh, p);
}
return;
}