NCCL 2.27.5-1 (Kamil Iskra, commit 3ea7eedf3b, 2025-06-18)
Improvements for GB200 systems
* Optimize network performance by alternating the ring direction and the
  NIC-to-GPU assignment across communicators to limit unnecessary
  sharing.
* Fix the detection of C2C links when GPU Direct RDMA is disabled
  between a GPU and a NIC.
* Fix PXN support on MNNVL systems, where NCCL would try (and fail) to
  share regular host memory across multiple nodes.
* Fix P2C (PXN over C2C), which is now preferred over regular PXN.  This
  support is currently preliminary and is disabled by default; use
  NCCL_PXN_C2C=1 to enable.

Further reduce the overhead of CUDA graph capture, which had increased in
NCCL 2.26.2 for large graphs.

Optimize the network performance on DGX B200 systems by adjusting the
bandwidths provided to the graph search algorithm.

Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8.

Restore the plugin name handling logic to make it possible to specify a
path to the plugin (Issue #1732).

Restore the ability to change NCCL_COLLNET_ENABLE during execution
(Issue #1741).

Add an example tuner plugin with CSV-based overrides.

Remove an x86 dependency from the example profiler.
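
The example profiler below is configured entirely through environment
variables read at init time. A minimal sketch of driving it from an
application follows; the plugin path and the mask value are illustrative
assumptions, while NCCL_PROFILER_PLUGIN is the variable NCCL uses to locate
profiler plugins:

/* Minimal usage sketch. Assumptions: the plugin was built as
 * libnccl-profiler-example.so in the current directory; the mask 4095
 * (all ncclProfile* bits) is illustrative only. */
#include <stdlib.h>

int main(void) {
  // NCCL dlopens the profiler named here when a communicator is created.
  setenv("NCCL_PROFILER_PLUGIN", "./libnccl-profiler-example.so", 1);
  // Bitwise OR of ncclProfile* event types to record (see event.h).
  setenv("NCCL_PROFILE_EVENT_MASK", "4095", 1);
  // Dump a JSON trace per communicator as profile_<commHash>_<rank>.json.
  setenv("NCCL_PROFILE_DUMP_FILE", "profile", 1);
  // ... create NCCL communicators and run collectives as usual ...
  return 0;
}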

/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>
#include "event.h"
#include "print_event.h"
#define __hidden __attribute__ ((visibility("hidden")))
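// Hide internal symbols from the plugin's dynamic symbol table; only the
// exported entry points (ncclProfiler_v4 and the start/stop helpers at the
// bottom) remain visible.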
static int initialized; // initialization counter for profiler
static double startTime; // profiler start time
static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
static const int defaultGroupPoolSize = 16;
static const int defaultCollPoolSize = 16;
static const int defaultP2pPoolSize = 1024;
static const int defaultProxyCtrlPoolSize = 16;
static const int defaultDetachPoolSize = 128;
static int groupPoolSize;
static int collPoolSize;
static int p2pPoolSize;
static int proxyCtrlPoolSize;
static int detachPoolSize;
static int detachPoolBase;
static int detachPoolIndex;
static int detachPoolDone;
static struct proxyOp* detachPool;
ncclDebugLogger_t logFn;
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
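// Returns monotonic time in microseconds; event timestamps are recorded
// relative to the profiler's startTime.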
__hidden double gettime(void) {
  struct timespec t;
  clock_gettime(CLOCK_MONOTONIC, &t);
  return (t.tv_sec*1e6 + (t.tv_nsec*1e-3));
}
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pid_t pid;
static int* eActivationMaskPtr;
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    // first thread initializes event mask, environment and detach pool
    const char* str;
    str = getenv("NCCL_PROFILE_EVENT_MASK");
    // fall back to the default mask (coll + p2p) when the env var is unset
    __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
    str = getenv("NCCL_PROFILE_COLL_POOL_SIZE");
    collPoolSize = str ? atoi(str) : defaultCollPoolSize;
    str = getenv("NCCL_PROFILE_P2P_POOL_SIZE");
    p2pPoolSize = str ? atoi(str) : defaultP2pPoolSize;
    str = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE");
    proxyCtrlPoolSize = str ? atoi(str) : defaultProxyCtrlPoolSize;
    str = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE");
    detachPoolSize = str ? atoi(str) : defaultDetachPoolSize;

    // detach pool is used to store PXN proxyOps and is shared among threads
    detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
    if (detachPool == NULL) {
      pthread_mutex_unlock(&lock);
      return ncclSystemError;
    }

    // Pid of the process initializing the profiler first.
    // This is compared against the pid of proxyOp events
    // to figure out if they have a parent event in this
    // process address space.
    pid = getpid();
    startTime = gettime();
  }
  pthread_mutex_unlock(&lock);

  // store pointer to activation mask globally
  eActivationMaskPtr = eActivationMask;

  // pre-allocate memory for event object pools in dedicated profiler context
  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
  if (ctx == NULL) return ncclSystemError; // guard before dereferencing below
  ctx->commName = commName;
  ctx->commHash = commHash;
  ctx->nranks = nranks;
  ctx->rank = rank;
  logFn = logfn;
  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);

  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
  if (ctx->groupPool == NULL) goto fail;

  ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool));
  if (ctx->collPool == NULL) goto fail;

  ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool));
  if (ctx->p2pPool == NULL) goto fail;

  ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
  if (ctx->proxyCtrlPool == NULL) goto fail;

  // Print event pool sizes for debugging
  //fprintf(stdout, "Profiler: Group pool size (bytes): %lu\n", sizeof(struct group)*groupPoolSize);
  //fprintf(stdout, "Profiler: Coll pool size (bytes): %lu\n", sizeof(struct collective)*collPoolSize);
  //fprintf(stdout, "Profiler: P2p pool size (bytes): %lu\n", sizeof(struct p2p)*p2pPoolSize);
  //fprintf(stdout, "Profiler: Proxy pool size (bytes): %lu\n", sizeof(struct proxyCtrl)*proxyCtrlPoolSize);
  //fprintf(stdout, "Profiler: PXN pool size (bytes): %lu\n", sizeof(struct proxyOp)*detachPoolSize);

  *context = ctx;
  return ncclSuccess;

fail:
  // cleanup resources
  if (ctx->proxyCtrlPool) free(ctx->proxyCtrlPool);
  if (ctx->p2pPool) free(ctx->p2pPool);
  if (ctx->collPool) free(ctx->collPool);
  if (ctx->groupPool) free(ctx->groupPool);
  free(ctx);
  if (detachPool) { free(detachPool); detachPool = NULL; } // avoid a later double free in finalize
  return ncclSystemError;
}
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
  FILE* fh = NULL;
  char filename[PATH_MAX] = { 0 };
  struct context* ctx = (struct context *)context;
  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
  if (dump) {
    snprintf(filename, sizeof(filename), "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
    fh = fopen(filename, "w");
    if (fh) fprintf(fh, "[\n"); // fopen may fail; printEvent tolerates a NULL stream
  }
  INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);

  // print last N groups/collectives/p2ps
  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
  int end = ctx->groupPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
  }

  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
  end = ctx->proxyCtrlPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]);
  }

  free(ctx->groupPool);
  free(ctx->collPool);
  free(ctx->p2pPool);
  free(ctx->proxyCtrlPool);
  free(ctx);

  // last thread cleans up shared detach pool
  if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
    end = detachPoolIndex;
    for (int i = start; i < end; i++) {
      printEvent(fh, &detachPool[i%detachPoolSize]);
    }
    free(detachPool);
  }

  if (fh) {
    fprintf(fh, "{}]\n");
    fclose(fh);
  }
  return ncclSuccess;
}
__hidden void updateEvent(void* handle);
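// Events are carved out of fixed-size pools used as ring buffers: each pool's
// Index counter is bumped for every event started and its Base counter for
// every event recycled. When Index - Base reaches the pool size the event is
// dropped and *eHandle stays NULL, which the other callbacks treat as
// "not profiled".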
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = NULL;
  struct context* ctx = (struct context *)context;
  if (eDescr->type == ncclProfileGroup) {
    struct group* event;
    int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
    if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
      // if there are available group events grab one
      event = &ctx->groupPool[groupId%groupPoolSize];
      while (!taskEventQueueEmpty(event)) {
        struct taskEventBase* base = taskEventQueueDequeue(event);
        if (base->type == ncclProfileColl) {
          struct collective* c = (struct collective *)base;
          // reset event proxyOps & proxySteps
          memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
          // release collective events in the group and return them to the collective pool
          __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
        } else if (base->type == ncclProfileP2p) {
          struct p2p* p = (struct p2p *)base;
          // reset event proxyOp and proxySteps
          memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
          // release p2p events in the group and return them to the p2p pool
          __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
        }
      }
    } else {
      // else drop this event
      __atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
      return ncclSuccess;
    }
    event->type = ncclProfileGroup;
    event->ctx = ctx;
    event->groupId = groupId;
    event->startTs = gettime() - startTime;
    *eHandle = event;
    debugEvent(event, "GroupStart");
  } else if (eDescr->type == ncclProfileColl) {
    // the parent might be null if we run out of events
    struct group* parent = (struct group *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;
    struct collective* event;
    int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
    if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) {
      // if there are available collective events grab one
      event = &ctx->collPool[collId%collPoolSize];
    } else {
      // else drop this event
      __atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
      return ncclSuccess;
    }
    event->base.type = ncclProfileColl;
    event->base.rank = eDescr->rank;
    event->base.func = eDescr->coll.func;
    event->base.startTs = gettime() - startTime;
    event->base.parent = parent;
    event->seqNumber = eDescr->coll.seqNumber;
    event->sendBuff = eDescr->coll.sendBuff;
    event->recvBuff = eDescr->coll.recvBuff;
    event->count = eDescr->coll.count;
    event->root = eDescr->coll.root;
    event->datatype = eDescr->coll.datatype;
    event->nChannels = eDescr->coll.nChannels;
    event->nWarps = eDescr->coll.nWarps;
    event->algo = eDescr->coll.algo;
    event->proto = eDescr->coll.proto;
    *eHandle = event;
    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
    // increment the group ref counter so the event will stay open
    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
    debugEvent(event, "CollStart");
  } else if (eDescr->type == ncclProfileP2p) {
    // the parent might be null if we run out of events
    struct group* parent = (struct group *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;
    struct p2p* event;
    int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
    if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) {
      // if there are available p2p events grab one
      event = &ctx->p2pPool[p2pId%p2pPoolSize];
    } else {
      // else drop this event
      __atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
      return ncclSuccess;
    }
    event->base.type = ncclProfileP2p;
    event->base.rank = eDescr->rank;
    event->base.func = eDescr->p2p.func;
    event->base.next = parent->eventHead;
    event->base.startTs = gettime() - startTime;
    event->base.parent = parent;
    event->buff = eDescr->p2p.buff;
    event->count = eDescr->p2p.count;
    event->datatype = eDescr->p2p.datatype;
    event->peer = eDescr->p2p.peer;
    event->nChannels = eDescr->p2p.nChannels;
    *eHandle = event;
    // increment the group ref counter so the event will stay open
    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
    debugEvent(event, "P2pStart");
  } else if (eDescr->type == ncclProfileProxyCtrl) {
    int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED);
    struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize];
    event->type = ncclProfileProxyCtrl;
    event->ctx = ctx;
    event->startTs = gettime() - startTime;
    *eHandle = event;
  } else if (eDescr->type == ncclProfileProxyOp) {
    // the eventBase might be null if we run out of events
    struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
    if (eventBase == NULL) return ncclSuccess;
    if (eDescr->proxyOp.pid != pid) {
      // PXN captured proxyOp events
      struct proxyOp* event;
      int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED);
      if ((detachId - detachPoolBase) < detachPoolSize) {
        // if there are available detached proxyOp events grab one
        event = &detachPool[detachId%detachPoolSize];
      } else {
        // else drop this event
        __atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED);
        return ncclSuccess;
      }
      event->type = ncclProfileProxyOp;
      event->channelId = eDescr->proxyOp.channelId;
      event->pid = eDescr->proxyOp.pid;
      event->rank = eDescr->rank;
      event->peer = eDescr->proxyOp.peer;
      event->nSteps = eDescr->proxyOp.nSteps;
      event->chunkSize = eDescr->proxyOp.chunkSize;
      event->isSend = eDescr->proxyOp.isSend;
      event->startTs = gettime() - startTime;
      event->parent = NULL;
      event->stepCount = 0;
      *eHandle = event;
      debugEvent(event, "PxnProxyOpStart");
      return ncclSuccess;
    }
    if (eventBase->type == ncclProfileColl) {
      struct collective* parent = (struct collective *)eDescr->parentObj;
      int channelId = eDescr->proxyOp.channelId;
      struct proxyOp* event = &parent->op[channelId][parent->nProxyOps[channelId]++];
      event->type = ncclProfileProxyOp;
      event->channelId = channelId;
      event->pid = eDescr->proxyOp.pid;
      event->rank = eDescr->rank;
      event->peer = eDescr->proxyOp.peer;
      event->nSteps = eDescr->proxyOp.nSteps;
      event->chunkSize = eDescr->proxyOp.chunkSize;
      event->isSend = eDescr->proxyOp.isSend;
      event->parent = eventBase;
      event->startTs = gettime() - startTime;
      event->stepCount = 0;
      *eHandle = event;
      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "ProxyOpStart");
    } else { // ncclProfileP2p
      struct p2p* parent = (struct p2p *)eDescr->parentObj;
      int channelId = eDescr->proxyOp.channelId;
      struct proxyOp* event = &parent->op[channelId];
      event->type = ncclProfileProxyOp;
      event->channelId = channelId;
      event->pid = eDescr->proxyOp.pid;
      event->rank = eDescr->rank;
      event->peer = eDescr->proxyOp.peer;
      event->nSteps = eDescr->proxyOp.nSteps;
      event->chunkSize = eDescr->proxyOp.chunkSize;
      event->isSend = eDescr->proxyOp.isSend;
      event->parent = eventBase;
      event->startTs = gettime() - startTime;
      event->stepCount = 0;
      *eHandle = event;
      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "ProxyOpStart");
    }
  } else if (eDescr->type == ncclProfileProxyStep) {
    // the parent might be null if we run out of events
    struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;
    int s = parent->stepCount++ % MAX_STEPS;
    struct proxyStep* event = &parent->step[s];
    event->type = ncclProfileProxyStep;
    event->state = 0;
    event->step = eDescr->proxyStep.step;
    event->parent = parent;
    event->isSend = parent->isSend;
    event->startTs = gettime() - startTime;
    event->nNetEvents = 0;
    *eHandle = event;
    debugEvent(event, "ProxyStepStart");
  } else if (eDescr->type == ncclProfileKernelCh) {
    struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
    if (eventBase == NULL) return ncclSuccess;
    if (eventBase->type == ncclProfileColl) {
      struct collective* parent = (struct collective *)eDescr->parentObj;
      struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
      event->type = ncclProfileKernelCh;
      event->channelId = eDescr->kernelCh.channelId;
      event->startGpuClk = eDescr->kernelCh.pTimer;
      event->parent = eventBase;
      event->startTs = gettime() - startTime;
      *eHandle = event;
      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "KernelChStart");
    } else { // ncclProfileP2p
      struct p2p* parent = (struct p2p *)eDescr->parentObj;
      struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
      event->type = ncclProfileKernelCh;
      event->channelId = eDescr->kernelCh.channelId;
      event->startGpuClk = eDescr->kernelCh.pTimer;
      event->parent = eventBase;
      event->startTs = gettime() - startTime;
      *eHandle = event;
      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "KernelChStart");
    }
  } else if (eDescr->type == ncclProfileNetPlugin) {
    struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;
    int64_t pluginId = eDescr->netPlugin.id;
    int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
    int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
    if (type == NCCL_PROFILER_NET_TYPE_IB) {
      if (ver == 1) {
        ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
        struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
        event->type = ncclProfileNetPlugin;
        event->pluginType = type;
        event->pluginVer = ver;
        if (descr->type == ncclProfileQp) {
          event->pluginEvent = ncclProfileQp;
          event->qp.device = descr->qp.device;
          event->qp.wr_id = descr->qp.wr_id;
          event->qp.opcode = descr->qp.opcode;
          event->qp.qpNum = descr->qp.qpNum;
          event->qp.length = descr->qp.length;
        }
        event->startTs = gettime() - startTime;
        *eHandle = event;
        debugEvent(event, "NetPluginStart");
      }
    } else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
      if (ver == 1) {
        ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
        struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
        event->type = ncclProfileNetPlugin;
        event->pluginType = type;
        event->pluginVer = ver;
        if (descr->type == ncclProfileSocket) {
          event->pluginEvent = ncclProfileSocket;
          event->sock.fd = descr->sock.fd;
          event->sock.op = descr->sock.op;
          event->sock.length = descr->sock.length;
        }
        event->startTs = gettime() - startTime;
        *eHandle = event;
        debugEvent(event, "NetPluginStart");
      }
    }
  }
  return ncclSuccess;
}
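// updateEvent drops one reference from an event: the last release records the
// stop timestamp and propagates completion to the parent event, returning the
// group slot to its pool. Detached (PXN) proxyOps instead recycle the shared
// detach pool once detachPoolSize of them have completed.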
void updateEvent(void* handle) {
  uint8_t type = *(uint8_t *)handle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)handle;
    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->stopTs = gettime() - startTime;
      // return group event to the pool
      __atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
    }
    debugEvent(event, "GroupStop");
  } else if (type == ncclProfileColl) {
    struct collective* event = (struct collective *)handle;
    if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->base.stopTs = gettime() - startTime;
      debugEvent(event, "CollStop");
      updateEvent(event->base.parent);
      return;
    }
    debugEvent(event, "CollStop");
  } else if (type == ncclProfileP2p) {
    struct p2p* event = (struct p2p *)handle;
    if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->base.stopTs = gettime() - startTime;
      debugEvent(event, "P2pStop");
      updateEvent(event->base.parent);
      return;
    }
    debugEvent(event, "P2pStop");
  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)handle;
    event->stopTs = gettime() - startTime;
    if (event->pid != pid) {
      // only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
      int done = __atomic_add_fetch(&detachPoolDone, 1, __ATOMIC_RELAXED);
      if (done == detachPoolSize) {
        // reset the event completed (done) counter
        __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
        // update the base pointer to the top of the pool
        int index = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED);
        __atomic_store_n(&detachPoolBase, index, __ATOMIC_RELAXED);
      }
      debugEvent(event, "ProxyOpStop");
      return;
    }
    updateEvent(event->parent);
    debugEvent(event, "ProxyOpStop");
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* event = (struct proxyStep *)handle;
    event->stopTs = gettime() - startTime;
    debugEvent(event, "ProxyStepStop");
  } else if (type == ncclProfileProxyCtrl) {
    struct proxyCtrl* event = (struct proxyCtrl *)handle;
    event->stopTs = gettime() - startTime;
    debugEvent(event, "ProxyCtrlStop");
  } else if (type == ncclProfileKernelCh) {
    struct kernelCh* event = (struct kernelCh *)handle;
    event->stopTs = gettime() - startTime;
    updateEvent(event->parent);
    debugEvent(event, "KernelChStop");
  } else if (type == ncclProfileNetPlugin) {
    struct netPlugin* event = (struct netPlugin *)handle;
    event->stopTs = gettime() - startTime;
    debugEvent(event, "NetPluginStop");
  }
}
__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;
  uint8_t type = *(uint8_t *)eHandle;
  if (type == ncclProfileGroup) {
    // stopping the group event in NCCL core does not
    // mean the group has completed. It means the group
    // was submitted/enqueued so we need to keep the event open
    struct group* event = (struct group *)eHandle;
    event->stopTs = gettime() - startTime;
    return ncclSuccess;
  } else if (type == ncclProfileColl) {
    // stopping the collective event in NCCL core does not
    // mean the collective has completed. It means the collective
    // was submitted/enqueued so we need to keep the event open
    struct collective* event = (struct collective *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
  } else if (type == ncclProfileP2p) {
    // stopping the p2p event in NCCL core does not
    // mean the p2p has completed. It means the p2p
    // was submitted/enqueued so we need to keep the event open
    struct p2p* event = (struct p2p *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
  }
  updateEvent(eHandle);
  return ncclSuccess;
}
__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;
  uint8_t type = *(uint8_t *)eHandle;
  if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    if (eState == ncclProfilerProxyOpInProgress_v4) {
      event->progrTs = gettime() - startTime;
    }
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* event = (struct proxyStep *)eHandle;
    struct proxyOp* parent = event->parent;
    switch (eState) {
      case ncclProfilerProxyStepSendGPUWait:
        event->timestamp[PROXY_STEP_SEND_GPU_WAIT] = gettime() - startTime;
        break;
      case ncclProfilerProxyStepSendPeerWait_v4:
        // do not update step event if in SendPeerWait
        if (event->state == ncclProfilerProxyStepSendPeerWait_v4) break;
        event->timestamp[PROXY_STEP_SEND_PEER_WAIT] = gettime() - startTime;
        event->state = ncclProfilerProxyStepSendPeerWait_v4;
        break;
      case ncclProfilerProxyStepSendWait:
        event->timestamp[PROXY_STEP_SEND_WAIT] = gettime() - startTime;
        parent->transSize += eStateArgs->proxyStep.transSize;
        break;
      case ncclProfilerProxyStepRecvWait:
        event->timestamp[PROXY_STEP_RECV_WAIT] = gettime() - startTime;
        break;
      case ncclProfilerProxyStepRecvFlushWait:
        event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT] = gettime() - startTime;
        parent->transSize += eStateArgs->proxyStep.transSize;
        break;
      case ncclProfilerProxyStepRecvGPUWait:
        event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
        break;
    }
  } else if (type == ncclProfileProxyCtrl) {
    struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
    if (eState == ncclProfilerProxyCtrlAppendEnd) {
      event->appended = eStateArgs->proxyCtrl.appendedProxyOps;
    }
    event->state = eState;
  } else if (type == ncclProfileKernelCh) {
    struct kernelCh* event = (struct kernelCh *)eHandle;
    if (eState == ncclProfilerKernelChStop) {
      event->stopGpuClk = eStateArgs->kernelCh.pTimer;
    }
  }
  debugEvent(eHandle, "RecordEventState");
  return ncclSuccess;
}
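// Profiler descriptor resolved by NCCL when the plugin is loaded (v4 of the
// profiler plugin interface).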
ncclProfiler_t ncclProfiler_v4 = {
  "Example-profiler",
  exampleProfilerInit,
  exampleProfilerStartEvent,
  exampleProfilerStopEvent,
  exampleProfilerRecordEventState,
  exampleProfilerFinalize,
};
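// Optional helpers an application can resolve from the plugin (e.g., via
// dlsym) to toggle profiling at runtime by rewriting the activation mask
// that NCCL polls.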
int exampleProfilerStart(int eActivationMask) {
  if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
    __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
  }
  return ncclSuccess;
}

int exampleProfilerStop(void) {
  if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
    __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
  }
  return ncclSuccess;
}