NCCL 2.27.5-1

Improvements for GB200 systems
 * Optimize the network performance by alternating the direction of the rings and the NIC to GPU assignment across communicators to limit unnecessary sharing.
 * Fix the detection of C2C links in case GPU Direct RDMA is disabled between a GPU and a NIC.
 * Fix PXN support on MNNVL systems, where NCCL would try (and fail) to share regular host memory across multiple nodes.
 * Fix P2C (PXN over C2C), which is now preferred over regular PXN. This support is currently preliminary and is disabled by default; use NCCL_PXN_C2C=1 to enable.

Further reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs.

Optimize the network performance on DGX B200 systems by adjusting the bandwidths provided to the graph search algorithm.

Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8.

Restore the plugin name handling logic to make it possible to specify a path to the plugin (Issue #1732).

Restore the ability to change NCCL_COLLNET_ENABLE during execution (Issue #1741).

Add an example tuner plugin with CSV-based overrides.

Remove an x86 dependency from the example profiler.
NCCL 2.27.3-1
2025-06-18 10:34:47 -07:00 · 2025-05-29 20:56:40 -07:00 · 2025-05-20 04:04:41 -07:00 · 2025-04-22 13:55:13 -07:00 · 2025-04-22 13:50:40 -07:00 · 2025-04-13 23:56:46 -07:00
324 changed files with 80639 additions and 7238 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@
 # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 /build
+*.gcov
+/coverage/
--- a/LICENSE.txt
+++ b/LICENSE.txt
@ -1,5 +1,5 @@

- Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
@ -29,3 +29,11 @@
 The U.S. Department of Energy funded the development of this software
 under subcontract 7078610 with Lawrence Berkeley National Laboratory.

+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
--- a/Makefile
+++ b/Makefile
@ -1,211 +1,31 @@
 #
-# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
-# See LICENCE.txt for license information
+# See LICENSE.txt for license information
 #
+.PHONY : all clean

-CUDA_HOME ?= /usr/local/cuda
-PREFIX ?= /usr/local
-VERBOSE ?= 0
-KEEP ?= 0
-DEBUG ?= 0
-PROFAPI ?= 0
-BUILDDIR ?= build
+default : src.build
+install : src.install
+BUILDDIR ?= $(abspath ./build)
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := src pkg
+clean: ${TARGETS:%=%.clean}
+test.build: src.build
+LICENSE_FILES := LICENSE.txt
+LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
+lic: $(LICENSE_TARGETS)

-CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-NVCC ?= $(CUDA_HOME)/bin/nvcc
+${BUILDDIR}/%.txt: %.txt
+	@printf "Copying    %-35s > %s\n" $< $@
+	mkdir -p ${BUILDDIR}
+	cp $< $@

-NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_52,code=sm_52 \
-                -gencode=arch=compute_52,code=compute_52
+src.%:
+	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}

-CXXFLAGS   := -I$(CUDA_INC) -fPIC -fvisibility=hidden 
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
-# Use addprefix so that we can specify more than one path
-LDFLAGS    := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
-
-ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3
-CXXFLAGS  += -O3
-else
-NVCUFLAGS += -O0 -G
-CXXFLAGS  += -O0 -g -ggdb3
-endif
-
-ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
-CXXFLAGS  += -Wall -Wextra
-else
-.SILENT:
-endif
-
-ifneq ($(KEEP), 0)
-NVCUFLAGS += -keep
-endif
-
-ifneq ($(PROFAPI), 0)
-CXXFLAGS += -DPROFAPI
-endif
-
-NCCL_MAJOR   := 1
-NCCL_MINOR   := 3
-NCCL_PATCH   := 0
-CXXFLAGS  += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
-
-CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
-CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
-CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
-CXXFLAGS  += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
-
-.PHONY : lib clean test mpitest install deb debian debclean
-.DEFAULT : lib
-
-INCEXPORTS  := nccl.h
-LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
-LIBNAME     := libnccl.so
-
-INCDIR := $(BUILDDIR)/include
-LIBDIR := $(BUILDDIR)/lib
-OBJDIR := $(BUILDDIR)/obj
-
-INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
-LIBSONAME  := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
-LIBTARGET  := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
-LIBLINK    := $(patsubst lib%.so, -l%, $(LIBNAME))
-LIBOBJ     := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
-DEPFILES   := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
-
-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-
-include $(DEPFILES)
-
-$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
-	@printf "Linking   %-25s\n" $@
-	mkdir -p $(LIBDIR)
-	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
-	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
-	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-
-$(INCDIR)/%.h : src/%.h
-	@printf "Grabbing  %-25s > %-25s\n" $< $@
-	mkdir -p $(INCDIR)
-	cp -f $< $@
-
-$(OBJDIR)/%.o : src/%.cu
-	@printf "Compiling %-25s > %-25s\n" $< $@
-	mkdir -p $(OBJDIR)
-	$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
-	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
-	@rm -f $(@:%.o=%.d.tmp)
-
-clean :
-	rm -rf $(BUILDDIR)
-
-install : lib
-	mkdir -p $(PREFIX)/lib
-	mkdir -p $(PREFIX)/include
-	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
-	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
-
-
-#### TESTS ####
-
-TEST_ONLY ?= 0
-
-# Tests depend on lib, except in TEST_ONLY mode.
-ifeq ($(TEST_ONLY), 0)
-TSTDEP = $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-endif
-
-NCCL_LIB ?= $(LIBDIR)
-NCCL_INC ?= $(INCDIR)
-
-MPI_HOME ?= /usr
-MPI_INC ?= $(MPI_HOME)/include
-MPI_LIB ?= $(MPI_HOME)/lib
-MPIFLAGS   := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
-
-TESTS       := all_gather_test     all_gather_scan \
-               all_reduce_test     all_reduce_scan \
-               broadcast_test      broadcast_scan \
-               reduce_test         reduce_scan \
-               reduce_scatter_test reduce_scatter_scan
-MPITESTS    := mpi_test
-
-TSTINC     := -I$(NCCL_INC) -Itest/include
-TSTLIB     := -L$(NCCL_LIB) $(LIBLINK) $(LDFLAGS)
-TSTDIR     := $(BUILDDIR)/test/single
-MPITSTDIR  := $(BUILDDIR)/test/mpi
-TESTBINS   := $(patsubst %, $(TSTDIR)/%, $(TESTS))
-MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
-
-test : $(TESTBINS)
-
-$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP) 
-	@printf "Building  %-25s > %-24s\n" $< $@
-	mkdir -p $(TSTDIR)
-	$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
-	@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
-	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
-	@rm -f $(@:%=%.d.tmp)
-
-mpitest : $(MPITESTBINS)
-
-$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP) 
-	@printf "Building  %-25s > %-24s\n" $< $@
-	mkdir -p $(MPITSTDIR)
-	$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
-	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
-	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
-	@rm -f $(@:%=%.d.tmp)
-
-#### PACKAGING ####
-
-DEBIANDIR  := $(BUILDDIR)/debian
-
-DEBGEN_IN  := $(shell (cd debian ; ls *.in))
-DEBGEN     := $(DEBGEN_IN:.in=)
-DEBFILES   := compat copyright libnccl-dev.install libnccl-dev.manpages nccl.7 rules $(DEBGEN)
-DEBTARGETS := $(patsubst %, $(DEBIANDIR)/%, $(DEBFILES))
-
-DEB_REVISION   ?= 1
-DEB_TIMESTAMP  := $(shell date -R)
-DEB_ARCH       ?= amd64
-
-debian : $(DEBTARGETS)
-
-deb : lib debian
-	@printf "Building Debian package\n"
-	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
-	mkdir -p $(BUILDDIR)/deb/
-	mv $(BUILDDIR)/../libnccl*.deb $(BUILDDIR)/deb/
-
-debclean :
-	rm -Rf $(DEBIANDIR)
-
-$(DEBIANDIR)/% : debian/%.in
-	@printf "Generating %-25s > %-24s\n" $< $@
-	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-	    -e "s/\$${deb:Revision}/$(DEB_REVISION)/g" \
-	    -e "s/\$${deb:Timestamp}/$(DEB_TIMESTAMP)/g" \
-	    -e "s/\$${deb:Arch}/$(DEB_ARCH)/g" \
-	    $< > $@
-
-$(DEBIANDIR)/% : debian/%
-	@printf "Grabbing  %-25s > %-25s\n" $< $@
-	mkdir -p $(DEBIANDIR)
-	cp -f $< $@
+pkg.%:
+	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}

+pkg.debian.prep: lic
+pkg.txz.prep: lic
--- a/README.md
+++ b/README.md
@ -1,122 +1,76 @@
 # NCCL

-Optimized primitives for collective multi-GPU communication.
+Optimized primitives for inter-GPU communication.

 ## Introduction

-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
-[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.
+NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.

-## What's inside
+For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).

-At present, the library implements the following collectives:
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
+## Build

-These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
+Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.

-## Requirements
-
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-
-Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
-
-## Build & run
-
-To build the library and tests.
+To build the library :

 ```shell
 $ cd nccl
-$ make CUDA_HOME=<cuda install path> test
+$ make -j src.build
 ```

-Test binaries are located in the subdirectories nccl/build/test/{single,mpi}.
+If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :

 ```shell
-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./build/lib
-$ ./build/test/single/all_reduce_test
-Error: must specify at least data size in bytes!
-
-Tests nccl AllReduce with user supplied arguments.
-    Usage: all_reduce_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
-
-$ ./build/test/single/all_reduce_test 10000000
-# Using devices
-#   Device  0 ->  0 [0x0a] GeForce GTX TITAN X
-#   Device  1 ->  1 [0x09] GeForce GTX TITAN X
-#   Device  2 ->  2 [0x06] GeForce GTX TITAN X
-#   Device  3 ->  3 [0x05] GeForce GTX TITAN X
-
-#                                                 out-of-place                    in-place
-#      bytes             N    type      op     time  algbw  busbw      res     time  algbw  busbw      res
-    10000000      10000000    char     sum    1.628   6.14   9.21    0e+00    1.932   5.18   7.77    0e+00
-    10000000      10000000    char    prod    1.629   6.14   9.21    0e+00    1.643   6.09   9.13    0e+00
-    10000000      10000000    char     max    1.621   6.17   9.25    0e+00    1.634   6.12   9.18    0e+00
-    10000000      10000000    char     min    1.633   6.12   9.19    0e+00    1.637   6.11   9.17    0e+00
-    10000000       2500000     int     sum    1.611   6.21   9.31    0e+00    1.626   6.15   9.23    0e+00
-    10000000       2500000     int    prod    1.613   6.20   9.30    0e+00    1.629   6.14   9.21    0e+00
-    10000000       2500000     int     max    1.619   6.18   9.26    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000     int     min    1.619   6.18   9.27    0e+00    1.624   6.16   9.24    0e+00
-    10000000       5000000    half     sum    1.617   6.18   9.28    4e-03    1.636   6.11   9.17    4e-03
-    10000000       5000000    half    prod    1.618   6.18   9.27    1e-03    1.657   6.03   9.05    1e-03
-    10000000       5000000    half     max    1.608   6.22   9.33    0e+00    1.621   6.17   9.25    0e+00
-    10000000       5000000    half     min    1.610   6.21   9.32    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000   float     sum    1.618   6.18   9.27    5e-07    1.622   6.17   9.25    5e-07
-    10000000       2500000   float    prod    1.614   6.20   9.29    1e-07    1.628   6.14   9.21    1e-07
-    10000000       2500000   float     max    1.616   6.19   9.28    0e+00    1.633   6.12   9.19    0e+00
-    10000000       2500000   float     min    1.613   6.20   9.30    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double     sum    1.629   6.14   9.21    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double    prod    1.619   6.18   9.26    2e-16    1.628   6.14   9.21    2e-16
-    10000000       1250000  double     max    1.613   6.20   9.30    0e+00    1.630   6.13   9.20    0e+00
-    10000000       1250000  double     min    1.622   6.16   9.25    0e+00    1.623   6.16   9.24    0e+00
+$ make src.build CUDA_HOME=<path to cuda install>
 ```

-To install, run `make PREFIX=<install dir> install` and add `<instal dir>/lib` to your `LD_LIBRARY_PATH`.
+NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.

-## Usage
-
-NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single-process machine, all GPUs can be conveniently initialized using `ncclCommInitAll`. For multi-process applications (e.g., with MPI), `ncclCommInitRank` must be called for each GPU. Internally `ncclCommInitRank` invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single-process example follows, for an MPI example see test/mpi/mpi_test.cu. For details about the API see nccl.h.
-
-```c
-#include <nccl.h>
-
-typedef struct {
-  double* sendBuff;
-  double* recvBuff;
-  int size;
-  cudaStream_t stream;
-} PerThreadData;
-
-int main(int argc, char* argv[])
-{
-  int nGPUs;
-  cudaGetDeviceCount(&nGPUs);
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nGPUs);
-  ncclCommInitAll(comms, nGPUs); // initialize communicator
-                                // One communicator per process
-
-  PerThreadData* data;
-
-  ... // Allocate data and issue work to each GPU's
-      // perDevStream to populate the sendBuffs.
-
-  for(int i=0; i<nGPUs; ++i) {
-    cudaSetDevice(i); // Correct device must be set
-                      // prior to each collective call.
-    ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
-        ncclDouble, ncclSum, comms[i], data[i].stream);
-  }
-
-  ... // Issue work into data[*].stream to consume buffers, etc.
-}
+By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
+```shell
+$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
 ```

-## Copyright and License
+## Install

-NCCL is provided under the [BSD licence](LICENSE.txt). All source code and
-accompanying documentation is copyright (c) 2015-2016, NVIDIA CORPORATION. All
-rights reserved.
+To install NCCL on the system, create a package then install it as root.

+Debian/Ubuntu :
+```shell
+$ # Install tools to create debian packages
+$ sudo apt install build-essential devscripts debhelper fakeroot
+$ # Build NCCL deb package
+$ make pkg.debian.build
+$ ls build/pkg/deb/
+```
+
+RedHat/CentOS :
+```shell
+$ # Install tools to create rpm packages
+$ sudo yum install rpm-build rpmdevtools
+$ # Build NCCL rpm package
+$ make pkg.redhat.build
+$ ls build/pkg/rpm/
+```
+
+OS-agnostic tarball :
+```shell
+$ make pkg.txz.build
+$ ls build/pkg/txz/
+```
+
+## Tests
+
+Tests for NCCL are maintained separately at https://github.com/NVIDIA/nccl-tests.
+
+```shell
+$ git clone https://github.com/NVIDIA/nccl-tests.git
+$ cd nccl-tests
+$ make
+$ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
+```
+
+## Copyright
+
+All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
--- a/debian/changelog.in
+++ b/debian/changelog.in
@ -1,5 +0,0 @@
-nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}-${deb:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
-
-  * Automatic Debian package from build
-
- -- cudatools <cudatools@nvidia.com>  ${deb:Timestamp}
--- a/debian/control.in
+++ b/debian/control.in
@ -1,28 +0,0 @@
-Source: nccl
-Section: libs
-Maintainer: cudatools <cudatools@nvidia.com>
-Priority: optional
-Build-depends: debhelper(>=9)
-Standards-Version: 3.9.5
-
-Package: libnccl${nccl:Major}
-Section: libs
-Architecture: ${deb:Arch}
-Depends: ${misc:Depends}, ${shlibs:Depends}
-Description: NVIDIA Collectives Communication Library (NCCL) Runtime
- NCCL (pronounced "Nickel") is a stand-alone library of standard collective
- communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
- that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
- to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
- applications.
-
-Package: libnccl-dev
-Section: libdevel
-Architecture: ${deb:Arch}
-Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
-Description: NVIDIA Collectives Communication Library (NCCL) Development Files
- NCCL (pronounced "Nickel") is a stand-alone library of standard collective
- communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
- that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
- to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
- applications.
--- a/debian/copyright
+++ b/debian/copyright
@ -1 +0,0 @@
-../LICENSE.txt
--- a/debian/libnccl-dev.install
+++ b/debian/libnccl-dev.install
@ -1,2 +0,0 @@
-include/nccl.h usr/include
-lib/libnccl.so /usr/lib/x86_64-linux-gnu
--- a/debian/libnccl-dev.manpages
+++ b/debian/libnccl-dev.manpages
@ -1 +0,0 @@
-debian/nccl.7
--- a/debian/libnccl1.install.in
+++ b/debian/libnccl1.install.in
@ -1,2 +0,0 @@
-lib/libnccl.so.${nccl:Major} /usr/lib/x86_64-linux-gnu
-lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/x86_64-linux-gnu
--- a/debian/nccl.7
+++ b/debian/nccl.7
@ -1,139 +0,0 @@
-.TH NCCL
-.SH NAME
-.PP
-nccl \- Optimized primitives for collective multi\-GPU communication.
-
-.SH Introduction
-.PP
-NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications.
-
-.SH What's inside
-.PP
-At present, the library implements the following collectives:
-\- all\-reduce
-\- all\-gather
-\- reduce\-scatter
-\- reduce
-\- broadcast
-
-.PP
-These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
-
-.SH Requirements
-.PP
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported.
-
-.PP
-Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
-
-.SH Build & run
-.PP
-To build the library and tests.
-
-.PP
-.RS
-
-.nf
-$ cd nccl
-$ make CUDA\_HOME=<cuda install path> test
-
-.fi
-.RE
-
-.PP
-Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest.
-
-.PP
-.RS
-
-.nf
-$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib
-$ ./build/test/all\_reduce\_test
-Error: must specify at least data size in bytes!
-
-Tests nccl AllReduce with user supplied arguments.
-    Usage: all\_reduce\_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
-
-$ ./build/test/all\_reduce\_test 10000000
-# Using devices
-#   Device  0 \->  0 [0x0a] GeForce GTX TITAN X
-#   Device  1 \->  1 [0x09] GeForce GTX TITAN X
-#   Device  2 \->  2 [0x06] GeForce GTX TITAN X
-#   Device  3 \->  3 [0x05] GeForce GTX TITAN X
-
-#                                                 out\-of\-place                    in\-place
-#      bytes             N    type      op     time  algbw  busbw      res     time  algbw  busbw      res
-    10000000      10000000    char     sum    1.628   6.14   9.21    0e+00    1.932   5.18   7.77    0e+00
-    10000000      10000000    char    prod    1.629   6.14   9.21    0e+00    1.643   6.09   9.13    0e+00
-    10000000      10000000    char     max    1.621   6.17   9.25    0e+00    1.634   6.12   9.18    0e+00
-    10000000      10000000    char     min    1.633   6.12   9.19    0e+00    1.637   6.11   9.17    0e+00
-    10000000       2500000     int     sum    1.611   6.21   9.31    0e+00    1.626   6.15   9.23    0e+00
-    10000000       2500000     int    prod    1.613   6.20   9.30    0e+00    1.629   6.14   9.21    0e+00
-    10000000       2500000     int     max    1.619   6.18   9.26    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000     int     min    1.619   6.18   9.27    0e+00    1.624   6.16   9.24    0e+00
-    10000000       5000000    half     sum    1.617   6.18   9.28    4e\-03    1.636   6.11   9.17    4e\-03
-    10000000       5000000    half    prod    1.618   6.18   9.27    1e\-03    1.657   6.03   9.05    1e\-03
-    10000000       5000000    half     max    1.608   6.22   9.33    0e+00    1.621   6.17   9.25    0e+00
-    10000000       5000000    half     min    1.610   6.21   9.32    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000   float     sum    1.618   6.18   9.27    5e\-07    1.622   6.17   9.25    5e\-07
-    10000000       2500000   float    prod    1.614   6.20   9.29    1e\-07    1.628   6.14   9.21    1e\-07
-    10000000       2500000   float     max    1.616   6.19   9.28    0e+00    1.633   6.12   9.19    0e+00
-    10000000       2500000   float     min    1.613   6.20   9.30    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double     sum    1.629   6.14   9.21    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double    prod    1.619   6.18   9.26    2e\-16    1.628   6.14   9.21    2e\-16
-    10000000       1250000  double     max    1.613   6.20   9.30    0e+00    1.630   6.13   9.20    0e+00
-    10000000       1250000  double     min    1.622   6.16   9.25    0e+00    1.623   6.16   9.24    0e+00
-
-.fi
-.RE
-
-.PP
-To install, run \fB\fCmake PREFIX=<install dir> install\fR and add \fB\fC<instal dir>/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR.
-
-.SH Usage
-.PP
-NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally \fB\fCncclCommInitRank\fR invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single\-process example follows, for an MPI example see src/mpi\_test.cu. For details about the API see nccl.h.
-
-.PP
-.RS
-
-.nf
-#include <nccl.h>
-
-typedef struct \{
-  double* sendBuff;
-  double* recvBuff;
-  int size;
-  cudaStream\_t stream;
-\} PerThreadData;
-
-int main(int argc, char* argv[])
-\{
-  int nGPUs;
-  cudaGetDeviceCount(\&nGPUs);
-  ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs);
-  ncclCommInitAll(comms, nGPUs); // initialize communicator
-                                // One communicator per process
-
-  PerThreadData* data;
-
-  ... // Allocate data and issue work to each GPU's
-      // perDevStream to populate the sendBuffs.
-
-  for(int i=0; i<nGPUs; ++i) \{
-    cudaSetDevice(i); // Correct device must be set
-                      // prior to each collective call.
-    ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
-        ncclDouble, ncclSum, comms[i], data[i].stream);
-  \}
-
-  ... // Issue work into data[*].stream to consume buffers, etc.
-\}
-
-.fi
-.RE
-
-.SH Copyright
-.PP
-All source code and accompanying documentation is copyright (c) 2015\-2016, NVIDIA CORPORATION. All
-rights reserved.
--- a/debian/shlibs.local.in
+++ b/debian/shlibs.local.in
@ -1 +0,0 @@
-libcudart ${cuda:Major}.${cuda:Minor} cuda-cudart-${cuda:Major}-${cuda:Minor}
--- a/ext-net/README.md
+++ b/ext-net/README.md
@ -0,0 +1,419 @@
+# NCCL Net Plugin Documentation
+
+This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL.
+
+# Overview
+
+To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins
+implement the NCCL network API, and decouple NCCL binary builds which are built against a
+particular version of the GPU stack (i.e. CUDA) from the network code which is built against a
+particular version of the networking stack. That way, we can easily integrate any CUDA version
+with any network stack version.
+
+NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library
+contains one or more implementations of the NCCL NET API, in the form of versioned structs,
+filled with pointers to all required functions.
+
+# Plugin architecture
+
+## Plugin name and supporting multiple network plugins
+
+When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it,
+then look for symbols inside the library.
+
+The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
+will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore
+advised to name the library following that pattern, with a symlink pointing `libnccl-net.so`
+to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path,
+setting `NCCL_NET_PLUGIN` will allow users to select the right plugin.
+
+## Struct versioning
+
+Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing
+over time. The versioning ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide multiple of those symbols, implementing multiple versions
+of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL
+versions.
+
+Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
+for the latest ncclNet struct version, but also looking for older ones so that older plugins
+would still work.
+
+## In-network collective operations, a.k.a. collNet
+
+In addition to the ncclNet structure, network plugins can provide a collNet structure which
+implements in-network collective operations, if supported. That can be used by the NCCL collNet
+algorithm to accelerate inter-node reductions in allReduce.
+
+The collNet struct is a different, optional struct provided by the network plugin, but its
+versioning is tied to the ncclNet struct and many functions are common between the two to
+ease the implementation.
+
+## Headers management
+
+To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions
+they support to their internal includes. An example is shown in `ext-net/example/` where we keep
+all headers in the `nccl/` directory and provide thin layers to implement old versions on top
+of newer ones.
+
+The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
+from old API versions. It also provides error codes in `err.h`.
+
+# API (v10)
+
+Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
+
+```
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_t;
+```
+
+## Error codes
+
+All plugin functions use NCCL error codes as return values. `ncclSuccess` should be returned upon
+success.
+
+Otherwise, plugins can return one of the following:
+ - `ncclSystemError` is the most common error for network plugins, when a call to the linux kernel
+or a system library fails. This typically includes all network/hardware errors.
+ - `ncclInternalError` is returned when the NCCL core code is using the network plugin in an
+incorrect way, for example allocating more requests than it should, or passing an invalid argument
+to calls.
+ - `ncclInvalidUsage` should be returned when the error is most likely a user error. This can
+include misconfiguration, but also sizes mismatch.
+ - `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by
+the NCCL core layer.
+ - `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should
+not need to rely on CUDA, this should not be common.
+
+## Operation overview
+
+NCCL will call the `init` function first, then query the number of network devices with the
+`devices` function, getting each network device properties with `getProperties`.
+
+If NCCL wishes to initialize virtual devices, used in NIC fusion currently, it can call `makeVDevice`
+specifying a list of physical devices (the original devices listed from `devices`) it wishes to
+merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to null.
+
+To establish a connection between two network devices, NCCL will first call `listen` on the
+receiving side, pass the returned handle to the sender side of the connection, and call `connect`
+with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
+establishment.
+
+`connect` and `accept` can receive an optional `netDevComm` pointer from the caller, if the caller
+wishes to make use of device networking. This parameter may be ignored by the plugin if it does
+not support device-side networking.
+
+Once the connection is established, communication will be done using the functions `isend`,
+`irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
+all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
+
+In certain conditions, `iflush` will be called after a receive call completes to allow the network
+plugin to flush data and ensure the GPU will observe the newly written data.
+
+To close the connections NCCL will call `closeListen` to close the object returned by `listen`,
+`closeSend` to close the object returned by `connect` and `closeRecv` to close the object returned
+by `accept`.
+
+## API Functions
+
+### Initialization
+`name`
+
+The `name` field should point to a character string with the name of the network plugin. This will
+be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
+
+Note: setting `NCCL_NET=<plugin name>` will ensure a specific network implementation is used, with
+a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the
+`libnccl-net.so` library name to load.
+
+`init`
+
+As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function.
+This will allow the plugin to discover network devices and make sure they are usable. If the
+`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
+internal ones.
+
+To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
+function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
+the plugin code by adding the following definitions:
+
+```
+#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+```
+
+The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and
+record its own events with the NCCL profiler plugin.
+
+`devices`
+
+Once the plugin is initialized, NCCL will query the number of devices available. It should not
+be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init`
+function should not return `ncclSuccess`.
+
+`getProperties`
+
+Right after getting the number of devices, NCCL will query properties for each available network
+device. These properties are critical when multiple adapters are present to ensure NCCL uses each
+adapter in the most optimized way.
+
+The `name` is only used for logging.
+
+The `pciPath` is the base for all topology detection and should point to the PCI device directory
+in /sys. This is typically the directory pointed by `/sys/class/net/eth0/device` or
+`/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should
+be `NULL`.
+
+The `guid` field is used to determine when network adapters are connected to multiple PCI
+endpoints. For normal cases, it can be set to the device number. If multiple network devices have
+the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence
+it will not use the port multiple times.
+
+The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be
+set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin
+supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
+provide a `regMrDmaBuf` function.
+
+The `regIsGlobal` field allows NCCL to register buffers in advance using e.g. a loopback connection
+and later on, expect that another registration on a buffer contained within a previous registration
+will be nearly immediate, as the buffer is already known by the network adapter. A typical
+implementation would maintain a registration cache; the call to ncclCommRegister will create the
+initial entry in the cache using regMr() on a loopback connection. Any later call to NCCL
+operations will call regMr() again on the real connection, with the real buffer (could be at a
+different offset within the original buffer, with a smaller size, etc), then deregMr() right after.
+The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping
+on the network adapter.
+
+The `forceFlush` field can request the NCCL core to call flush for all transfers. By default,
+flushes are only called when the GPU architecture or PCI topology would not guarantee correct
+PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the
+completion paths use different PCI links and therefore need a call to flush() to guarantee
+ordering.
+
+The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
+important to ensure proper optimization of flows within the node.
+
+The `port` field indicates the port number. This is important again for topology detection and flow
+optimization within the node when a NIC with a single PCI connection is connected to the fabric
+with multiple ports.
+
+The `latency` field indicates the network latency in microseconds. This can be useful to improve
+the NCCL tuning and make sure NCCL switches from tree to ring at the right size.
+
+The `maxComms` field indicates the maximum number of connections we can create.
+
+The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
+receive).
+
+The `netDeviceType` indicates which type of device networking this plugin supports. The current supported
+options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`.
+
+The `netDeviceVersion` indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as a part of NCCL core's device code.
+
+The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for
+point-to-point and collective calls. This will tell the NCCL core to cut large operations into
+multiple smaller chunks if needed.
+
+`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device.
+
+### Connection establishment
+
+Connections are used in a unidirectional manner. There is therefore a sender side and a receiver
+side.
+
+`listen`
+
+To create a connection, NCCL will start by calling `listen` on the receiver side. This function
+takes a device number as input argument, and should return a local `listenComm` object, and a
+`handle` to pass to the other side, so that the sender side can connect to the receiver.
+
+The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
+
+This call should never block, but contrary to `connect` and `accept`, `listenComm` should never
+be `NULL` if the call succeeds.
+
+`connect`
+
+NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call
+`connect` on the sender side on a given device index `dev`, providing the `handle`. `connect`
+should not block either: if the connection cannot be established yet, it should set `sendComm` to
+`NULL` and return `ncclSuccess`. In that case, NCCL will call `connect` again until it succeeds.
+
+`accept`
+
+To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by
+the `listen` call previously. If the sender did not connect yet, `accept` should not block. It
+should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
+succeeds.
+
+The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
+This field can be used by the network plugin to specify the QoS level of the connection. By default,
+`trafficClass` is set to -1 but can be configured by the application during communicator initialization
+to select a plugin-supported QoS level.
+
+`closeListen`/`closeSend`/`closeRecv`
+
+Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
+`closeListen`/`closeSend`/`closeRecv` to free the associated resources.
+
+### Communication
+
+Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`.
+To support RDMA capabilities, buffer registration and flush functions are provided.
+
+To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL,
+then queried with `test`. Each `sendComm` or `recvComm` must be able to handle
+`NCCL_NET_MAX_REQUESTS` requests in parallel.
+
+Note: That value should be multiplied by the multi-receive capability of the plugin for the sender
+side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening
+in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each
+`sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.
+
+`regMr`
+
+Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for
+communication. It will provide a `sendComm` or `recvComm` as `comm` argument, then the buffer
+pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network
+supports CUDA pointers.
+
+The network plugin can use the output argument `mhandle` to keep any reference to that memory
+registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and
+`deregMr` calls.
+
+`regMrDmaBuf`
+
+If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf`
+instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.
+
+
+`deregMr`
+
+When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin
+free resources. This function is used to deregister handles returned by both `regMr` and
+`regMrDmaBuf`.
+
+`isend`
+
+Data will be sent through the connection using `isend`, passing the `sendComm` previously
+created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be
+used if the network supports multi-receive operations (see `irecv`) to distinguish between
+different sends matching the same multi-receive. Otherwise it can be set to 0.
+
+The `isend` operation returns a handle in the `request` argument for further calls to `test`. If
+the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
+`isend` again later.
+
+The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin
+to support network defined events.
+
+`irecv`
+
+To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
+`n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a
+single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles`
+arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend`
+operations is received into the right buffer.
+
+If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer,
+otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are
+handled by a single request handle.
+
+The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
+The contrary (receive size being lower than the send size) is an error, however.
+
+NCCL sets the request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using the
+LL or LL128 protocols. In these cases, NCCL polls on a flag embedded in the data to detect completion
+of the irecv and is resilient to redundant network writes. This allows the plugin to optimize request
+completions on such irecvs (for example, complete the request immediately). The plugin is still
+expected to set a valid request pointer on return which NCCL can poll to check for completion.
+
+The `pHandles` argument allows NCCL to pass an array of opaque handles that can be used by the
+network plugin to support network defined events.
+
+Note: for a given connection, send/receive operations should always match in the order they were
+posted. Tags provided for receive operations are only used to assign a given send operation to one
+of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
+matching on any receive operation posted.
+
+`test`
+
+After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles
+until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the
+real size sent or received, the latter being potentially lower than the size passed to `irecv`.
+
+In the case of a multi-receive, all receives will be considered as done as a single operation (the
+goal being to allow aggregation), hence they share a single request and a single `done` status.
+However, they can have different sizes, so when `done` is non-zero, the `sizes` array should
+contain the `n` sizes corresponding to the buffers passed to `irecv`.
+
+Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never
+call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`).
+
+`iflush`
+
+After a receive operation completes, if the operation was targeting GPU memory and received a
+non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure
+the GPU can read it right after without seeing stale data. This flush operation is decoupled from
+the `test` code to improve latency of `LL*` protocols, as those are capable of determining when
+data is valid or not.
+
+`iflush` returns a request which needs to be queried with `test` until it completes.
--- a/ext-net/example/Makefile
+++ b/ext-net/example/Makefile
@ -0,0 +1,22 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+.DEFAULT_GOAL := build
+.PHONY: build clean  # command targets: never satisfied by files named "build" or "clean"
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)
+SRC_FILES := $(wildcard *.c)
+
+build: ${BUILDDIR}/libnccl-net-example.so  # default goal: .DEFAULT_GOAL is a variable and must be assigned, not written as a rule
+
+${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES}  # links every *.c in this directory into one shared object
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl -fPIC -shared -o $@ $^
+
+clean:
+	rm -f ${BUILDDIR}/libnccl-net-example.so
--- a/ext-net/example/nccl/common.h
+++ b/ext-net/example/nccl/common.h
@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+#include <stdint.h>
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; // Log level passed as `level` to ncclDebugLogger_t
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; // One distinct bit per subsystem; NCCL_ALL (~0) sets every bit
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); // Variadic (fmt, ...) logging callback handed to the plugin's init(); flags is presumably an OR of ncclDebugLogSubSys bits — TODO confirm
+
+enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop }; // Net event codes 0..3; presumably the values for `type` in ncclProfilerCallback_t — TODO confirm
+
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); // NOTE(review): ncclResult_t is not declared in this header; net.h includes err.h before common.h, so include order matters
+
+#endif
--- a/ext-net/example/nccl/err.h
+++ b/ext-net/example/nccl/err.h
@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins: the return type of every plugin API function (meanings below follow the README "Error codes" section) */
+typedef enum { ncclSuccess                 =  0, /* returned upon success */
+               ncclUnhandledCudaError      =  1, /* an error came from CUDA; should be uncommon for network plugins */
+               ncclSystemError             =  2, /* a call to the kernel or a system library failed (network/hardware errors) */
+               ncclInternalError           =  3, /* NCCL core used the plugin incorrectly */
+               ncclInvalidArgument         =  4, /* usually not used by plugins; arguments are checked by NCCL core */
+               ncclInvalidUsage            =  5, /* most likely a user error (misconfiguration, size mismatch) */
+               ncclRemoteError             =  6 } ncclResult_t; /* NOTE(review): ncclRemoteError is not described in the README list — confirm meaning */
+
+#endif
--- a/ext-net/example/nccl/net.h
+++ b/ext-net/example/nccl/net.h
@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_H_
+#define NET_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "err.h"
+#include "net_device.h"
+#include "common.h"
+
+#define NCCL_NET_HANDLE_MAXSIZE 128 // Upper bound, in bytes, on the connection handle filled in by listen()
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) // 1 TiB (2^40 bytes); NOTE(review): the L suffix only avoids overflow where long is 64-bit
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 // Per README: NCCL sets irecv's request pointer to this value when using the LL/LL128 protocols
+
+#define NCCL_PTR_HOST 0x1 // ptrSupport / regMr type flag: host memory pointers
+#define NCCL_PTR_CUDA 0x2 // ptrSupport / regMr type flag: CUDA pointers
+#define NCCL_PTR_DMABUF 0x4 // ptrSupport flag: plugin provides regMrDmaBuf
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 32
+
+#include "net_v10.h" // Newest API version included here; older versions follow in descending order
+#include "net_v9.h"
+#include "net_v8.h"
+#include "net_v7.h"
+#include "net_v6.h"
+#include "net_v5.h"
+#include "net_v4.h"
+#include "net_v3.h"
+#include "net_v2.h"
+
+typedef ncclNet_v10_t ncclNet_t; // Unversioned names are aliases of the v10 definitions
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_device.h
+++ b/ext-net/example/nccl/net_device.h
@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_DEVICE_H_
+#define NET_DEVICE_H_
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0 // presumably means "no valid device-networking version" — TODO confirm usage
+#define NCCL_NET_MTU_SIZE                    4096 // presumably in bytes — TODO confirm
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  // presumably the netDeviceVersion reported by NCCL_NET_DEVICE_UNPACK plugins — TODO confirm
+
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; // Type of device networking a plugin supports (README: netDeviceType property)
+
+typedef struct { // Device-offload handle; connect()/accept() take it as sendDevComm/recvDevComm
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;                    // presumably an opaque plugin-owned object — TODO confirm
+  size_t size;                     // presumably the size in bytes of what handle points to — TODO confirm
+  int needsProxyProgress;          // NOTE(review): semantics not visible in this header; confirm against NCCL core
+} ncclNetDeviceHandle_v7_t;
+
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; // v8, v9 and v10 reuse the v7 layout unchanged
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; // Unversioned name resolves to v10
+
+#endif
--- a/ext-net/example/nccl/net_v10.h
+++ b/ext-net/example/nccl/net_v10.h
@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V10_H_
+#define NET_V10_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 // Capacity of the devs[] array below
+typedef struct { // Properties passed to makeVDevice() and stored as vProps in ncclNetProperties_v10_t
+  int ndevs; // presumably the number of valid entries in devs[] — TODO confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; // Per README (vProps): each entry is an index pointing to a child device
+} ncclNetVDeviceProps_v10_t;
+
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 // README states trafficClass is -1 by default
+typedef struct { // Per-connection configuration passed to connect()
+  // Plugin-specific TC value
+  int trafficClass; // Per README: lets the plugin select the QoS level of the connection
+} ncclNetCommConfig_v10_t;
+
+
+typedef struct { // Per-device properties filled in by getProperties()
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency in microseconds (per README).
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v10_t vProps; // Devices fused into this device (per README)
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v10_t;
+
+typedef struct { // v10 network plugin API: a table of function pointers implemented by the plugin
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: per README, used instead of regMr when ptrSupport includes NCCL_PTR_DMABUF; may be NULL otherwise */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle); // Per README: deregisters handles returned by both regMr and regMrDmaBuf
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); // Per README: may be set to NULL if the plugin does not support NIC fusion
+} ncclNet_v10_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v2.h
+++ b/ext-net/example/nccl/net_v2.h
@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V2_H_
+#define NET_V2_H_
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Return the device path in /sys. NCCL will call free on this path.
+  ncclResult_t (*pciPath)(int dev, char** path);
+  // Return whether this device supports host pointers and/or CUDA pointers
+  // as data from the current GPU. Supported types should be composed with
+  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete (reported through done). If size is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v2_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v3.h
+++ b/ext-net/example/nccl/net_v3.h
@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V3_H_
+#define NET_V3_H_
+
+#define NCCL_NET_MAX_REQUESTS_V3 16
+
+typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; // v3 reuses the v4 properties layout; this header has no #include, so net_v4.h must be seen first
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete (reported through done). If size is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v3_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v4.h
+++ b/ext-net/example/nccl/net_v4.h
@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V4_H_
+#define NET_V4_H_
+
+#define NCCL_NET_HANDLE_MAXSIZE_V4 64
+
+typedef struct {
+  char* name;     // Device name, used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport; // Bitmask: NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  int maxComms;   // Maximum number of comms we can create.
+} ncclNetProperties_v4_t;
+
+// v4 struct for backwards compatibility
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. Unlike the v2/v3 flush, this entry also takes a request out-parameter.
+  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete (reported through done). If size is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v4_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v5.h
+++ b/ext-net/example/nccl/net_v5.h
@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V5_H_
+#define NET_V5_H_
+
+typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; // v5 reuses the v6 properties layout; this header has no #include, so net_v6.h must be seen first
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers described by the data/sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete (reported through done). If sizes is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v6.h
+++ b/ext-net/example/nccl/net_v6.h
@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V6_H_
+#define NET_V6_H_
+
+typedef struct {
+  char* name;     // Device name, used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport; // Bitmask of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  float latency;  // Network latency (units not stated in this header)
+  int maxComms;   // Maximum number of comms we can create.
+  int maxRecvs;   // Maximum number of grouped receives (presumably bounds n in irecv — TODO confirm).
+}ncclNetProperties_v6_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and an fd */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers described by the data/sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete (reported through done). If sizes is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v6_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v7.h
+++ b/ext-net/example/nccl/net_v7.h
@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V7_H_
+#define NET_V7_H_
+
+typedef struct {
+  char* name;                      // Device name, used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport;                  // Bitmask of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency (units not stated in this header)
+  int maxComms;                    // Maximum number of comms we can create.
+  int maxRecvs;                    // Maximum number of grouped receives (presumably bounds n in irecv — TODO confirm).
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and an fd */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers described by the data/sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete (reported through done). If sizes is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v8.h
+++ b/ext-net/example/nccl/net_v8.h
@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V8_H_
+#define NET_V8_H_
+
+typedef struct {
+  char* name;                      // Device name, used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport;                  // Bitmask of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // Non-zero if regMr is not tied to a particular comm
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency (units not stated in this header)
+  int maxComms;                    // Maximum number of comms we can create.
+  int maxRecvs;                    // Maximum number of grouped receives (presumably bounds n in irecv — TODO confirm).
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v8_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. (size is size_t in this version.)
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and an fd */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers described by the data/sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete (reported through done). If sizes is not
+  // NULL, the number of bytes sent/received is returned through it.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/net_v9.h
+++ b/ext-net/example/nccl/net_v9.h
@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V9_H_
+#define NET_V9_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
+typedef struct { // Properties handed to makeVDevice to describe a virtual NIC
+  int ndevs; // presumably the number of valid entries in devs — TODO confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; // presumably the device indices backing the virtual NIC — TODO confirm
+} ncclNetVDeviceProps_v9_t;
+
+typedef struct {
+  char* name;                      // Device name, used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport;                  // Bitmask of [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // Non-zero if regMr is not tied to a particular comm
+  int forceFlush;                  // Non-zero to force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency (units not stated in this header)
+  int maxComms;                    // Maximum number of comms we can create.
+  int maxRecvs;                    // Maximum number of grouped receives (presumably bounds n in irecv — TODO confirm).
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v9_t vProps; // Virtual-device properties (same type that makeVDevice takes)
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v9_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and an fd */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer. (size is size_t in this version.)
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers described by the data/sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. NOTE(review): sizes is still int* here although irecv takes size_t* in v9.
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete (reported through done). If sizes is not
+  // NULL, the number of bytes sent/received is returned through it (as int, not size_t).
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv/listen comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at (returned through d)
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
+} ncclNet_v9_t;
+
+#endif // end include guard
--- a/ext-net/example/nccl/types.h
+++ b/ext-net/example/nccl/types.h
@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
+
+/* Data types. Names that share a value on one line (e.g. ncclInt8/ncclChar) are aliases of each other. */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+} ncclDataType_t;
+
+#endif
--- a/ext-net/example/plugin.c
+++ b/ext-net/example/plugin.c
@ -0,0 +1,418 @@
+/*************************************************************************
+ * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "net.h"
+
+// Mark plugin-internal functions with hidden ELF visibility.
+#define __hidden __attribute__((visibility("hidden")))
+
+// Value reported through props->maxRecvs; also the capacity of the size
+// conversion buffer in pluginIrecv_v8.
+#define NCCL_PLUGIN_MAX_RECVS 1
+
+// Starts at NCCL_NET_MAX_REQUESTS; pluginInit_v3 overwrites it with
+// NCCL_NET_MAX_REQUESTS_V3.
+int max_requests = NCCL_NET_MAX_REQUESTS;
+
+// Plugin initialization. The example keeps no state, so both callbacks are ignored.
+__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
+  return ncclSuccess;
+}
+
+// Report how many network devices the plugin exposes; the example exposes none.
+__hidden ncclResult_t pluginDevices(int* ndev) {
+  *ndev = 0;
+  return ncclSuccess;
+}
+
+// Not implemented in the example: always fails.
+__hidden ncclResult_t pluginPciPath(int dev, char** path) {
+  return ncclInternalError;
+}
+
+// Not implemented in the example: always fails.
+__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
+  return ncclInternalError;
+}
+// Fill `props` with the properties of device `dev`. Every field receives a
+// default value; always returns ncclSuccess.
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
+  // The values below are defaults; if unsure, don't change them.
+
+  // ---- Identity and topology ----
+  props->name = "Example";
+  // Fill in for proper topology detection,
+  // e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
+  props->pciPath = NULL;
+  // guid is only used to detect NICs with multiple PCI attachments; the port
+  // number is used in conjunction with it.
+  props->guid = 0;
+  props->port = 0;
+
+  // ---- Memory registration ----
+  // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
+  props->ptrSupport = NCCL_PTR_HOST;
+  // Set to 1 if your regMr has a fast registration cache. If set to 0, user
+  // buffer registration may be disabled.
+  props->regIsGlobal = 0;
+  // Force a flush after receive. Needed if the control path and the data path
+  // use a different path to the GPU.
+  props->forceFlush = 0;
+
+  // ---- Performance hints ----
+  // Speed in *Mbps*. 100000 means 100G.
+  props->speed = 100000;
+  // Custom latency (used to help tuning if latency is high). If set to 0, the
+  // default NCCL values are used.
+  props->latency = 0;
+
+  // ---- Limits ----
+  // Maximum number of comm objects we can create.
+  props->maxComms = 1024*1024;
+  // Maximum number of receive operations taken by irecv().
+  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
+  // Maximum transfer sizes the plugin can handle.
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+
+  // ---- Coupling with NCCL network device-side code ----
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+
+  // ---- Virtual device description ----
+  // Tells NCCL core whether this is a virtual device fusing multiple physical
+  // devices; here a single device is listed: `dev` itself.
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+
+  return ncclSuccess;
+}
+
+// ---------------------------------------------------------------------------
+// Entry points left unimplemented by the example. Each one ignores its
+// arguments and returns ncclInternalError.
+// ---------------------------------------------------------------------------
+
+__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginTest(void* request, int* done, int* size) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginCloseSend(void* sendComm) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginCloseRecv(void* recvComm) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginCloseListen(void* listenComm) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) {
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
+  return ncclInternalError;
+}
+
+// Name stored in the .name slot of every versioned table in this file.
+#define PLUGIN_NAME "Plugin"
+
+// v10 table: every slot points at the unversioned plugin* implementation.
+const ncclNet_v10_t ncclNetPlugin_v10 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties,
+  .listen        = pluginListen,
+  .connect       = pluginConnect,
+  .accept        = pluginAccept,
+  .regMr         = pluginRegMr,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend,
+  .irecv         = pluginIrecv,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice,
+};
+
+// v9 init takes no profiler callback: forward to pluginInit with a NULL one.
+__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
+  ncclResult_t status = pluginInit(logFunction, NULL);
+  return status;
+}
+
+// v9 getProperties: hand the v9 struct to pluginGetProperties through a pointer cast.
+// NOTE(review): this relies on ncclNetProperties_v9_t being layout-compatible
+// with ncclNetProperties_t -- confirm against the headers.
+__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
+  ncclNetProperties_t* current = (ncclNetProperties_t*)props;
+  return pluginGetProperties(dev, current);
+}
+
+// v9 connect has no config argument: forward to pluginConnect with config == NULL.
+__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  ncclNetCommConfig_t* noConfig = NULL;
+  return pluginConnect(dev, noConfig, handle, sendComm, sendDevComm);
+}
+
+// v9 isend has no phandle argument: forward to pluginIsend with phandle == NULL.
+__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+  void* noPhandle = NULL;
+  return pluginIsend(sendComm, data, size, tag, mhandle, noPhandle, request);
+}
+
+// v9 irecv has no phandles argument: forward to pluginIrecv with phandles == NULL.
+__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+  void** noPhandles = NULL;
+  return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, noPhandles, request);
+}
+
+// v9 makeVDevice is not implemented in the example: always fails.
+__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) {
+  return ncclInternalError;
+}
+
+// v9 table: init, getProperties, connect, isend, irecv and makeVDevice go
+// through the *_v9 adapters; all other slots use the unversioned functions.
+const ncclNet_v9_t ncclNetPlugin_v9 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v9,
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v9,
+  .accept        = pluginAccept,
+  .regMr         = pluginRegMr,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v9,
+  .irecv         = pluginIrecv_v9,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice_v9,
+};
+
+// v8 getProperties: query the current properties, then copy the fields that
+// the v8 struct carries. On failure props_v8 is left untouched and the error
+// from pluginGetProperties is returned.
+__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) {
+  ncclNetProperties_t latest;
+  ncclResult_t status = pluginGetProperties(dev, &latest);
+  if (status == ncclSuccess) {
+    props_v8->name             = latest.name;
+    props_v8->pciPath          = latest.pciPath;
+    props_v8->guid             = latest.guid;
+    props_v8->port             = latest.port;
+    props_v8->ptrSupport       = latest.ptrSupport;
+    props_v8->regIsGlobal      = latest.regIsGlobal;
+    props_v8->speed            = latest.speed;
+    props_v8->latency          = latest.latency;
+    props_v8->maxComms         = latest.maxComms;
+    props_v8->maxRecvs         = latest.maxRecvs;
+    props_v8->netDeviceType    = latest.netDeviceType;
+    props_v8->netDeviceVersion = latest.netDeviceVersion;
+  }
+  return status;
+}
+
+// v8 isend: `size` is an int and there is no phandle. Convert the size to the
+// size_t expected by pluginIsend and pass phandle == NULL.
+__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+  size_t nbytes = size;
+  return pluginIsend(sendComm, data, nbytes, tag, mhandle, NULL, request);
+}
+
+// v8 irecv: sizes are ints and there are no phandles. Widen the sizes into a
+// size_t array and forward to pluginIrecv with phandles == NULL.
+//
+// The scratch array holds NCCL_PLUGIN_MAX_RECVS entries (the maxRecvs value
+// advertised by pluginGetProperties), so any `n` outside [0, NCCL_PLUGIN_MAX_RECVS]
+// is rejected with ncclInternalError instead of overflowing the stack buffer.
+// The actual `n` is forwarded rather than a hard-coded 1.
+__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+  if (n < 0 || n > NCCL_PLUGIN_MAX_RECVS) return ncclInternalError;
+  size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
+  for (int i=0; i<n; i++) sizesOut[i] = sizes[i];
+  return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, NULL, request);
+}
+
+// v8 table: getProperties, isend and irecv use the *_v8 adapters; init and
+// connect reuse the *_v9 adapters; there is no makeVDevice slot.
+const ncclNet_v8_t ncclNetPlugin_v8 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v8,
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v9,
+  .accept        = pluginAccept,
+  .regMr         = pluginRegMr,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+// v7 getProperties: query the current properties, then copy the fields that
+// the v7 struct carries (regIsGlobal is not copied). On failure props_v7 is
+// left untouched and the error from pluginGetProperties is returned.
+__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
+  ncclNetProperties_t latest;
+  ncclResult_t status = pluginGetProperties(dev, &latest);
+  if (status == ncclSuccess) {
+    props_v7->name             = latest.name;
+    props_v7->pciPath          = latest.pciPath;
+    props_v7->guid             = latest.guid;
+    props_v7->port             = latest.port;
+    props_v7->ptrSupport       = latest.ptrSupport;
+    props_v7->speed            = latest.speed;
+    props_v7->latency          = latest.latency;
+    props_v7->maxComms         = latest.maxComms;
+    props_v7->maxRecvs         = latest.maxRecvs;
+    props_v7->netDeviceType    = latest.netDeviceType;
+    props_v7->netDeviceVersion = latest.netDeviceVersion;
+  }
+  return status;
+}
+
+// v7 regMr takes the size as an int: convert it to size_t and forward to pluginRegMr.
+__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
+  size_t nbytes = size;
+  return pluginRegMr(collComm, data, nbytes, type, mhandle);
+}
+
+// v7 table: same as the v8 table except for getProperties (v7 adapter) and
+// regMr (int-sized v7 adapter).
+const ncclNet_v7_t ncclNetPlugin_v7 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v7,
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v9,
+  .accept        = pluginAccept,
+  .regMr         = pluginRegMr_v7,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+// v6 getProperties: query the current properties, then copy the fields that
+// the v6 struct carries (no netDevice* fields are copied). On failure props_v6
+// is left untouched and the error from pluginGetProperties is returned.
+__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
+  ncclNetProperties_t latest;
+  ncclResult_t status = pluginGetProperties(dev, &latest);
+  if (status == ncclSuccess) {
+    props_v6->name       = latest.name;
+    props_v6->pciPath    = latest.pciPath;
+    props_v6->guid       = latest.guid;
+    props_v6->port       = latest.port;
+    props_v6->ptrSupport = latest.ptrSupport;
+    props_v6->speed      = latest.speed;
+    props_v6->latency    = latest.latency;
+    props_v6->maxComms   = latest.maxComms;
+    props_v6->maxRecvs   = latest.maxRecvs;
+  }
+  return status;
+}
+
+// v6 connect (no device-handle argument): not implemented, always fails.
+__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) {
+  return ncclInternalError;
+}
+
+// v6 accept (no device-handle argument): not implemented, always fails.
+__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) {
+  return ncclInternalError;
+}
+
+// v6 table: uses the v6 getProperties/connect/accept functions; there are no
+// getDeviceMr or irecvConsumed slots.
+const ncclNet_v6_t ncclNetPlugin_v6 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v6,
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v6,
+  .accept        = pluginAccept_v6,
+  .regMr         = pluginRegMr_v7,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v5 Compat */
+// v5 table: same entries as the v6 table, minus the regMrDmaBuf slot.
+const ncclNet_v5_t ncclNetPlugin_v5 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v6,
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v6,
+  .accept        = pluginAccept_v6,
+  .regMr         = pluginRegMr_v7,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v4 Compat */
+// v4 getProperties: query the current properties, then copy the fields that
+// the v4 struct carries. On failure props_v4 is left untouched and the error
+// from pluginGetProperties is returned.
+static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
+  ncclNetProperties_t latest;
+  ncclResult_t status = pluginGetProperties(dev, &latest);
+  if (status == ncclSuccess) {
+    props_v4->name       = latest.name;
+    props_v4->pciPath    = latest.pciPath;
+    props_v4->guid       = latest.guid;
+    props_v4->port       = latest.port;
+    props_v4->ptrSupport = latest.ptrSupport;
+    props_v4->speed      = latest.speed;
+    props_v4->maxComms   = latest.maxComms;
+  }
+  return status;
+}
+// v4 isend has no tag argument: forward to the v8 adapter with tag 0.
+static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
+  const int noTag = 0;
+  return pluginIsend_v8(sendComm, data, size, noTag, mhandle, request);
+}
+// v4 irecv posts a single untagged receive: wrap the scalar arguments as
+// one-element arrays (tag 0) and forward to the v8 adapter with n == 1.
+static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
+  int zeroTag = 0;
+  ncclResult_t status = pluginIrecv_v8(recvComm, 1, &data, &size, &zeroTag, &mhandle, request);
+  return status;
+}
+// v4 iflush covers a single buffer: wrap the scalar arguments as one-element
+// arrays and forward to pluginIflush with n == 1.
+static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
+  ncclResult_t status = pluginIflush(recvComm, 1, &data, &size, &mhandle, request);
+  return status;
+}
+// v4 connect: call pluginConnect repeatedly until it either fails or produces
+// a send comm (*sendComm != NULL). There is no config argument (NULL is
+// passed) and the device handle written by pluginConnect is discarded.
+//
+// The scratch device-handle pointer must not be named `handle`: doing so
+// shadows the `handle` parameter, and pluginConnect would then receive NULL
+// instead of the caller's connect handle.
+static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
+  ncclResult_t ret;
+  do {
+    ncclNetDeviceHandle_v7_t* devHandle = NULL;
+    ret = pluginConnect(dev, NULL, handle, sendComm, &devHandle);
+  } while (ret == ncclSuccess && *sendComm == NULL);
+  return ret;
+}
+// v4 accept: call pluginAccept repeatedly until it either fails or produces a
+// recv comm (*recvComm != NULL). The device handle written by pluginAccept is
+// discarded.
+static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
+  for (;;) {
+    ncclNetDeviceHandle_v7_t* devComm = NULL;
+    ncclResult_t status = pluginAccept(listenComm, recvComm, &devComm);
+    // Stop on the first error, or as soon as a comm has been produced.
+    if (status != ncclSuccess || *recvComm != NULL) return status;
+  }
+}
+// v4 table: getProperties, connect, accept, isend, irecv and iflush use the
+// *_v4 adapters defined above.
+const ncclNet_v4_t ncclNetPlugin_v4 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v4,
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v4,
+  .accept        = pluginAccept_v4,
+  .regMr         = pluginRegMr_v7,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v4,
+  .irecv         = pluginIrecv_v4,
+  .iflush        = pluginIflush_v4,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v3 Compat */
+// Blocking flush for the v2/v3 tables: start an asynchronous flush through
+// pluginIflush_v4, then poll pluginTest until the request reports completion
+// or a call returns an error. Returns the status of the last call made.
+static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) {
+  void* flushReq;
+  int completed = 0;
+  ncclResult_t status = pluginIflush_v4(recvComm, data, size, mhandle, &flushReq);
+  while (status == ncclSuccess && !completed) {
+    status = pluginTest(flushReq, &completed, NULL);
+  }
+  return status;
+}
+// v3 (and v2) init: switch max_requests to NCCL_NET_MAX_REQUESTS_V3, then run
+// the regular pluginInit without a profiler callback.
+static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
+  max_requests = NCCL_NET_MAX_REQUESTS_V3;
+  ncclResult_t status = pluginInit(logFunction, NULL);
+  return status;
+}
+#include <string.h>
+// v3 listen: the caller's handle buffer holds NCCL_NET_HANDLE_MAXSIZE_V4 bytes,
+// while pluginListen is given a NCCL_NET_HANDLE_MAXSIZE-byte buffer. Listen
+// into a local full-size buffer and copy its first NCCL_NET_HANDLE_MAXSIZE_V4
+// bytes back to the caller.
+//
+// The local buffer is zero-initialized and copied out only on success, so the
+// caller never receives uninitialized stack bytes when pluginListen fails or
+// fills only part of the handle.
+static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
+  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0};
+  ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
+  if (ret == ncclSuccess) memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
+  return ret;
+}
+// v3 connect: the caller's handle holds NCCL_NET_HANDLE_MAXSIZE_V4 bytes.
+// Copy it into a NCCL_NET_HANDLE_MAXSIZE-byte buffer and forward that to
+// pluginConnect_v4.
+//
+// The buffer is zero-initialized so the bytes past NCCL_NET_HANDLE_MAXSIZE_V4
+// have a defined value instead of being indeterminate stack contents.
+static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
+  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0};
+  memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4);
+  return pluginConnect_v4(dev, &pluginHandle, sendComm);
+}
+// v3 table: init, listen and connect use the *_v3 adapters, and the blocking
+// pluginFlush fills the .flush slot (this table has no .iflush).
+const ncclNet_v3_t ncclNetPlugin_v3 = {
+  .name          = PLUGIN_NAME,
+  .init          = pluginInit_v3,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v4,
+  .listen        = pluginListen_v3,
+  .connect       = pluginConnect_v3,
+  .accept        = pluginAccept_v4,
+  .regMr         = pluginRegMr_v7,
+  .deregMr       = pluginDeregMr,
+  .isend         = pluginIsend_v4,
+  .irecv         = pluginIrecv_v4,
+  .flush         = pluginFlush,
+  .test          = pluginTest,
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v2 Compat */
+// v2 table: exposes pciPath/ptrSupport instead of getProperties, and uses the
+// unversioned pluginListen together with the v4 connect/accept adapters.
+const ncclNet_v2_t ncclNetPlugin_v2 = {
+  .name        = PLUGIN_NAME,
+  .init        = pluginInit_v3,
+  .devices     = pluginDevices,
+  .pciPath     = pluginPciPath,
+  .ptrSupport  = pluginPtrSupport,
+  .listen      = pluginListen,
+  .connect     = pluginConnect_v4,
+  .accept      = pluginAccept_v4,
+  .regMr       = pluginRegMr_v7,
+  .deregMr     = pluginDeregMr,
+  .isend       = pluginIsend_v4,
+  .irecv       = pluginIrecv_v4,
+  .flush       = pluginFlush,
+  .test        = pluginTest,
+  .closeSend   = pluginCloseSend,
+  .closeRecv   = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+};
--- a/ext-net/google-fastsocket/Makefile
+++ b/ext-net/google-fastsocket/Makefile
@ -0,0 +1,22 @@
+# Builds the nccl-fastsocket network plugin as libnccl-net.so.
+# The plugin sources are cloned from GitHub into nccl-fastsocket/ when missing.
+#
+# User-tunable variables:
+#   CUDA_HOME  CUDA installation providing the include directory
+#   BUILDDIR   destination prefix used by the `install` target
+CUDA_HOME?=/usr/local/cuda
+INC:=-I$(CUDA_HOME)/include
+PLUGIN_SO:=libnccl-net.so
+
+# These goals are commands, not files: declaring them phony keeps them working
+# even if a file named default, install or clean exists in this directory.
+.PHONY: default install clean
+
+default: $(PLUGIN_SO)
+
+# Compile and link all plugin sources into the shared object in one step.
+$(PLUGIN_SO): nccl-fastsocket/*.cc
+	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+# Fetch the sources. Before the clone exists the glob matches nothing, so make
+# treats the pattern as a literal target name and runs this recipe.
+nccl-fastsocket/*.cc:
+	git clone https://github.com/google/nccl-fastsocket.git
+
+install: $(BUILDDIR)/lib/$(PLUGIN_SO)
+
+# Copy the built plugin into $(BUILDDIR)/lib.
+$(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
+	@printf "Grabbing %-35s > %s\n" $< $@
+	mkdir -p $(BUILDDIR)/lib
+	install -m 644 $< $@
+
+# Remove the built plugin and the cloned source tree.
+clean:
+	rm -f $(PLUGIN_SO)
+	rm -Rf nccl-fastsocket
--- a/ext-profiler/README.md
+++ b/ext-profiler/README.md
@ -0,0 +1,461 @@
+# NCCL Profiler Plugin Documentation
+
+This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
+
+# Overview
+
+To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
+interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
+use it for debugging and analysis.
+
+Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library
+called `libnccl-profiler.so`. That shared library contains one or more implementations of the
+NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
+functions.
+
+# Plugin architecture
+
+## Plugin name and supporting multiple profiler plugins
+
+When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load
+it, then look for symbols inside the library.
+
+The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
+will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
+advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so`
+to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
+path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
+the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library.
+
+## Struct versioning
+
+Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
+over time. The versioning ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
+NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
+
+Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
+for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
+would still work.
+
+## Headers management
+
+To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
+they support to their internal includes. An example is shown in `ext-profiler/example` where we
+keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top
+of newer ones.
+
+The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
+from old API versions. It also provides error codes in `err.h`.
+
+# API (v4)
+
+Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
+
+```
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+```
+
+## Error codes
+
+As a rule of thumb, profiler generated errors should not be propagated to NCCL and alter its normal
+functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
+them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
+The only exception is `init` that can return an error so that NCCL can disable the plugin.
+
+## Operation overview
+
+NCCL will call the `init` function first for every new communicator that is initialized. The profiler
+returns an opaque context handle that is used to isolate profiler instances across communicators.
+Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
+
+The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
+These are used to start, stop and update events in the profiler, respectively.
+
+## API Functions
+
+### Initialization
+
+#### name
+
+The `name` field should point to a character string with the name of the profiler plugin. This will
+be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
+
+#### init
+
+As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
+This allows the plugin to initialize its internal context, used during profiling of NCCL events.
+If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
+
+#### finalize
+
+When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
+up resources.
+
+### Profiling
+
+#### startEvent
+
+When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
+context, previously created by `init`, an event descriptor of type `ncclProfilerEventDescr_t` and
+returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
+later in the document.
+
+
+The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
+is the `ncclProfilerEventDescr_t` struct.
+
+```
+typedef struct {
+  uint8_t type;             // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
+  void* parentObj;          // pointer to parent event used to expose the event hierarchy to the profiler
+  int rank;                 // rank that generated the event
+  union {
+    struct {                // collective events metadata
+      uint64_t seqNumber;   // sequence number of this collective operation in the communicator
+      const char* func;     // string containing name of the collective
+      void const* sendBuff; // address of send buffer
+      void* recvBuff;       // address of recv buffer
+      size_t count;         // data count
+      int root;             // root rank
+      const char* datatype; // string containing the name of the datatype
+      uint8_t nChannels;    // number of channels for this collective
+      uint8_t nWarps;       // number of GPU warps for this collective
+      const char* algo;     // string containing name of the algorithm for this collective
+      const char* proto;    // string containing name of the protocol for this collective
+    } coll;
+
+    struct {                // point-to-point events metadata
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;             // peer rank for this point-to-point
+      uint8_t nChannels;    // number of channels for this p2p
+    } p2p;
+
+    struct {                // proxyOp events metadata
+      pid_t pid;            // process id that generated the associated `ncclProxyOp` object
+      uint8_t channelId;    // id of the channel used by the associated `ncclProxyOp` object
+      int peer;             // peer rank
+      int nSteps;           // number of network transfers/steps required by the `ncclProxyOp`
+      int chunkSize;        // chunk size for this `ncclProxyOp`
+      int isSend;           // type of network operation
+    } proxyOp;
+
+    struct {                // proxyStep events metadata
+      int step;             // individual step in `ncclProxyOp`
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;    // id of the channel used by the kernel
+      uint64_t ptimer;      // kernel supplied timestamp
+    } kernelCh;
+
+    struct {
+      int64_t id;           // net plugin id (used by net and profiler plugins to agree on event definitions)
+      void* data;           // pointer to network plugin defined event
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v4_t;
+```
+
+NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
+`ncclProfileNetPlugin`.
+
+#### stopEvent
+
+`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
+has been stopped the handle can no longer be used with other profiler calls. Using the event
+handle after `stopEvent` is undefined behavior.
+
+#### recordEventState
+
+Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
+`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
+
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
+`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
+
+The state of these events can be updated, along with event attributes, using `recordEventState`.
+These events can go through several states during their lifecycle.
+
+The list of supported states for the updatable events is reported below.
+
+```
+typedef enum {
+  // ncclProfileProxyOp event states
+  ncclProfilerProxyOpSendPosted        = 0, // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1, // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2, // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3, // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4, // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5, // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6, // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7, // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,// state marks transition of proxy op to progress
+
+  // ncclProfileProxyStep event states
+  ncclProfilerProxyStepSendGPUWait     = 8, // state marks the waiting of send data from GPU for given network transfer/step
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,// state marks the waiting of recv clear to send credits for given network transfer/step
+  ncclProfilerProxyStepSendWait        = 9, // state marks the waiting of send data from network for given network transfer/step
+  ncclProfilerProxyStepRecvWait        = 10,// state marks the waiting of recv data from network for given network transfer/step
+  ncclProfilerProxyStepRecvFlushWait   = 11,// state marks the waiting of recv data flush to GPU for given network transfer/step
+  ncclProfilerProxyStepRecvGPUWait     = 12,// state marks the waiting of recv data consumption from GPU for given network transfer/step
+
+  // ncclProfileProxyCtrl event states
+  ncclProfilerProxyCtrlIdle            = 13,// state marks proxy progress thread idle
+  ncclProfilerProxyCtrlActive          = 14,// state marks proxy progress thread active
+  ncclProfilerProxyCtrlSleep           = 15,// state marks proxy progress thread sleeping
+  ncclProfilerProxyCtrlWakeup          = 16,// state marks proxy progress thread waking up
+  ncclProfilerProxyCtrlAppend          = 17,// state marks append of new network work item begin
+  ncclProfilerProxyCtrlAppendEnd       = 18,// state marks append of new network work item end
+
+  // ncclProfileNetPlugin event states
+  ncclProfilerNetPluginUpdate          = 21,// state marks update of network defined event
+
+  // ncclProfileKernelCh event states
+  ncclProfilerKernelChStop             = 22,// state marks stop of kernelCh event and timestamp update
+} ncclProfilerEventState_v4_t;
+```
+
+`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyOp events are generated for every active channel and
+provide a summary of the activity of the proxy progress thread for that channel. Most of the
+states for this event were duplicated with `ncclProfileProxyStep` events. Therefore, starting
+with version 4 of the profiler interface these states have been deprecated. The same level of
+information can still be obtained through the `ncclProfileProxyStep` events.
+
+`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyStep events describe individual network transfers in
+the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
+
+`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
+network requests for the GPU kernel. This includes everything else that the proxy thread might be
+doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
+
+`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
+processes work items for the enqueued NCCL operations.
+
+`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
+their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
+the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
+network defined event definition using the plugin id in the event descriptor. The plugin identifier
+is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
+16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
+unused and available for future extensions.
+
+A network IB plugin can use this infrastructure to define a QP event as:
+
+```C
+#define NCCL_PROFILER_NET_IB_VER 1
+
+enum {
+  ncclProfileQp = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+```
+
+The network event infrastructure is network agnostic. A different network socket plugin can
+use it to define a socket event as:
+
+```C
+#define NCCL_PROFILER_NET_SOCKET_VER 1
+
+enum {
+  ncclProfileSocket = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+```
+
+The network plugin creates an event (descriptor) and passes it to the profiler callback,
+along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
+event descriptor, attaches the network plugin defined event as external data, and calls
+the profiler `startEvent` function.
+
+```C
+ncclResult_t isend(..., void* phandle, ...) {
+  ...
+  int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
+  ncclProfilerNetIbDescr_v1_t eDescr = { };
+  eDescr.type = ncclProfileQp;
+  eDescr.qp = { ... };
+  ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
+  ...
+}
+```
+
+State transitions for the events described can also come with event attribute updates. For this
+reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.
+
+```
+typedef union {
+  struct {                // attributes to update for ncclProfileProxyStep events
+    size_t transSize;     // transfer size field for this proxy step
+  } proxyStep;
+
+  struct {                // attributes to update for ncclProfileProxyCtrl events
+    int appendedProxyOps; // number of appended proxy ops thus far
+  } proxyCtrl;
+
+  struct {                // attributes to update for ncclProfileNetPlugin events
+    void* data;           // network plugin opaque update data field
+  } netPlugin;
+
+  struct {                // attribute to update for ncclProfileKernelCh events
+    uint64_t pTimer;      // timestamp provided by the NCCL kernel
+  } kernelCh;
+} ncclProfilerEventStateArgs_v4_t;
+```
+
+The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
+
+### Event hierarchy
+
+NCCL core events (reported above) are organized into a hierarchy as reported below:
+
+```
+Group event
+   |
+   +- Collective event
+   |  |
+   |  +- ProxyOp event
+   |  |  |
+   |  |  +- ProxyStep event
+   |  |     |
+   |  |     +- NetPlugin event
+   |  |
+   |  +- KernelCh event
+   |
+   +- Point-to-point event
+      |
+      +- ProxyOp event
+      |  |
+      |  +- ProxyStep event
+      |     |
+      |     +- NetPlugin event
+      |
+      +- KernelCh event
+
+ProxyCtrl event
+```
+
+# Profiler instrumentation and logging
+
+## Profiling of collective and p2p operations
+
+The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
+collective and point-to-point operations, as well as proxy, kernel and network activity. Due to the asynchronous nature
+of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit
+precisely. For example, without proxy and/or kernel activity it is impossible for the profiler to
+figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
+the profiler that the collective has been enqueued. The profiler can leverage proxy and/or kernel event information, if
+these are enabled, to estimate when the collective ends. For example, the profiler can look at the `stopEvent`
+call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This
+can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent`
+increment and decrement the reference counter, respectively.
+
+## PXN
+
+PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
+generated the operation. When this happens, the event hierarchy reported above breaks. Because the
+profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
+parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy
+thread originating the operation. To prevent the profiler instance in the remote proxy address space from
+dereferencing a pointer from another address space, the event descriptor includes the PID of the originator.
+The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
+parent event.
+
+# Known Limitations
+
+In intra-node communication, or whenever a rank does not have any network activity, proxy events
+are unavailable and the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
+enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
+collective. However, this time only represents the launch time of the collective and not the actual
+execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
+
+With version 3 of the profiler interface network activity is no longer required to do intra-node profiling.
+Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
+thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
+the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of
+accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
+delayed, a similar loss of accuracy can be encountered.
+
+To mitigate this effect, with version 4 of the profiler NCCL uses a per-channel ring buffer of 64 elements.
+Every counter is complemented by two timestamps (ptimers) supplied by the NCCL kernel (one for start and one
+for stop of the operation in the kernel). NCCL propagates these timestamps to the profiler plugin so that it can
+convert them to the CPU time domain.
--- a/ext-profiler/example/Makefile
+++ b/ext-profiler/example/Makefile
@ -0,0 +1,22 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# Build the example profiler plugin when `make` is invoked without a goal.
+# .DEFAULT_GOAL is a special variable and must be assigned; written with a
+# colon it would only declare a rule for a target named `.DEFAULT_GOAL`.
+.DEFAULT_GOAL := build
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)
+
+SRC_FILES := $(wildcard *.c)
+
+# build and clean name commands, not files: keep them working even if a file
+# with the same name appears in this directory.
+.PHONY: build clean
+
+build: ${BUILDDIR}/libnccl-profiler-example.so
+
+# Compile and link every C source of this directory into a single shared
+# object; the plugin headers are looked up in the local nccl/ directory.
+${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES}
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl -fPIC -shared -o $@ $^
+
+clean:
+	rm -f ${BUILDDIR}/libnccl-profiler-example.so
--- a/ext-profiler/example/README.md
+++ b/ext-profiler/example/README.md
@ -0,0 +1,239 @@
+# NCCL Example Profiler Plugin Usage
+
+This page describes how to use the NCCL example profiler plugin.
+
+# Overview
+
+The example profiler plugin implements the NCCL profiler plugin API introduced in NCCL v2.23. The API
+defines a set of events and data structures that NCCL uses to share event information with profiler
+plugins. The user can control what events are instrumented by NCCL and when traces collected by the
+profiler should be dumped through environment variables, as described in the rest of the document.
+The user can also control other profiler parameters that alter its behavior. For example, users can
+change the size of the event window the profiler keeps track of.
+
+## Building the profiler plugin
+
+To build the example plugin, just type `make`. You will need an NCCL build's include directory present.
+You can override `NCCL_HOME` to where the NCCL installation is on your system.
+
+## Using the profiler plugin
+
+1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set the `NCCL_PROFILER_PLUGIN`,
+   as documented in `ext-profiler/README.md`.
+
+2. Set `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
+   default, all collectives and send/recv operations will be traced. For more details about the event
+   representation used by the profiler refer to `ext-profiler/README.md`.
+
+   As an example, setting:
+
+   `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
+
+   enables the profiling of the group, the collective and the proxy op events. The same events can be
+   expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
+   in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
+   is that the profiler can easily correlate events that belong to the same NCCL operation and present
+   them accordingly.
+
+3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
+   ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
+   event format (more precisely, using asynchronous events).
+
+4. If you set the dump file variable, type chrome://tracing on your chromium browser search bar and
+   open the created dump file to visualize the traces.
+
+# Changing the profiler memory pool sizes
+
+The example profiler uses separate memory pools for different types of events. The size of these memory
+pools (i.e., the # events) determines the number of events that the profiler can keep track of at the
+same time. When NCCL requests a new event (e.g., collective event) to profile a `ncclAllReduce`
+operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
+finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used the
+profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
+handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
+event handle. The profiler has a total of 5 memory pools.
+
+The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
+contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
+generated by remote proxies. A list of pools and their size is reported below:
+
+- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
+- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
+- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
+- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
+- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
+
+Remote proxy operations are generated when PXN is in use. Refer to this article for more information
+about PXN and how it works:
+https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
+
+# Reported events
+
+The example profiler generates traces using the json format. An example of trace is reported below:
+
+```
+[
+{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
+{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
+{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
+{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
+ ... [ trace truncated for brevity ]
+{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
+{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
+{}]
+```
+
+Details about the fields used in the trace can be found at this link:
+https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
+
+The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
+the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
+(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
+one collective and this is what is presented in the traces above).
+
+
+The `AllReduce` event encloses traces for the proxy operation associated to the `ncclAllReduce` operation. The `args`
+field in the traces contains NCCL specific information (aside from the chrome trace event format).
+
+## AllReduce trace
+
+The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
+
+- seqNum      : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
+- commHash    : communicator unique identifier
+- rank        : NCCL rank for the ncclAllReduce
+- datatype    : NCCL datatype
+- algorithm   : algorithm used to process the ncclAllReduce
+- protocol    : protocol used to process the ncclAllReduce
+- nMaxChannels: max number of channels used to process the ncclAllReduce
+
+If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
+consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
+of collective and p2p operations`.
+
+### Proxy Send
+The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
+info in the args field:
+
+- Channel      : id of the channel used by this proxy operation to send data to the peer
+- Peer         : peer rank
+- Steps        : number of network steps required to transfer transSize bytes to the peer
+- ChunkSize    : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize    : bytes transferred across the channel by this proxy operation
+- POSTED       : struct containing the number of buffer posts to the GPU and the time stamp for the last post
+- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
+- TRANSMITTED  : struct containing the number of network sends and the time stamp of the last send
+- DONE         : struct containing the number of network sends completed and the time stamp of the last send completed
+
+In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
+which could help identify at which point the network problem occurred.
+
+The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
+entries below are also reported by the profiler.
+
+#### Proxy SendBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
+
+#### Proxy SendGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
+buffer.
+
+#### Proxy SendWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
+
+### Proxy Recv
+
+The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
+info in the args field:
+
+- Channel    : id of the channel used by this proxy operation to recv data from the peer
+- Peer       : peer rank
+- Steps      : number of network steps required to transfer transSize bytes from the peer
+- ChunkSize  : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize  : bytes transferred across the channel by this proxy operation
+- POSTED     : struct containing the number of recvs posted and the time stamp for the last recv posted
+- RECEIVED   : struct containing the number of recvs completed and the time stamp for the last recv completed
+- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
+- DONE       : struct containing the number of flush completed and the time stamp for the last flush completed
+
+The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
+entries below are also reported by the profiler.
+
+
+#### Proxy RecvBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
+become available.
+
+#### Proxy RecvWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
+
+#### Proxy RecvFlushWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU
+
+#### Proxy RecvGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
--- a/ext-profiler/example/event.c
+++ b/ext-profiler/example/event.c
@ -0,0 +1,30 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include "event.h"
+
+/* Tell whether the task event queue of group g holds no event (1 if empty, 0 otherwise). */
+int taskEventQueueEmpty(struct group* g) {
+  return !g->eventHead;
+}
+
+/* Append event at the tail of the singly linked task event queue of group g. */
+void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
+  event->next = NULL;
+  if (g->eventHead == NULL) {
+    // Empty queue: the new event is also the head.
+    g->eventHead = event;
+  } else {
+    // Non-empty queue: chain the new event after the current tail.
+    g->eventTail->next = event;
+  }
+  g->eventTail = event;
+}
+
+/* Peek at the first event of the task event queue of group g without removing it. */
+struct taskEventBase* taskEventQueueHead(struct group* g) {
+  struct taskEventBase* first = g->eventHead;
+  return first;
+}
+
+/*
+ * Remove and return the event at the head of the task event queue of group g.
+ * Returns NULL, leaving the queue untouched, when the queue is empty.
+ */
+struct taskEventBase* taskEventQueueDequeue(struct group* g) {
+  struct taskEventBase* tmp = g->eventHead;
+  // Nothing to dequeue: bail out instead of dereferencing a NULL head.
+  if (tmp == NULL) return NULL;
+  g->eventHead = tmp->next;
+  // The queue drained: reset the tail so it does not keep pointing at the removed event.
+  if (g->eventHead == NULL) g->eventTail = NULL;
+  return tmp;
+}
--- a/ext-profiler/example/event.h
+++ b/ext-profiler/example/event.h
@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef EVENT_H_
+#define EVENT_H_
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <unistd.h>
+#include "profiler.h"
+
+// Fixed capacities of the statically sized arrays embedded in the event structs below.
+#define MAX_CHANNELS                     32 // per-channel arrays in struct collective and struct p2p
+#define MAX_STEPS                        16 // proxyStep slots in struct proxyOp
+#define MAX_OPS                          16 // Up to 64K ranks for PAT
+#define MAX_EVENTS_PER_REQ               (8) // netPlugin slots in struct proxyStep
+
+struct proxyOp;
+struct proxyStep;
+
+// Network plugin event. Instances are embedded in their parent proxy step (proxyStep::net[]).
+struct netPlugin {
+  uint8_t type;                     // event type
+  int pluginType;                   // presumably the plugin type carried in the plugin id (IB, socket, ...) -- TODO confirm
+  int pluginVer;                    // presumably the plugin event version carried in the plugin id -- TODO confirm
+  uint8_t pluginEvent;              // presumably the plugin defined event type (QP, socket, ...) -- TODO confirm
+  union {                           // plugin specific payload; the members share the same storage
+    struct {                        // fields of an IB QP event
+      int device;                   // network device id
+      int qpNum;                    // QP number
+      int opcode;                   // ibv opcode
+      uint64_t wr_id;               // work request id
+      size_t length;                // work request data length
+    } qp;
+    struct {                        // fields of a socket event
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+  struct proxyStep* parent;         // parent proxy step event
+};
+
+// Kernel channel event. Instances are embedded per channel in struct collective and struct p2p.
+struct kernelCh {
+  uint8_t type;                     // event type
+  uint8_t channelId;                // channel id for this kernel event
+  struct taskEventBase* parent;     // parent event p2p/collective
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+  uint64_t startGpuClk;             // presumably the start timestamp supplied by the NCCL kernel -- TODO confirm
+  uint64_t stopGpuClk;              // presumably the stop timestamp supplied by the NCCL kernel -- TODO confirm
+};
+
+// Proxy step states. Send and recv steps reuse the same values (0..2), and
+// PROXY_STEP_MAX_STATES sizes proxyStep::timestamp[].
+// NOTE(review): presumably these values index proxyStep::timestamp[] -- confirm against the .c files.
+#define PROXY_STEP_SEND_GPU_WAIT 0
+#define PROXY_STEP_SEND_PEER_WAIT 1
+#define PROXY_STEP_SEND_WAIT 2
+#define PROXY_STEP_RECV_WAIT 0
+#define PROXY_STEP_RECV_FLUSH_WAIT 1
+#define PROXY_STEP_RECV_GPU_WAIT 2
+#define PROXY_STEP_MAX_STATES 3
+
+// Proxy step event: one network transfer of a proxy operation. Instances are embedded in
+// their parent proxy operation (proxyOp::step[]).
+struct proxyStep {
+  uint8_t type;                     // type of event: network transfer
+  int state;                        // current state; presumably one of the PROXY_STEP_* values -- TODO confirm
+  int step;                         // network transfer id in given channel
+  int isSend;                       // send/recv channel operation
+  double timestamp[PROXY_STEP_MAX_STATES]; // one timestamp slot per proxy step state
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+  struct proxyOp* parent;           // parent proxy operation event
+  struct netPlugin net[MAX_EVENTS_PER_REQ]; // network plugin events attached to this step
+  int nNetEvents;                   // presumably the number of net[] entries in use -- TODO confirm
+};
+
+// Proxy operation event: summary of the proxy progress thread activity for one channel,
+// together with the proxy step events it owns.
+struct proxyOp {
+  uint8_t type;                     // type of event: proxy operation
+  uint8_t channelId;                // channel id for this proxy operation
+  pid_t pid;                        // presumably the PID of the process originating the operation (see PXN) -- TODO confirm
+  int rank;                         // rank for this proxy operation
+  int peer;                         // peer rank for this proxy operation
+  int nSteps;                       // total number of network transfers for this proxy operation
+  int chunkSize;                    // chunk size for this proxy operation
+  int isSend;                       // send/recv channel operation
+  size_t transSize;                 // transfer data size for this proxy operation
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double progrTs;                   // In progress state transition
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+  int stepCount;                    // last processed network operation for this proxy operation
+  struct proxyStep step[MAX_STEPS]; // array of network transfer events
+  struct taskEventBase* parent;     // parent event p2p/collective
+};
+
+struct group;
+struct context;
+
+// Proxy control event. Unlike the other proxy events it has no parent event and points
+// directly at the profiler context.
+struct proxyCtrl {
+  uint8_t type;                     // event type
+  struct context* ctx;              // profiler context
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+  int state;                        // current state of the event (values not visible in this header)
+  int appended;                     // appended proxy operations
+};
+
+// task level event base structure
+// It is the first member of struct collective and struct p2p, and the node type of the
+// per-group task event queue (linked through the next field).
+struct taskEventBase {
+  uint8_t type;                     // event type: collective/p2p
+  int rank;                         // rank of the operation in NCCL communicator
+  const char* func;                 // ncclFunc*
+  int refCount;                     // number of references for this operation
+  struct group* parent;             // parent event group
+  struct taskEventBase* next;       // next top level event in group
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+};
+
+// Collective event, with the proxy operation and kernel channel events it owns stored inline.
+struct collective {
+  struct taskEventBase base;        // base structure for this event
+  uint64_t seqNumber;               // sequence number for this collective in communicator
+  void const* sendBuff;             // send buffer of the collective
+  void* recvBuff;                   // receive buffer of the collective
+  size_t count;                     // count argument of the collective
+  int root;                         // root argument of the collective
+  const char* datatype;             // datatype name
+  uint8_t nChannels;                // number of channels for this collective
+  const char* algo;                 // algorithm name
+  const char* proto;                // protocol name
+  int nWarps;                       // number of warps for this collective
+  struct proxyOp op[MAX_CHANNELS][2*MAX_OPS]; // proxy operation events, per channel
+  int nProxyOps[MAX_CHANNELS];      // presumably the number of op[][] entries in use per channel -- TODO confirm
+  struct kernelCh kernel[MAX_CHANNELS]; // kernel channel events, one per channel
+};
+
+// Point-to-point event, with the proxy operation and kernel channel events it owns stored inline.
+struct p2p {
+  struct taskEventBase base;        // base structure for this event
+  uint8_t func;                     // p2p function identifier (encoding not visible in this header)
+  void const* buff;                 // buffer of the p2p operation
+  size_t count;                     // count argument of the p2p operation
+  const char* datatype;             // datatype name
+  int peer;                         // peer rank
+  uint8_t nChannels;                // number of channels for this p2p operation
+  struct proxyOp op[MAX_CHANNELS];  // proxy operation events, one per channel
+  struct kernelCh kernel[MAX_CHANNELS]; // kernel channel events, one per channel
+};
+
+// Group event: top of the event hierarchy. It owns a FIFO queue of task events
+// (collective/p2p), manipulated through the taskEventQueue* functions declared below.
+struct group {
+  uint8_t type;                     // event type
+  struct context* ctx;              // profiler context
+  int groupId;                      // group identifier
+  int refCount;                     // number of references for this group
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;                   // start timestamp (unit not visible in this header)
+  double stopTs;                    // stop timestamp (unit not visible in this header)
+  struct group* next;               // next group event in queue
+};
+
+// arrays for different event objects
+// Each pool is described by the same four fields: <x>PoolSize, <x>PoolBase, <x>PoolIndex
+// and the <x>Pool array itself.
+// NOTE(review): presumably Size is the capacity of the array, and Base/Index are the
+// allocator bookkeeping -- their exact semantics live in the .c files, confirm there.
+struct context {
+  const char* commName;             // communicator name
+  uint64_t commHash;                // communicator hash
+  int nranks;                       // number of ranks
+  int rank;                         // rank
+
+  // pool of group events
+  int groupPoolSize;
+  int groupPoolBase;
+  int groupPoolIndex;
+  struct group* groupPool;
+
+  // pool of collective events
+  int collPoolSize;
+  int collPoolBase;
+  int collPoolIndex;
+  struct collective* collPool;
+
+  // pool of point-to-point events
+  int p2pPoolSize;
+  int p2pPoolBase;
+  int p2pPoolIndex;
+  struct p2p* p2pPool;
+
+  // pool of proxy control events
+  int proxyCtrlPoolSize;
+  int proxyCtrlPoolBase;
+  int proxyCtrlPoolIndex;
+  struct proxyCtrl* proxyCtrlPool;
+};
+
+// FIFO queue of task events (collective/p2p) owned by a group; implemented in event.c.
+// Returns non-zero when the queue has no event (eventHead is NULL).
+int taskEventQueueEmpty(struct group* g);
+// Appends event at the tail of the queue and resets its next pointer.
+void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
+// Returns the event at the head of the queue without removing it.
+struct taskEventBase* taskEventQueueHead(struct group* g);
+// Removes and returns the event at the head of the queue.
+struct taskEventBase* taskEventQueueDequeue(struct group* g);
+
+#endif
--- a/ext-profiler/example/nccl/common.h
+++ b/ext-profiler/example/nccl/common.h
@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+// Log severity passed as the first argument of the logger callback.
+typedef enum {
+  NCCL_LOG_NONE    = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN    = 2,
+  NCCL_LOG_INFO    = 3,
+  NCCL_LOG_ABORT   = 4,
+  NCCL_LOG_TRACE   = 5
+} ncclDebugLogLevel;
+
+// Subsystem flags; every value except NCCL_ALL is a single bit.
+typedef enum {
+  NCCL_INIT      = 1 << 0,
+  NCCL_COLL      = 1 << 1,
+  NCCL_P2P       = 1 << 2,
+  NCCL_SHM       = 1 << 3,
+  NCCL_NET       = 1 << 4,
+  NCCL_GRAPH     = 1 << 5,
+  NCCL_TUNING    = 1 << 6,
+  NCCL_ENV       = 1 << 7,
+  NCCL_ALLOC     = 1 << 8,
+  NCCL_CALL      = 1 << 9,
+  NCCL_PROXY     = 1 << 10,
+  NCCL_NVLS      = 1 << 11,
+  NCCL_BOOTSTRAP = 1 << 12,
+  NCCL_REG       = 1 << 13,
+  NCCL_ALL       = ~0
+} ncclDebugLogSubSys;
+
+// Printf-style logger callback: severity, subsystem flags, call-site file and
+// line, then the format string and its arguments.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
--- a/ext-profiler/example/nccl/err.h
+++ b/ext-profiler/example/nccl/err.h
@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Result codes returned by plugin entry points; zero means success. */
+typedef enum {
+  ncclSuccess            = 0,
+  ncclUnhandledCudaError = 1,
+  ncclSystemError        = 2,
+  ncclInternalError      = 3,
+  ncclInvalidArgument    = 4,
+  ncclInvalidUsage       = 5,
+  ncclRemoteError        = 6
+} ncclResult_t;
+
+#endif
--- a/ext-profiler/example/nccl/net_ib_v1.h
+++ b/ext-profiler/example/nccl/net_ib_v1.h
@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_IB_V1_H_
+#define NET_IB_V1_H_
+
+#define NCCL_PROFILER_NET_IB_VER 1
+
+// Event types defined by the IB network plugin (one bit per type).
+enum {
+  ncclProfileQp = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct.
+//
+// Version 1 descriptor for events produced by the IB network plugin.
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    // queue pair event attributes (the only event type declared in this header is ncclProfileQp)
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+
+#endif
--- a/ext-profiler/example/nccl/net_socket_v1.h
+++ b/ext-profiler/example/nccl/net_socket_v1.h
@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_SOCKET_V1_H_
+#define NET_SOCKET_V1_H_
+
+#define NCCL_PROFILER_NET_SOCKET_VER 1
+
+// Event types defined by the socket network plugin (one bit per type).
+enum {
+  ncclProfileSocket = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct.
+//
+// Version 1 descriptor for events produced by the socket network plugin.
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    // socket event attributes
+    struct {
+      int fd;          // NOTE(review): presumably the socket file descriptor - confirm
+      int op;          // NOTE(review): operation code; encoding is not defined in this header
+      size_t length;   // NOTE(review): presumably the transfer length in bytes - confirm
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+
+#endif
--- a/ext-profiler/example/nccl/profiler.h
+++ b/ext-profiler/example/nccl/profiler.h
@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_H_
+#define PROFILER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+#include "err.h"
+
+// Event type bits. Every enumerator is a distinct single bit, so several types
+// can be OR-ed together into an activation mask.
+enum {
+  ncclProfileGroup     = 0x01,  // group event type
+  ncclProfileColl      = 0x02,  // host collective call event type
+  ncclProfileP2p       = 0x04,  // host point-to-point call event type
+  ncclProfileProxyOp   = 0x08,  // proxy operation event type
+  ncclProfileProxyStep = 0x10,  // proxy step event type
+  ncclProfileProxyCtrl = 0x20,  // proxy control event type
+  ncclProfileKernelCh  = 0x40,  // kernel channel event type
+  ncclProfileNetPlugin = 0x80,  // network plugin-defined events
+};
+
+// Event state identifiers passed to recordEventState. One enum is shared by
+// all profiler interface versions (see the _v1.._v4 aliases). The numeric
+// values are fixed explicitly and do NOT follow declaration order: the _v4
+// states (19, 20) are listed next to the states they relate to.
+typedef enum {
+  ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait     = 8,
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,
+  ncclProfilerProxyStepSendWait        = 9,
+  ncclProfilerProxyStepRecvWait        = 10,
+  ncclProfilerProxyStepRecvFlushWait   = 11,
+  ncclProfilerProxyStepRecvGPUWait     = 12,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle            = 13,
+  ncclProfilerProxyCtrlActive          = 14,
+  ncclProfilerProxyCtrlSleep           = 15,
+  ncclProfilerProxyCtrlWakeup          = 16,
+  ncclProfilerProxyCtrlAppend          = 17,
+  ncclProfilerProxyCtrlAppendEnd       = 18,
+
+  /* Network defined events states */
+  ncclProfilerNetPluginUpdate          = 21,
+
+  /* Kernel event states */
+  ncclProfilerKernelChStop             = 22,
+} ncclProfilerEventState_t;
+
+// Every interface version uses the same state enum; the versioned names exist
+// so each profiler_vN.h can refer to its own type name.
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
+
+#include "profiler_v4.h"
+#include "profiler_v3.h"
+#include "profiler_v2.h"
+#include "profiler_v1.h"
+#include "profiler_net.h"
+
+// Unversioned names resolve to the v4 interface.
+typedef ncclProfiler_v4_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
+
+#endif // end include guard
--- a/ext-profiler/example/nccl/profiler_net.h
+++ b/ext-profiler/example/nccl/profiler_net.h
@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_NET_H_
+#define PROFILER_NET_H_
+
+// Layout of the network plugin identifier: the low NCCL_PROFILER_NET_VER_BITS
+// bits are selected by the version mask and the bits above them by the type
+// mask (0x0000FFFF and 0xFFFF0000 respectively when unsigned int is 32 bits).
+#define NCCL_PROFILER_NET_VER_BITS  (16)
+#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)
+#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
+
+// Network plugin type codes, stored above the version bits of the identifier.
+typedef enum {
+  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),
+  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
+} ncclProfilerNetType;
+
+#include "net_ib_v1.h"
+#include "net_socket_v1.h"
+
+#endif
--- a/ext-profiler/example/nccl/profiler_v1.h
+++ b/ext-profiler/example/nccl/profiler_v1.h
@ -0,0 +1,109 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V1_H_
+#define PROFILER_V1_H_
+
+#include <stdint.h>
+
+// Version 1 event descriptor passed to startEvent. `type` tells which member
+// of the anonymous union describes the event. In this version func, datatype,
+// algo and proto are carried as numeric codes.
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    // collective call attributes
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    // point-to-point call attributes
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    // proxy operation attributes
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    // proxy step attributes
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+// Version 1 optional arguments for recordEventState; the member that applies
+// depends on the kind of event being updated.
+typedef union {
+  // proxy operation update
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  // proxy control update
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+// Version 1 profiler plugin interface: a name plus the plugin's callbacks.
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
--- a/ext-profiler/example/nccl/profiler_v2.h
+++ b/ext-profiler/example/nccl/profiler_v2.h
@ -0,0 +1,106 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+#include <stdint.h>
+
+// Version 2 event descriptor passed to startEvent. `type` tells which member
+// of the anonymous union describes the event. In this version func, datatype,
+// algo and proto are carried as strings.
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    // collective call attributes
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    // point-to-point call attributes
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    // proxy operation attributes
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    // proxy step attributes
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+// Version 2 optional arguments for recordEventState; the member that applies
+// depends on the kind of event being updated.
+typedef union {
+  // proxy operation update
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  // proxy control update
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+// Version 2 profiler plugin interface: a name plus the plugin's callbacks.
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
--- a/ext-profiler/example/nccl/profiler_v3.h
+++ b/ext-profiler/example/nccl/profiler_v3.h
@ -0,0 +1,114 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+#include <stdint.h>
+
+// Version 3 event descriptor passed to startEvent. `type` tells which member
+// of the anonymous union describes the event. Compared with version 2 this
+// layout drops coll.trafficBytes and adds the kernelCh and netPlugin members.
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    // collective call attributes
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    // point-to-point call attributes
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    // proxy operation attributes
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    // proxy step attributes
+    struct {
+      int step;
+    } proxyStep;
+
+    // kernel channel attributes
+    struct {
+      uint8_t channelId;
+    } kernelCh;
+
+    // network plugin event attributes
+    struct {
+      int64_t id;               // NOTE(review): presumably the plugin identifier (type and version bits) - confirm
+      void* data;               // NOTE(review): presumably points to the plugin-defined descriptor - confirm
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+// Version 3 optional arguments for recordEventState; the member that applies
+// depends on the kind of event being updated.
+typedef union {
+  // proxy operation update
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  // proxy control update
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+// Version 3 profiler plugin interface: a name plus the plugin's callbacks.
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
--- a/ext-profiler/example/nccl/profiler_v4.h
+++ b/ext-profiler/example/nccl/profiler_v4.h
@ -0,0 +1,123 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V4_H_
+#define PROFILER_V4_H_
+
+// Version 4 event descriptor passed to startEvent. `type` tells which member
+// of the anonymous union describes the event. Compared with version 3, the
+// coll and p2p members no longer carry name/commHash (the v4 init callback
+// receives commName/commHash instead), coll.nMaxChannels is replaced by
+// coll.nChannels, p2p gains nChannels and kernelCh gains pTimer.
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    // collective call attributes
+    struct {
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    // point-to-point call attributes
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+    } p2p;
+
+    // proxy operation attributes
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    // proxy step attributes
+    struct {
+      int step;
+    } proxyStep;
+
+    // kernel channel attributes
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    // network plugin event attributes
+    struct {
+      int64_t id;               // NOTE(review): presumably the plugin identifier (type and version bits) - confirm
+      void* data;               // NOTE(review): presumably points to the plugin-defined descriptor - confirm
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v4_t;
+
+// Version 4 optional arguments for recordEventState; the member that applies
+// depends on the kind of event being updated.
+typedef union {
+  // proxy step update
+  struct {
+    size_t transSize;
+  } proxyStep;
+
+  // proxy control update
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  // network plugin update
+  struct {
+    void* data;
+  } netPlugin;
+
+  // kernel channel update
+  struct {
+    uint64_t pTimer;            // NOTE(review): presumably a GPU globaltimer timestamp like kernelCh.pTimer in the descriptor - confirm
+  } kernelCh;
+} ncclProfilerEventStateArgs_v4_t;
+
+// Version 4 profiler plugin interface: a name plus the plugin's callbacks.
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+
+#endif
--- a/ext-profiler/example/nccl/types.h
+++ b/ext-profiler/example/nccl/types.h
@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
+
+/* Data types; names that share a value are aliases of each other. */
+typedef enum {
+  ncclInt8     = 0,
+  ncclChar     = 0,
+  ncclUint8    = 1,
+  ncclInt32    = 2,
+  ncclInt      = 2,
+  ncclUint32   = 3,
+  ncclInt64    = 4,
+  ncclUint64   = 5,
+  ncclFloat16  = 6,
+  ncclHalf     = 6,
+  ncclFloat32  = 7,
+  ncclFloat    = 7,
+  ncclFloat64  = 8,
+  ncclDouble   = 8,
+  ncclBfloat16 = 9,
+} ncclDataType_t;
+
+#endif
--- a/ext-profiler/example/plugin.c
+++ b/ext-profiler/example/plugin.c
@ -0,0 +1,633 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <linux/limits.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <time.h>
+#include "event.h"
+#include "print_event.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+// Process-wide profiler state shared by every profiler context.
+static int initialized;             // initialization counter for profiler
+static double startTime;            // profiler start time
+
+// Built-in defaults, used when the matching NCCL_PROFILE_* environment
+// variable is not set.
+// NOTE(review): defaultEActivationMask is not referenced by exampleProfilerInit,
+// which falls back to 0 when NCCL_PROFILE_EVENT_MASK is unset - confirm this is intended.
+static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
+static const int defaultGroupPoolSize = 16;
+static const int defaultCollPoolSize = 16;
+static const int defaultP2pPoolSize = 1024;
+static const int defaultProxyCtrlPoolSize = 16;
+static const int defaultDetachPoolSize = 128;
+
+// Effective pool sizes, resolved once by the first exampleProfilerInit call.
+static int groupPoolSize;
+static int collPoolSize;
+static int p2pPoolSize;
+static int proxyCtrlPoolSize;
+static int detachPoolSize;
+// Detach pool bookkeeping; the pool is shared among all contexts.
+static int detachPoolBase;
+static int detachPoolIndex;
+static int detachPoolDone;
+static struct proxyOp* detachPool;
+
+// Logger callback supplied by the caller of exampleProfilerInit; INFO() must not
+// be used before it has been assigned.
+ncclDebugLogger_t logFn;
+// Logs at NCCL_LOG_INFO level. Note that the calling function's name (__func__)
+// is passed in the logger's `file` argument.
+#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+
+// Returns the current CLOCK_MONOTONIC reading expressed in microseconds.
+__hidden double gettime(void) {
+  struct timespec now;
+  clock_gettime(CLOCK_MONOTONIC, &now);
+  // seconds and nanoseconds are each scaled to microseconds, then summed
+  const double secondsInUs = now.tv_sec*1e6;
+  const double nanosInUs = now.tv_nsec*1e-3;
+  return (secondsInUs + nanosInUs);
+}
+
+// Serializes the one-time setup performed by the first exampleProfilerInit call.
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+// Pid of the process that initialized the profiler first.
+static pid_t pid;
+// Activation mask pointer received by the most recent exampleProfilerInit call.
+static int* eActivationMaskPtr;
+
+// exampleProfilerInit - create a profiler context for one communicator.
+//
+// The first caller also performs the process-wide setup under `lock`: it reads
+// the NCCL_PROFILE_* environment variables (event mask and pool sizes),
+// allocates the shared detach pool and records the pid and start time. Every
+// caller then allocates its own context with zero-initialized event pools.
+//
+// Input
+//  - commName, commHash, nranks, rank: communicator identity stored in the context
+//  - nNodes: not used by this plugin
+//  - logfn : logger stored in logFn and used by INFO()
+// Output
+//  - context        : receives the new context on success
+//  - eActivationMask: written by the first caller only, from NCCL_PROFILE_EVENT_MASK
+//                     (0 when the variable is unset)
+// Returns ncclSuccess, or ncclSystemError when an allocation fails. On failure
+// nothing allocated by this call is left behind, the reference this call took
+// on the shared state is dropped, and the shared detach pool is released only
+// if no other context still holds a reference to it.
+__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
+  pthread_mutex_lock(&lock);
+  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
+    // first thread initializes event mask, environment and detach pool
+    const char* str;
+    str = getenv("NCCL_PROFILE_EVENT_MASK");
+    __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
+
+    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
+    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
+
+    str = getenv("NCCL_PROFILE_COLL_POOL_SIZE");
+    collPoolSize = str ? atoi(str) : defaultCollPoolSize;
+
+    str = getenv("NCCL_PROFILE_P2P_POOL_SIZE");
+    p2pPoolSize = str ? atoi(str) : defaultP2pPoolSize;
+
+    str = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE");
+    proxyCtrlPoolSize = str ? atoi(str) : defaultProxyCtrlPoolSize;
+
+    str = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE");
+    detachPoolSize = str ? atoi(str) : defaultDetachPoolSize;
+
+    // detach pool is used to store PXN proxyOps and is shared among threads
+    detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
+    if (detachPool == NULL) {
+      // undo the count so that a later call retries the one-time setup
+      // instead of running with a missing detach pool
+      __atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED);
+      pthread_mutex_unlock(&lock);
+      return ncclSystemError;
+    }
+    // Pid of the process initializing the profiler first.
+    // This is compared against the pid of proxyOp events
+    // to figure out if they have a parent event in this
+    // process address space.
+    pid = getpid();
+
+    startTime = gettime();
+  }
+  pthread_mutex_unlock(&lock);
+
+  // store pointer to activation mask globally
+  eActivationMaskPtr = eActivationMask;
+
+  // pre-allocate memory for event object pools in dedicated profiler context
+  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
+  if (ctx == NULL) goto fail;
+  ctx->commName = commName;
+  ctx->commHash = commHash;
+  ctx->nranks = nranks;
+  ctx->rank = rank;
+  logFn = logfn;
+  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
+
+  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
+  if (ctx->groupPool == NULL) goto fail;
+
+  ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool));
+  if (ctx->collPool == NULL) goto fail;
+
+  ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool));
+  if (ctx->p2pPool == NULL) goto fail;
+
+  ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
+  if (ctx->proxyCtrlPool == NULL) goto fail;
+
+  *context = ctx;
+  return ncclSuccess;
+
+fail:
+  // release whatever this call allocated (free(NULL) is a no-op)
+  if (ctx) {
+    free(ctx->proxyCtrlPool);
+    free(ctx->p2pPool);
+    free(ctx->collPool);
+    free(ctx->groupPool);
+    free(ctx);
+  }
+  // Drop the reference taken at the top of this call. The detach pool is
+  // shared, so it is only released when no other context is left, and the
+  // pointer is cleared so nothing can reuse the freed memory.
+  pthread_mutex_lock(&lock);
+  if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) {
+    free(detachPool);
+    detachPool = NULL;
+  }
+  pthread_mutex_unlock(&lock);
+  return ncclSystemError;
+}
+
+// exampleProfilerFinalize - dump and destroy one profiler context.
+//
+// When NCCL_PROFILE_DUMP_FILE is set, the most recent group and proxy control
+// events of this context are written to "<dump>_<commHash>_<rank>.json". The
+// context and its pools are then freed. The call that drops the last
+// reference on the shared state also dumps and frees the shared detach pool.
+//
+// Input
+//  - context: context returned by exampleProfilerInit
+// Always returns ncclSuccess. If the dump file name does not fit in PATH_MAX
+// or the file cannot be opened, the dump is skipped and the cleanup still runs.
+__hidden ncclResult_t exampleProfilerFinalize(void* context) {
+  FILE* fh = NULL;
+  char filename[PATH_MAX] = { 0 };
+  struct context* ctx = (struct context *)context;
+  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
+  if (dump) {
+    // bounded formatting: the prefix comes from the environment and may be arbitrarily long
+    int len = snprintf(filename, sizeof(filename), "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
+    if (len > 0 && (size_t)len < sizeof(filename)) fh = fopen(filename, "w");
+    if (fh) {
+      fprintf(fh, "[\n");
+    } else {
+      INFO(NCCL_INIT, "PROFILER/Plugin: unable to open dump file for prefix %s, events will not be dumped", dump);
+    }
+  }
+  INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);
+
+  // print last N groups/collectives/p2ps
+  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
+  int end = ctx->groupPoolIndex;
+  for (int i = start; i < end; i++) {
+    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
+  }
+
+  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
+  end = ctx->proxyCtrlPoolIndex;
+  for (int i = start; i < end; i++) {
+    printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]);
+  }
+
+  free(ctx->groupPool);
+  free(ctx->collPool);
+  free(ctx->p2pPool);
+  free(ctx->proxyCtrlPool);
+  free(ctx);
+
+  // last thread cleans up shared detach pool; done under the same lock that
+  // exampleProfilerInit holds while it allocates the pool, so a concurrent
+  // init cannot have its freshly allocated pool freed from under it
+  pthread_mutex_lock(&lock);
+  if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) {
+    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
+    end = detachPoolIndex;
+    for (int i = start; i < end; i++) {
+      printEvent(fh, &detachPool[i%detachPoolSize]);
+    }
+    free(detachPool);
+    detachPool = NULL;
+  }
+  pthread_mutex_unlock(&lock);
+
+  if (fh) {
+    fprintf(fh, "{}]\n");
+    fclose(fh);
+  }
+
+  return ncclSuccess;
+}
+
+__hidden void updateEvent(void* handle);
+
+// Start recording the event described by eDescr and hand it back through eHandle.
+//
+// context: plugin context, cast to struct context*.
+// eHandle: out parameter. Set to the event object acquired for this event, or
+//          left NULL when the event is dropped (event pool exhausted, NULL
+//          parent object, or an event/net plugin type not handled here).
+// eDescr:  event descriptor; eDescr->type selects which kind of event is started.
+//
+// Always returns ncclSuccess: dropping an event is not reported as an error.
+__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
+  *eHandle = NULL;
+  struct context* ctx = (struct context *)context;
+  if (eDescr->type == ncclProfileGroup) {
+    struct group* event;
+    int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
+      // if there are available group events grab one
+      event = &ctx->groupPool[groupId%groupPoolSize];
+      // the slot is being reused: drain the task events still queued on it
+      while (!taskEventQueueEmpty(event)) {
+        struct taskEventBase* base = taskEventQueueDequeue(event);
+        if (base->type == ncclProfileColl) {
+          struct collective* c = (struct collective *)base;
+          // reset event proxyOps & proxySteps
+          memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
+          // release collective events in the group and return them to the collective pool
+          __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
+        } else if (base->type == ncclProfileP2p) {
+          struct p2p* p = (struct p2p *)base;
+          // reset event proxyOp and proxySteps
+          memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+          // release p2p events in the group and return them to the p2p pool
+          __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
+        }
+      }
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileGroup;
+    event->ctx = ctx;
+    event->groupId = groupId;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+    debugEvent(event, "GroupStart");
+  } else if (eDescr->type == ncclProfileColl) {
+    // the parent might be null if we run out of events
+    struct group* parent = (struct group *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    struct collective* event;
+    int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) {
+      // if there are available collective events grab one
+      event = &ctx->collPool[collId%collPoolSize];
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+
+    event->base.type = ncclProfileColl;
+    event->base.rank = eDescr->rank;
+    event->base.func = eDescr->coll.func;
+    event->base.startTs = gettime() - startTime;
+    event->base.parent = parent;
+    event->seqNumber = eDescr->coll.seqNumber;
+    event->sendBuff = eDescr->coll.sendBuff;
+    event->recvBuff = eDescr->coll.recvBuff;
+    event->count = eDescr->coll.count;
+    event->root = eDescr->coll.root;
+    event->datatype = eDescr->coll.datatype;
+    event->nChannels = eDescr->coll.nChannels;
+    event->nWarps = eDescr->coll.nWarps;
+    event->algo = eDescr->coll.algo;
+    event->proto = eDescr->coll.proto;
+    *eHandle = event;
+    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
+    // increment the group ref counter so the event will stay open
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    debugEvent(event, "CollStart");
+  } else if (eDescr->type == ncclProfileP2p) {
+    // the parent might be null if we run out of events
+    struct group* parent = (struct group *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    struct p2p* event;
+    int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) {
+      // if there are available p2p events grab one
+      event = &ctx->p2pPool[p2pId%p2pPoolSize];
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+
+    event->base.type = ncclProfileP2p;
+    event->base.rank = eDescr->rank;
+    event->base.func = eDescr->p2p.func;
+    event->base.next = parent->eventHead;
+    event->base.startTs = gettime() - startTime;
+    event->base.parent = parent;
+    event->buff = eDescr->p2p.buff;
+    event->count = eDescr->p2p.count;
+    event->datatype = eDescr->p2p.datatype;
+    event->peer = eDescr->p2p.peer;
+    event->nChannels = eDescr->p2p.nChannels;
+    *eHandle = event;
+    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
+    // increment the group ref counter so the event will stay open
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    debugEvent(event, "P2pStart");
+  } else if (eDescr->type == ncclProfileProxyCtrl) {
+    // proxyCtrl slots are taken round-robin; no availability check is made here
+    int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED);
+    struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize];
+    event->type = ncclProfileProxyCtrl;
+    event->ctx = ctx;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileProxyOp) {
+    // the eventBase might be null if we run out of events
+    struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
+    if (eventBase == NULL) return ncclSuccess;
+
+    if (eDescr->proxyOp.pid != pid) {
+      // PXN captured proxyOp events
+      struct proxyOp* event;
+      int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED);
+      // detachPoolBase is updated with atomic stores in updateEvent, so read it atomically as well
+      if ((detachId - __atomic_load_n(&detachPoolBase, __ATOMIC_RELAXED)) < detachPoolSize) {
+        // if there are available detached proxyOp events grab one
+        event = &detachPool[detachId%detachPoolSize];
+      } else {
+        // else drop this event
+        __atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED);
+        return ncclSuccess;
+      }
+
+      event->type = ncclProfileProxyOp;
+      event->channelId = eDescr->proxyOp.channelId;
+      event->pid = eDescr->proxyOp.pid;
+      event->rank = eDescr->rank;
+      event->peer = eDescr->proxyOp.peer;
+      event->nSteps = eDescr->proxyOp.nSteps;
+      event->chunkSize = eDescr->proxyOp.chunkSize;
+      event->isSend = eDescr->proxyOp.isSend;
+      event->startTs = gettime() - startTime;
+      event->parent = NULL;
+      event->stepCount = 0;
+      *eHandle = event;
+      debugEvent(event, "PxnProxyOpStart");
+      return ncclSuccess;
+    }
+
+    if (eventBase->type == ncclProfileColl) {
+      struct collective* parent = (struct collective *)eDescr->parentObj;
+      int channelId = eDescr->proxyOp.channelId;
+      // take the next free proxyOp slot of this channel inside the parent collective
+      struct proxyOp* event = &parent->op[channelId][parent->nProxyOps[channelId]++];
+
+      event->type = ncclProfileProxyOp;
+      event->channelId = channelId;
+      event->pid = eDescr->proxyOp.pid;
+      event->rank = eDescr->rank;
+      event->peer = eDescr->proxyOp.peer;
+      event->nSteps = eDescr->proxyOp.nSteps;
+      event->chunkSize = eDescr->proxyOp.chunkSize;
+      event->isSend = eDescr->proxyOp.isSend;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      event->stepCount = 0;
+      *eHandle = event;
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      debugEvent(event, "ProxyOpStart");
+    } else { // ncclProfileP2p
+      struct p2p* parent = (struct p2p *)eDescr->parentObj;
+      int channelId = eDescr->proxyOp.channelId;
+      struct proxyOp* event = &parent->op[channelId];
+      event->type = ncclProfileProxyOp;
+      event->channelId = channelId;
+      event->pid = eDescr->proxyOp.pid;
+      event->rank = eDescr->rank;
+      event->peer = eDescr->proxyOp.peer;
+      event->nSteps = eDescr->proxyOp.nSteps;
+      event->chunkSize = eDescr->proxyOp.chunkSize;
+      event->isSend = eDescr->proxyOp.isSend;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      event->stepCount = 0;
+      *eHandle = event;
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      debugEvent(event, "ProxyOpStart");
+    }
+  } else if (eDescr->type == ncclProfileProxyStep) {
+    // the parent might be null if we run out of events
+    struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    // step slots are reused modulo MAX_STEPS
+    int s = parent->stepCount++ % MAX_STEPS;
+    struct proxyStep* event = &parent->step[s];
+    event->type = ncclProfileProxyStep;
+    event->state = 0;
+    event->step = eDescr->proxyStep.step;
+    event->parent = parent;
+    event->isSend = parent->isSend;
+    event->startTs = gettime() - startTime;
+    event->nNetEvents = 0;
+    *eHandle = event;
+    debugEvent(event, "ProxyStepStart");
+  } else if (eDescr->type == ncclProfileKernelCh) {
+    struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
+    if (eventBase == NULL) return ncclSuccess;
+    if (eventBase->type == ncclProfileColl) {
+      struct collective* parent = (struct collective *)eDescr->parentObj;
+      struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
+      event->type = ncclProfileKernelCh;
+      event->channelId = eDescr->kernelCh.channelId;
+      event->startGpuClk = eDescr->kernelCh.pTimer;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      *eHandle = event;
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      debugEvent(event, "KernelChStart");
+    } else { // ncclProfileP2p
+      struct p2p* parent = (struct p2p *)eDescr->parentObj;
+      struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
+      event->type = ncclProfileKernelCh;
+      event->channelId = eDescr->kernelCh.channelId;
+      event->startGpuClk = eDescr->kernelCh.pTimer;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      *eHandle = event;
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      debugEvent(event, "KernelChStart");
+    }
+  } else if (eDescr->type == ncclProfileNetPlugin) {
+    struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    // the plugin id carries both the network type and the descriptor version
+    int64_t pluginId = eDescr->netPlugin.id;
+    int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
+    int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
+    if (type == NCCL_PROFILER_NET_TYPE_IB) {
+      if (ver == 1) {
+        ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
+        struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
+        event->type = ncclProfileNetPlugin;
+        event->pluginType = type;
+        event->pluginVer = ver;
+        if (descr->type == ncclProfileQp) {
+          event->pluginEvent = ncclProfileQp;
+          event->qp.device = descr->qp.device;
+          event->qp.wr_id = descr->qp.wr_id;
+          event->qp.opcode = descr->qp.opcode;
+          event->qp.qpNum = descr->qp.qpNum;
+          event->qp.length = descr->qp.length;
+        }
+        event->startTs = gettime() - startTime;
+        *eHandle = event;
+        debugEvent(event, "NetPluginStart");
+      }
+    } else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
+      if (ver == 1) {
+        ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
+        struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
+        event->type = ncclProfileNetPlugin;
+        event->pluginType = type;
+        event->pluginVer = ver;
+        if (descr->type == ncclProfileSocket) {
+          event->pluginEvent = ncclProfileSocket;
+          event->sock.fd = descr->sock.fd;
+          event->sock.op = descr->sock.op;
+          event->sock.length = descr->sock.length;
+        }
+        event->startTs = gettime() - startTime;
+        *eHandle = event;
+        debugEvent(event, "NetPluginStart");
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+// Mark the event behind handle as stopped. The event type is read from the
+// first byte of the object, so handle must not be NULL.
+// Group, collective and p2p events are reference counted: their stop timestamp
+// is written only when the count reaches zero. At that point a group advances
+// the group pool base, while a collective or p2p forwards the update to its parent.
+void updateEvent(void* handle) {
+  const uint8_t kind = *(uint8_t *)handle;
+
+  if (kind == ncclProfileGroup) {
+    struct group* grp = (struct group *)handle;
+    if (__atomic_sub_fetch(&grp->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      grp->stopTs = gettime() - startTime;
+      // last reference gone: return group event to the pool
+      __atomic_fetch_add(&grp->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
+    }
+    debugEvent(grp, "GroupStop");
+    return;
+  }
+
+  if (kind == ncclProfileColl) {
+    struct collective* coll = (struct collective *)handle;
+    if (__atomic_sub_fetch(&coll->base.refCount, 1, __ATOMIC_RELAXED) != 0) {
+      // other references are still open
+      debugEvent(coll, "CollStop");
+      return;
+    }
+    coll->base.stopTs = gettime() - startTime;
+    debugEvent(coll, "CollStop");
+    updateEvent(coll->base.parent);
+    return;
+  }
+
+  if (kind == ncclProfileP2p) {
+    struct p2p* xfer = (struct p2p *)handle;
+    if (__atomic_sub_fetch(&xfer->base.refCount, 1, __ATOMIC_RELAXED) != 0) {
+      // other references are still open
+      debugEvent(xfer, "P2pStop");
+      return;
+    }
+    xfer->base.stopTs = gettime() - startTime;
+    debugEvent(xfer, "P2pStop");
+    updateEvent(xfer->base.parent);
+    return;
+  }
+
+  if (kind == ncclProfileProxyOp) {
+    struct proxyOp* op = (struct proxyOp *)handle;
+    op->stopTs = gettime() - startTime;
+    if (op->pid == pid) {
+      // proxyOp owned by this process: propagate to the parent collective/p2p
+      updateEvent(op->parent);
+      debugEvent(op, "ProxyOpStop");
+      return;
+    }
+    // only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
+    int completed = __atomic_add_fetch(&detachPoolDone, 1, __ATOMIC_RELAXED);
+    if (completed == detachPoolSize) {
+      // reset the event completed (done) counter
+      __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
+      // update the base pointer to the top of the pool
+      int top = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED);
+      __atomic_store_n(&detachPoolBase, top, __ATOMIC_RELAXED);
+    }
+    debugEvent(op, "ProxyOpStop");
+    return;
+  }
+
+  if (kind == ncclProfileProxyStep) {
+    struct proxyStep* step = (struct proxyStep *)handle;
+    step->stopTs = gettime() - startTime;
+    debugEvent(step, "ProxyStepStop");
+    return;
+  }
+
+  if (kind == ncclProfileProxyCtrl) {
+    struct proxyCtrl* ctrl = (struct proxyCtrl *)handle;
+    ctrl->stopTs = gettime() - startTime;
+    debugEvent(ctrl, "ProxyCtrlStop");
+    return;
+  }
+
+  if (kind == ncclProfileKernelCh) {
+    struct kernelCh* kern = (struct kernelCh *)handle;
+    kern->stopTs = gettime() - startTime;
+    updateEvent(kern->parent);
+    debugEvent(kern, "KernelChStop");
+    return;
+  }
+
+  if (kind == ncclProfileNetPlugin) {
+    struct netPlugin* net = (struct netPlugin *)handle;
+    net->stopTs = gettime() - startTime;
+    debugEvent(net, "NetPluginStop");
+  }
+}
+
+// Stop callback for an event previously returned by exampleProfilerStartEvent.
+// Always returns ncclSuccess.
+__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
+  // the event handle might be null if we run out of events
+  if (eHandle == NULL) return ncclSuccess;
+
+  // For group, collective and p2p events a stop coming from NCCL core does not
+  // mean the operation has completed, only that it was submitted/enqueued.
+  // Record a timestamp and keep the event open; every other event type is
+  // handed to updateEvent.
+  const uint8_t kind = *(uint8_t *)eHandle;
+  if (kind == ncclProfileGroup) {
+    ((struct group *)eHandle)->stopTs = gettime() - startTime;
+  } else if (kind == ncclProfileColl) {
+    ((struct collective *)eHandle)->base.stopTs = gettime() - startTime;
+  } else if (kind == ncclProfileP2p) {
+    ((struct p2p *)eHandle)->base.stopTs = gettime() - startTime;
+  } else {
+    updateEvent(eHandle);
+  }
+  return ncclSuccess;
+}
+
+// Record a state transition (eState) on an open event.
+// eStateArgs is only read for the states that carry a payload: proxy step
+// SendWait/RecvFlushWait (transSize), proxy ctrl AppendEnd (appendedProxyOps)
+// and kernel channel Stop (pTimer). Always returns ncclSuccess.
+__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
+  // the event handle might be null if we run out of events
+  if (eHandle == NULL) return ncclSuccess;
+
+  const uint8_t kind = *(uint8_t *)eHandle;
+  if (kind == ncclProfileProxyOp) {
+    struct proxyOp* op = (struct proxyOp *)eHandle;
+    if (eState == ncclProfilerProxyOpInProgress_v4) {
+      op->progrTs = gettime() - startTime;
+    }
+  } else if (kind == ncclProfileProxyStep) {
+    struct proxyStep* step = (struct proxyStep *)eHandle;
+    struct proxyOp* owner = step->parent;
+    switch (eState) {
+      case ncclProfilerProxyStepSendGPUWait:
+        step->timestamp[PROXY_STEP_SEND_GPU_WAIT] = gettime() - startTime;
+        break;
+      case ncclProfilerProxyStepSendPeerWait_v4:
+        // only the first SendPeerWait notification updates the step event
+        if (step->state != ncclProfilerProxyStepSendPeerWait_v4) {
+          step->timestamp[PROXY_STEP_SEND_PEER_WAIT] = gettime() - startTime;
+          step->state = ncclProfilerProxyStepSendPeerWait_v4;
+        }
+        break;
+      case ncclProfilerProxyStepSendWait:
+        step->timestamp[PROXY_STEP_SEND_WAIT] = gettime() - startTime;
+        // accumulate the transferred size on the parent proxyOp
+        owner->transSize += eStateArgs->proxyStep.transSize;
+        break;
+      case ncclProfilerProxyStepRecvWait:
+        step->timestamp[PROXY_STEP_RECV_WAIT] = gettime() - startTime;
+        break;
+      case ncclProfilerProxyStepRecvFlushWait:
+        step->timestamp[PROXY_STEP_RECV_FLUSH_WAIT] = gettime() - startTime;
+        // accumulate the transferred size on the parent proxyOp
+        owner->transSize += eStateArgs->proxyStep.transSize;
+        break;
+      case ncclProfilerProxyStepRecvGPUWait:
+        step->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
+        break;
+      default:
+        // other states are not recorded for proxy steps
+        break;
+    }
+  } else if (kind == ncclProfileProxyCtrl) {
+    struct proxyCtrl* ctrl = (struct proxyCtrl *)eHandle;
+    if (eState == ncclProfilerProxyCtrlAppendEnd) {
+      ctrl->appended = eStateArgs->proxyCtrl.appendedProxyOps;
+    }
+    // the latest state is always remembered on the control event
+    ctrl->state = eState;
+  } else if (kind == ncclProfileKernelCh) {
+    struct kernelCh* kern = (struct kernelCh *)eHandle;
+    if (eState == ncclProfilerKernelChStop) {
+      kern->stopGpuClk = eStateArgs->kernelCh.pTimer;
+    }
+  }
+  debugEvent(eHandle, "RecordEventState");
+  return ncclSuccess;
+}
+
+// Profiler plugin descriptor: the plugin name string followed by the plugin's
+// entry points, given as positional initializers of ncclProfiler_t.
+ncclProfiler_t ncclProfiler_v4 = {
+  "Example-profiler",
+  exampleProfilerInit,
+  exampleProfilerStartEvent,
+  exampleProfilerStopEvent,
+  exampleProfilerRecordEventState,
+  exampleProfilerFinalize,
+};
+
+// Store eActivationMask into the activation mask pointed to by eActivationMaskPtr.
+// Does nothing while the initialized counter is zero. Always returns ncclSuccess.
+int exampleProfilerStart(int eActivationMask) {
+  if (!__atomic_load_n(&initialized, __ATOMIC_RELAXED)) return ncclSuccess;
+  __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
+  return ncclSuccess;
+}
+
+// Clear the activation mask pointed to by eActivationMaskPtr (store 0).
+// Does nothing while the initialized counter is zero. Always returns ncclSuccess.
+int exampleProfilerStop(void) {
+  if (!__atomic_load_n(&initialized, __ATOMIC_RELAXED)) return ncclSuccess;
+  __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
+  return ncclSuccess;
+}
--- a/ext-profiler/example/plugin.h
+++ b/ext-profiler/example/plugin.h
@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PLUGIN_H_
+#define PLUGIN_H_
+
+// Store eActivationMask as the profiler's activation mask; no effect if the profiler is not initialized.
+int exampleProfilerStart(int eActivationMask);
+// Reset the profiler's activation mask to 0; no effect if the profiler is not initialized.
+int exampleProfilerStop(void);
+
+#endif
--- a/ext-profiler/example/print_event.c
+++ b/ext-profiler/example/print_event.c
@ -0,0 +1,294 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include "profiler.h"
+#include "event.h"
+#include "print_event.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+// FIXME: chrome tracing asynchronous events (used below) allow event nesting for events that have the same id and category.
+// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
+// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
+// Per-thread id used for the "id" field of GROUP trace events.
+static __thread int groupId;
+// Write the begin ("b") trace record of a group event to fh.
+__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
+  const int self = getpid();
+  const int id = groupId;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
+          "Group", id, self, 1, event->startTs, event->groupId);
+}
+
+// Write the end ("e") trace record of a group event to fh and advance the group trace id.
+__hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
+  const int self = getpid();
+  const int id = groupId++;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "Group", id, self, 1, event->stopTs);
+}
+
+// Per-thread id used for the "id" field of COLL trace events.
+static __thread int collId;
+// Write the begin ("b") trace record of a collective event to fh, named after the collective function.
+__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
+  const int self = getpid();
+  const int id = collId;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
+          event->base.func, id, self, 1, event->base.startTs,
+          event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count,
+          event->datatype, event->algo, event->proto, event->nChannels);
+}
+
+// Write the end ("e") trace record of a collective event to fh and advance the collective trace id.
+__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
+  const int self = getpid();
+  const int id = collId++;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          event->base.func, id, self, 1, event->base.stopTs);
+}
+
+// Per-thread id used for the "id" field of P2P trace events.
+static __thread int p2pId;
+// Write the begin ("b") trace record of a p2p event to fh, named after the p2p function.
+__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
+  const int self = getpid();
+  const int id = p2pId;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
+          event->base.func, id, self, 1, event->base.startTs,
+          event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count,
+          event->datatype, event->nChannels);
+}
+
+// Write the end ("e") trace record of a p2p event to fh and advance the p2p trace id.
+__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
+  const int self = getpid();
+  const int id = p2pId++;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          event->base.func, id, self, 1, event->base.stopTs);
+}
+
+// Per-thread id used for the "id" field of proxyOp trace events.
+static __thread int proxyOpId;
+// Write the opening records of a proxyOp event to fh: a complete "Schedule"
+// interval (startTs to progrTs) followed by the begin record of the "Progress"
+// interval. The Send/Recv suffix of the names follows event->isSend.
+__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
+  const char* scheduleName = event->isSend ? "ScheduleSend" : "ScheduleRecv";
+  const char* progressName = event->isSend ? "ProgressSend" : "ProgressRecv";
+  const int self = getpid();
+  const int id = proxyOpId;
+
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n",
+          scheduleName, id, self, 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          scheduleName, id, self, 1, event->progrTs);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n",
+          progressName, id, self, 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize);
+}
+
+// Write the end ("e") record of the proxyOp "Progress" interval to fh and advance the proxyOp trace id.
+__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
+  const char* progressName = event->isSend ? "ProgressSend" : "ProgressRecv";
+  const int self = getpid();
+  const int id = proxyOpId++;
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          progressName, id, self, 1, event->stopTs);
+}
+
+// Per-thread id used for the "id" field of proxyStep (NET) trace events.
+static __thread int proxyStepId;
+// Write the opening records of a proxyStep event to fh.
+// Recv steps only open "RecvWait". Send steps emit the complete "SendGpuWait"
+// and "SendPeerWait" intervals and then open "SendWait".
+__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
+  const int self = getpid();
+  const int id = proxyStepId;
+
+  if (!event->isSend) {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "RecvWait", id, self, 1, event->timestamp[PROXY_STEP_RECV_WAIT], event->step);
+    return;
+  }
+
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+          "SendGpuWait", id, self, 1, event->timestamp[PROXY_STEP_SEND_GPU_WAIT], event->step);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "SendGpuWait", id, self, 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT]);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+          "SendPeerWait", id, self, 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT], event->step);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "SendPeerWait", id, self, 1, event->timestamp[PROXY_STEP_SEND_WAIT]);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+          "SendWait", id, self, 1, event->timestamp[PROXY_STEP_SEND_WAIT], event->step);
+}
+
+// Write the closing records of a proxyStep event to fh and advance the proxyStep trace id.
+// Send steps close "SendWait". Recv steps close "RecvWait" and then emit the
+// complete "RecvFlushWait" and "RecvGpuWait" intervals.
+__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
+  const int self = getpid();
+  // every record written here shares one id; the counter moves on for the next step
+  const int id = proxyStepId++;
+
+  if (event->isSend) {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "SendWait", id, self, 1, event->stopTs);
+    return;
+  }
+
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "RecvWait", id, self, 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT]);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+          "RecvFlushWait", id, self, 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT], event->step);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "RecvFlushWait", id, self, 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT]);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+          "RecvGpuWait", id, self, 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT], event->step);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "RecvGpuWait", id, self, 1, event->stopTs);
+}
+
+// Per-thread id used for the "id" field of GPU (kernel channel) trace events.
+static __thread int kernelId;
+// Write the begin ("b") trace record of a kernel channel event to fh.
+// Slots whose type is not ncclProfileKernelCh are skipped.
+__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
+  if (event->type == ncclProfileKernelCh) {
+    const int self = getpid();
+    const int id = kernelId;
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"StartGpuClk\": %lu, \"StopGpuClk\": %lu}},\n",
+            "KernelCh", id, self, 1, event->startTs, event->channelId, event->startGpuClk, event->stopGpuClk);
+  }
+}
+
+// Write the end ("e") trace record of a kernel channel event to fh.
+// Slots whose type is not ncclProfileKernelCh are skipped (matching the header,
+// so begin/end records stay paired).
+__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
+  if (event->type != ncclProfileKernelCh) return;
+  // advance the id once the event is closed, like every other event trailer,
+  // so that successive kernel channel events do not all share the same trace id
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "KernelCh", kernelId++, getpid(), 1, event->stopTs);
+}
+
+// Per-thread id used for the "id" field of proxy control trace events.
+static __thread int proxyCtrlId;
+// Write a complete begin/end pair for a proxy control event to fh.
+// The record name is derived from the last recorded state: Idle/Active -> "Idle",
+// Sleep/Wakeup -> "Sleep", Append/AppendEnd -> "Append". Events in any other
+// state are not printed. Only AppendEnd carries the "appended" argument.
+__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
+  const char* label = NULL;
+  if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) {
+    label = "Idle";
+  } else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) {
+    label = "Sleep";
+  } else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) {
+    label = "Append";
+  }
+  if (label == NULL) return;
+
+  const int self = getpid();
+  const int id = proxyCtrlId++;
+  if (event->state != ncclProfilerProxyCtrlAppendEnd) {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            label, id, self, 1, event->startTs);
+  } else {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
+            label, id, self, 1, event->startTs, event->appended);
+  }
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          label, id, self, 1, event->stopTs);
+}
+
+// Per-thread ids used for the "id" field of NET_IB and NET_SOCK trace events.
+static __thread int ibQpId, sockId;
+// Write a complete begin/end pair for a network plugin event to fh.
+// Only version 1 IB Qp events and version 1 socket events are printed;
+// anything else is silently skipped.
+__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
+  if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) {
+    if (event->pluginVer != 1 || event->pluginEvent != ncclProfileQp) return;
+    const int self = getpid();
+    const int id = ibQpId++;
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n",
+            "Qp", id, self, 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "Qp", id, self, 1, event->stopTs);
+  } else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) {
+    if (event->pluginVer != 1 || event->pluginEvent != ncclProfileSocket) return;
+    const int self = getpid();
+    const int id = sockId++;
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n",
+            "Sock", id, self, 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "Sock", id, self, 1, event->stopTs);
+  }
+}
+
+//#define DEBUG_EVENTS
+/*
+ * Append a textual dump of the profiler event behind eHandle to the file
+ * "EventDebug-<pid>" in the current working directory.
+ *
+ *   eHandle: pointer to an event structure; its first byte is read as the
+ *            event type and selects which fields are printed.
+ *   tag:     free-form label printed on the first line of the dump.
+ *
+ * The whole body is compiled only when DEBUG_EVENTS is defined; otherwise the
+ * function does nothing. The dump is best effort: a NULL handle or a file
+ * that cannot be opened is silently ignored.
+ */
+void debugEvent(void* eHandle, const char* tag) {
+#ifdef DEBUG_EVENTS
+  if (eHandle == NULL) return;
+  char filename[64] = { 0 };
+  snprintf(filename, sizeof(filename), "EventDebug-%d", getpid());
+  FILE* fh = fopen(filename, "a+");
+  // fopen() returns NULL on failure (e.g. read-only working directory);
+  // writing through a NULL stream would crash the profiled application.
+  if (fh == NULL) return;
+  uint8_t type = *(uint8_t *)eHandle;
+  if (type == ncclProfileGroup) {
+    struct group* event = (struct group *)eHandle;
+    fprintf(fh, "Group event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED));
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileColl) {
+    struct collective* event = (struct collective *)eHandle;
+    fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
+    fprintf(fh, "  parent            = %p\n", event->base.parent);
+    for (int j = 0; j < 2*MAX_OPS; j++) {
+      for (int i = 0; i < MAX_CHANNELS; i++) {
+        // Print the address of the element that was tested (op[i][j]), with
+        // both indices, rather than the address of the whole op[i] row.
+        if (event->op[i][j].type == ncclProfileProxyOp) fprintf(fh, "  op[%d][%d]       = %p\n", i, j, &event->op[i][j]);
+      }
+    }
+    fprintf(fh, "  startTs           = %f\n", event->base.startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->base.stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileP2p) {
+    struct p2p* event = (struct p2p *)eHandle;
+    fprintf(fh, "P2p event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
+    fprintf(fh, "  parent            = %p\n", event->base.parent);
+    fprintf(fh, "  op                = %p\n", &event->op);
+    fprintf(fh, "  startTs           = %f\n", event->base.startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->base.stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileProxyOp) {
+    struct proxyOp* event = (struct proxyOp *)eHandle;
+    fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  type              = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? "Send" : "Recv");
+    fprintf(fh, "  channel           = %d\n", event->channelId);
+    fprintf(fh, "  parent            = %p\n", event->parent);
+    fprintf(fh, "  rank              = %d\n", event->rank);
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  progrTs           = %f\n", event->progrTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileProxyStep) {
+    struct proxyStep* event = (struct proxyStep *)eHandle;
+    fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  type              = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? "Send" : "Recv");
+    fprintf(fh, "  parent            = %p\n", event->parent);
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileKernelCh) {
+    struct kernelCh* event = (struct kernelCh *)eHandle;
+    fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  parent            = %p\n", event->parent);
+    fprintf(fh, "  channel           = %d\n", event->channelId);
+    // Close the record like every other event type does.
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileNetPlugin) {
+    struct netPlugin* event = (struct netPlugin *)eHandle;
+    fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  pluginType        = %d\n", event->pluginType);
+    fprintf(fh, "  pluginVer         = %d\n", event->pluginVer);
+    fprintf(fh, "  pluginEvent       = %d\n", event->pluginEvent);
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  }
+  fclose(fh);
+#endif
+}
+
+/*
+ * Print the event referenced by handle, and the events nested under it, to fh.
+ *
+ * The first byte of the event is read as its type and selects which header,
+ * children and trailer are emitted. A NULL fh or handle, or a type that is
+ * not handled below, produces no output.
+ */
+void printEvent(FILE* fh, void* handle) {
+  if (fh == NULL || handle == NULL) return;
+
+  const uint8_t kind = *(uint8_t *)handle;
+
+  if (kind == ncclProfileGroup) {
+    struct group* grp = (struct group *)handle;
+    printGroupEventHeader(fh, grp);
+    // Read the successor link before recursing into the current task.
+    for (struct taskEventBase* task = taskEventQueueHead(grp); task; ) {
+      struct taskEventBase* following = task->next;
+      printEvent(fh, task);
+      task = following;
+    }
+    printGroupEventTrailer(fh, grp);
+    return;
+  }
+
+  if (kind == ncclProfileColl) {
+    struct collective* coll = (struct collective *)handle;
+    printCollEventHeader(fh, coll);
+    // One kernel-channel record per channel, wrapping that channel's proxy ops.
+    for (int ch = 0; ch < MAX_CHANNELS; ch++) {
+      printKernelChEventHeader(fh, &coll->kernel[ch]);
+      for (int opIdx = 0; opIdx < coll->nProxyOps[ch]; opIdx++) {
+        printEvent(fh, &coll->op[ch][opIdx]);
+      }
+      printKernelChEventTrailer(fh, &coll->kernel[ch]);
+    }
+    printCollEventTrailer(fh, coll);
+    return;
+  }
+
+  if (kind == ncclProfileP2p) {
+    struct p2p* peer = (struct p2p *)handle;
+    printP2pEventHeader(fh, peer);
+    // A p2p event carries a single proxy op per channel.
+    for (int ch = 0; ch < MAX_CHANNELS; ch++) {
+      printKernelChEventHeader(fh, &peer->kernel[ch]);
+      printEvent(fh, &peer->op[ch]);
+      printKernelChEventTrailer(fh, &peer->kernel[ch]);
+    }
+    printP2pEventTrailer(fh, peer);
+    return;
+  }
+
+  if (kind == ncclProfileProxyOp) {
+    struct proxyOp* pop = (struct proxyOp *)handle;
+    printProxyOpEventHeader(fh, pop);
+    for (int s = 0; s < MAX_STEPS; s++) {
+      printEvent(fh, &pop->step[s]);
+    }
+    printProxyOpEventTrailer(fh, pop);
+    return;
+  }
+
+  if (kind == ncclProfileProxyStep) {
+    struct proxyStep* pstep = (struct proxyStep *)handle;
+    printProxyStepEventHeader(fh, pstep);
+    for (int n = 0; n < pstep->nNetEvents; n++) {
+      printNetPluginEvent(fh, &pstep->net[n]);
+    }
+    printProxyStepEventTrailer(fh, pstep);
+    return;
+  }
+
+  if (kind == ncclProfileProxyCtrl) {
+    printProxyCtrlEvent(fh, (struct proxyCtrl *)handle);
+  }
+}
--- a/ext-profiler/example/print_event.h
+++ b/ext-profiler/example/print_event.h
@ -0,0 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PRINT_EVENT_H_
+#define PRINT_EVENT_H_
+
+// FILE is used in the printEvent() prototype below; include its declaration
+// here so this header does not depend on the includer's include order.
+#include <stdio.h>
+
+#include "nccl/common.h"
+// Logger callback; declared here, defined in another translation unit.
+extern ncclDebugLogger_t logFn;
+
+// Dump the event behind eHandle, labelled with tag, for debugging.
+void debugEvent(void* eHandle, const char* tag);
+// Write the event behind handle, and the events nested under it, to fh.
+void printEvent(FILE* fh, void* handle);
+
+#endif
--- a/ext-tuner/basic/Makefile
+++ b/ext-tuner/basic/Makefile
@ -0,0 +1,23 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# Build the plugin when make is invoked without an explicit goal.
+# .DEFAULT_GOAL is a special variable, so it must be assigned; writing it with
+# a single colon would only declare a rule whose target is named .DEFAULT_GOAL.
+.DEFAULT_GOAL := build
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)
+
+SRC_FILES := $(wildcard *.c)
+DST_DIR   := $(BUILDDIR)/test/unit/plugins
+
+# build and clean name commands, not files; keep them working even if a file
+# with the same name appears in this directory.
+.PHONY: build clean
+
+build: ${BUILDDIR}/libnccl-tuner-basic.so
+
+# Compile and link every C source in this directory into one shared object.
+${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES}
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl -fPIC -shared -o $@ $^
+
+clean:
+	rm -f ${BUILDDIR}/libnccl-tuner-basic.so
--- a/ext-tuner/basic/nccl/common.h
+++ b/ext-tuner/basic/nccl/common.h
@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+// Severity levels passed as the first argument of the logger callback.
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+// Log subsystem identifiers. Each named subsystem is a distinct power of two; NCCL_ALL (~0) has every bit set.
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+// Logger callback type: severity level, flags, source file and line, then a format string with variadic arguments.
+// NOTE(review): flags presumably carries ncclDebugLogSubSys bits and fmt is presumably printf-style -- confirm against the NCCL core logger.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
--- a/ext-tuner/basic/nccl/err.h
+++ b/ext-tuner/basic/nccl/err.h
@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins.
+ * ncclSuccess is 0; every other enumerator is a distinct non-zero code. */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6 } ncclResult_t;
+
+#endif
--- a/ext-tuner/basic/nccl/tuner.h
+++ b/ext-tuner/basic/nccl/tuner.h
@ -0,0 +1,97 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+#include "err.h"
+
+// Number of collective functions, i.e. the enumerators 0..4 below
+// (Broadcast through AllReduce). Send/Recv not included for now.
+#define NCCL_NUM_FUNCTIONS 5
+// Function identifiers. ncclNumFuncs (8) counts every enumerator before it,
+// including the SendRecv/Send/Recv entries that NCCL_NUM_FUNCTIONS leaves out.
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncSendRecv = 5,
+  ncclFuncSend = 6,
+  ncclFuncRecv = 7,
+  ncclNumFuncs = 8
+} ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
+#define NCCL_ALGO_UNDEF -1
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET_DIRECT 2
+#define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
+#define NCCL_ALGO_PAT 6
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_UNDEF -1
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+#define NCCL_ALGO_PROTO_IGNORE -1.0
+
+// API to be implemented by an external tuner plugin.
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes the tuner state.
+  // Inputs:
+  //   - nRanks: number of ranks in the current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in the current communicator.
+  //   - logFunction: logging callback, useful to integrate plugin logging with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather, ...
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or to set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v4_t;
+
+typedef ncclTuner_v4_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+
+#endif
--- a/ext-tuner/basic/plugin.c
+++ b/ext-tuner/basic/plugin.c
@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "tuner.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+// Tuner init hook. This plugin keeps no state: every argument is ignored,
+// *context is left untouched, and the call always succeeds.
+__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
+  (void)nRanks;
+  (void)nNodes;
+  (void)logFunction;
+  (void)context;
+  return ncclSuccess;
+}
+
+// Tuner hook called for a collective: steer NCCL towards Ring/Simple on one channel.
+//   - collCostTable: in/out cost table; although typed float**, it is indexed
+//     here as rows of NCCL_NUM_PROTOCOLS floats, one row per algorithm.
+//   - numAlgo/numProto: number of algorithms/protocols in collCostTable.
+//   - nChannels: output, always set to 1.
+// context, collType, nBytes, numPipeOps and regBuff are not used.
+// Always returns ncclSuccess.
+__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels) {
+  // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  // Only touch the Ring/Simple entry when the table is large enough to hold
+  // it, and never re-enable an entry NCCL marked as ignored.
+  if (NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto &&
+      table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
+    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
+  }
+  *nChannels = 1;
+  return ncclSuccess;
+}
+
+// Tuner teardown hook. Nothing is released here; the call always succeeds.
+__hidden ncclResult_t pluginDestroy(void* context) {
+  (void)context;
+  return ncclSuccess;
+}
+
+// Name reported through the descriptor's .name field.
+#define PLUGIN_NAME "Basic"
+
+// Plugin descriptor wiring the three callbacks above into the v4 tuner
+// interface. Unlike the callbacks, it is not marked __hidden; its identifier
+// matches the string in NCCL_TUNER_PLUGIN_SYMBOL from tuner.h.
+const ncclTuner_v4_t ncclTunerPlugin_v4 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .getCollInfo = pluginGetCollInfo,
+  .destroy = pluginDestroy
+};
--- a/ext-tuner/example/Makefile
+++ b/ext-tuner/example/Makefile
@ -0,0 +1,55 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Build the plugin when make is invoked without an explicit goal.
+# .DEFAULT_GOAL is a special variable, so it must be assigned; writing it with
+# a single colon would only declare a rule whose target is named .DEFAULT_GOAL.
+.DEFAULT_GOAL := build
+PLUGIN_SO:=libnccl-tuner-example.so
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)
+
+SRC_FILES := $(wildcard *.c)
+DST_DIR   := $(BUILDDIR)/test/unit/plugins
+
+default: ${BUILDDIR}/$(PLUGIN_SO)
+
+build: ${BUILDDIR}/$(PLUGIN_SO)
+
+# Compile and link plugin.c into the shared object loaded by NCCL.
+${BUILDDIR}/$(PLUGIN_SO): plugin.c
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+# Test targets - delegate to test directory
+test:
+	$(MAKE) -C test test TEST_CASE=$(TEST_CASE)
+
+test-verbose:
+	$(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE)
+
+# Build tests
+test-build:
+	$(MAKE) -C test all
+
+# Optimize configurations from performance data
+optimize-config:
+	@if [ -z "$(CSV_FILE)" ]; then \
+		echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \
+		echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \
+		exit 1; \
+	fi
+	python3 scripts/optimize_config.py $(CSV_FILE) \
+		$(if $(OUTPUT),-o $(OUTPUT)) \
+		$(if $(METRIC),-m $(METRIC)) \
+		$(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \
+		$(if $(DRY_RUN),--dry-run) \
+		$(if $(NO_HEADER),--no-header)
+
+clean:
+	rm -f ${BUILDDIR}/$(PLUGIN_SO)
+	$(MAKE) -C test clean
+
+# None of these goals produce a file with their own name.
+.PHONY: default build test test-verbose test-build optimize-config clean
--- a/ext-tuner/example/README.md
+++ b/ext-tuner/example/README.md
@ -0,0 +1,164 @@
+# NCCL Example Tuner Plugin
+
+This plugin demonstrates a CSV file-based tuning approach: tuning parameters can be selectively overridden, based on all of the tuning inputs, without recompiling.
+
+## Features
+
+- **File-based Configuration**: Read tuning parameters from a CSV configuration file
+- **Size-based Tuning**: Specify different configurations based on message size ranges
+- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks
+- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default
+- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE`
+- **Fallback Behavior**: Gracefully handles missing config files and invalid entries
+
+## Building
+
+```bash
+make
+```
+
+This will create `libnccl-tuner-example.so` that can be loaded by NCCL.
+
+## Configuration File Format
+
+The configuration file uses CSV (Comma-Separated Values) format with one configuration per line:
+
+```
+collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+```
+
+### Parameters
+
+- **collective_type**: The collective operation type
+  - `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce`
+
+- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies
+  - Use `0` for minimum and `4294967295` for maximum to cover every message size up to 4294967295 bytes (4 GiB - 1)
+
+- **algorithm**: The NCCL algorithm to use
+  - `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat`
+
+- **protocol**: The NCCL protocol to use
+  - `ll`, `ll128`, `simple`
+
+- **channels**: Number of channels (SMs) to use
+  - Use a positive integer to specify exact channel count
+  - Use `-1` to keep NCCL's default channel selection
+
+- **nNodes**: Number of nodes to match
+  - Use a positive integer to match specific node count
+  - Use `-1` to match any number of nodes
+
+- **nRanks**: Number of ranks to match
+  - Use a positive integer to match specific rank count
+  - Use `-1` to match any number of ranks
+
+- **numPipeOps**: Number of pipeline operations to match (optional)
+  - Use a positive integer to match specific pipeline operation count
+  - Use `-1` to match any number of pipeline operations
+  - If omitted, configuration will match any numPipeOps value
+
+- **regBuff**: Whether user buffer can be registered (optional)
+  - Use `0` to match only non-registered buffers
+  - Use `1` to match only registered buffers
+  - Use `-1` to match either registered or non-registered buffers
+  - If omitted, configuration will match any regBuff value
+
+### Example Configuration
+
+```csv
+# Single-node, small allreduce: use tree algorithm, registered buffers only
+allreduce,0,65536,tree,simple,2,1,-1,-1,1
+
+# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers
+allreduce,65537,1048576,ring,simple,4,4,32,1,0
+
+# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type
+allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
+
+# Single-node broadcast: prefer tree, any pipeOps, any buffer type (8-field backward-compatible form)
+broadcast,0,32768,tree,simple,-1,1,-1
+
+# Multi-node broadcast: optimized for non-registered buffers, single pipeline op
+broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
+```
+
+Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc.
+
+### Backward Compatibility
+
+Configurations without the numPipeOps and/or regBuff parameters are fully supported:
+- 8 fields: matches any numPipeOps and regBuff values
+- 9 fields: matches any regBuff value
+- 10 fields: full parameter specification
+
+This ensures existing configuration files continue to work without modification.
+
+## Usage
+
+### Method 1: Default Config File
+Place your configuration in `nccl_tuner.conf` in the current working directory.
+
+### Method 2: Environment Variable
+Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path:
+
+```bash
+export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf
+export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH
+mpirun -np 4 your_nccl_application
+```
+
+## Editing Configuration Files
+
+### Generating Configuration Files from Raw Data
+
+A Python script that generates valid CSV configs is provided; see [Using optimize_config.py](scripts/README.md).
+
+### Spreadsheet Tips:
+- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`
+- Save as CSV format (not Excel format) for the plugin to read
+- Use data validation to prevent typos in algorithm/protocol names
+
+## Logging
+
+The plugin uses NCCL's logging system. To see tuner-related messages:
+
+```bash
+export NCCL_DEBUG=INFO
+```
+
+This will show when configurations are loaded and applied, including the topology information.
+
+For detailed debugging output during tuning decisions:
+
+```bash
+export NCCL_DEBUG=TRACE
+```
+
+This will show verbose information about which configurations are being evaluated and matched.
+
+## Dimension Matching
+
+Configurations are only applied when the topology matches:
+
+- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks
+- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks
+- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node
+- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology
+
+This allows you to create specialized configurations for different cluster setups while maintaining flexibility.
+
+## Default Behavior
+
+If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic.
+
+When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size.
+
+## Troubleshooting
+
+1. **Config file not found**: Check the file path and permissions
+2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters
+3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory
+4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO`
+5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards
+6. **CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces
--- a/ext-tuner/example/nccl/common.h
+++ b/ext-tuner/example/nccl/common.h
@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+// Severity levels passed as the first argument of the logger callback.
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+// Log subsystem identifiers. Each named subsystem is a distinct power of two; NCCL_ALL (~0) has every bit set.
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+// Logger callback type: severity level, flags, source file and line, then a format string with variadic arguments.
+// NOTE(review): flags presumably carries ncclDebugLogSubSys bits and fmt is presumably printf-style -- confirm against the NCCL core logger.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
--- a/ext-tuner/example/nccl/err.h
+++ b/ext-tuner/example/nccl/err.h
@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins.
+ * ncclSuccess is 0; every other enumerator is a distinct non-zero code. */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6 } ncclResult_t;
+
+#endif
--- a/ext-tuner/example/nccl/tuner.h
+++ b/ext-tuner/example/nccl/tuner.h
@ -0,0 +1,97 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+#include "err.h"
+
+// Number of collective functions, i.e. the enumerators 0..4 below
+// (Broadcast through AllReduce). Send/Recv not included for now.
+#define NCCL_NUM_FUNCTIONS 5
+// Function identifiers. ncclNumFuncs (8) counts every enumerator before it,
+// including the SendRecv/Send/Recv entries that NCCL_NUM_FUNCTIONS leaves out.
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncSendRecv = 5,
+  ncclFuncSend = 6,
+  ncclFuncRecv = 7,
+  ncclNumFuncs = 8
+} ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
+#define NCCL_ALGO_UNDEF -1
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET_DIRECT 2
+#define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
+#define NCCL_ALGO_PAT 6
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_UNDEF -1
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+#define NCCL_ALGO_PROTO_IGNORE -1.0
+
+// API to be implemented by an external tuner plugin.
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes the tuner state.
+  // Inputs:
+  //   - nRanks: number of ranks in the current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in the current communicator.
+  //   - logFunction: logging callback, useful to integrate plugin logging with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather, ...
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or to set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v4_t;
+
+typedef ncclTuner_v4_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+
+#endif
--- a/ext-tuner/example/nccl_tuner.conf
+++ b/ext-tuner/example/nccl_tuner.conf
@ -0,0 +1,45 @@
+# NCCL Tuner Configuration File (CSV Format)
+# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+#
+# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
+# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
+# Protocols: ll, ll128, simple
+# Channels: number of channels to use, or -1 to keep default
+# nNodes: number of nodes to match, or -1 for any number of nodes
+# nRanks: number of ranks to match, or -1 for any number of ranks
+# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
+# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
+#
+# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
+#
+# Examples:
+
+# For single-node configurations with registered buffers
+# Small allreduce operations on single node - use tree algorithm, registered buffers
+allreduce,0,65536,tree,simple,2,1,-1,-1,1
+
+# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers
+# Medium allreduce operations - use ring algorithm
+allreduce,65537,1048576,ring,simple,4,4,32,1,0
+
+# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type
+allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
+
+# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types
+# Single node broadcast - prefer tree, any pipeOps, registered buffers only
+broadcast,0,32768,tree,simple,-1,1,-1,-1,1
+
+# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring
+broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
+
+# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type
+allgather,0,4294967295,ring,simple,4,2,-1
+
+# ReduceScatter operations
+# Small messages on single node, single pipeline op, registered buffers
+reducescatter,0,131072,tree,simple,2,1,-1,1,1
+# Large messages on any topology, multiple pipeline ops, non-registered buffers
+reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0
+
+# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type
+reduce,0,4294967295,tree,simple,-1,-1,-1
--- a/ext-tuner/example/plugin.c
+++ b/ext-tuner/example/plugin.c
@ -0,0 +1,453 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "tuner.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Marks a symbol with hidden visibility; applied below to the plugin callbacks
+#define __hidden __attribute__ ((visibility("hidden")))
+// Size of the buffers used to read one line of the configuration file
+#define MAX_LINE_LENGTH 256
+
+// CSV field indices for configuration parsing
+// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+// Each index is the zero-based position of the field in a tokenized line.
+#define CONFIG_FIELD_COLLTYPE     0
+#define CONFIG_FIELD_MINBYTES     1
+#define CONFIG_FIELD_MAXBYTES     2
+#define CONFIG_FIELD_ALGORITHM    3
+#define CONFIG_FIELD_PROTOCOL     4
+#define CONFIG_FIELD_CHANNELS     5
+#define CONFIG_FIELD_NNODES       6
+#define CONFIG_FIELD_NRANKS       7
+#define CONFIG_FIELD_PIPEOPS      8  // Optional field
+#define CONFIG_FIELD_REGBUFF      9  // Optional field
+
+// Field count constants
+// A line is accepted when it has between CONFIG_FIELDS_REQUIRED and CONFIG_FIELDS_MAX fields;
+// the two intermediate thresholds decide whether the optional fields are present.
+#define CONFIG_FIELDS_REQUIRED    8   // Minimum required fields (up to nRanks)
+#define CONFIG_FIELDS_WITH_PIPEOPS 9  // Fields including numPipeOps
+#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff
+#define CONFIG_FIELDS_MAX         10  // Maximum number of fields supported
+
+// One tuning rule parsed from a line of the configuration file.
+// For nNodes, nRanks, numPipeOps and regBuff the value -1 matches any value.
+typedef struct {
+  ncclFunc_t collType;  // Collective this rule applies to
+  size_t minBytes;      // Smallest message size matched (inclusive)
+  size_t maxBytes;      // Largest message size matched (inclusive)
+  int algorithm;        // NCCL_ALGO_* value whose cost is lowered when the rule matches
+  int protocol;         // NCCL_PROTO_* value whose cost is lowered when the rule matches
+  int nChannels;        // Channel count to report, or -1 to leave the default untouched
+  int nNodes;           // Node count to match, or -1 for any
+  int nRanks;           // Rank count to match, or -1 for any
+  int numPipeOps;       // Pipeline-op count to match, or -1 for any
+  int regBuff;          // Registered-buffer flag to match, or -1 for any
+} TuningConfig;
+
+// Per-plugin-instance state created by pluginInit and released by pluginDestroy.
+typedef struct {
+  TuningConfig* configs;  // Heap-allocated array of rules (NULL when none were loaded)
+  int numConfigs;         // Number of valid entries in configs
+  int maxConfigs;         // Number of entries allocated for configs
+  size_t nRanks;          // Rank count passed to pluginInit
+  size_t nNodes;          // Node count passed to pluginInit
+  ncclDebugLogger_t logFunction;  // Logger passed to pluginInit; may be NULL, checked before each use
+} TunerContext;
+
+// Map a collective name from the config file to its ncclFunc_t value.
+// Names that are not recognized fall back to ncclFuncAllReduce.
+static ncclFunc_t parseCollType(const char* str) {
+  static const struct {
+    const char* name;
+    ncclFunc_t func;
+  } kCollectives[] = {
+    { "broadcast",     ncclFuncBroadcast },
+    { "reduce",        ncclFuncReduce },
+    { "allgather",     ncclFuncAllGather },
+    { "reducescatter", ncclFuncReduceScatter },
+    { "allreduce",     ncclFuncAllReduce },
+  };
+
+  for (size_t k = 0; k < sizeof(kCollectives) / sizeof(kCollectives[0]); k++) {
+    if (strcmp(str, kCollectives[k].name) == 0) return kCollectives[k].func;
+  }
+  return ncclFuncAllReduce; // default
+}
+
+// Return the config-file spelling of a collective type, or "unknown"
+// for values this plugin does not handle.
+static const char* collTypeToString(ncclFunc_t collType) {
+  if (collType == ncclFuncBroadcast) return "broadcast";
+  if (collType == ncclFuncReduce) return "reduce";
+  if (collType == ncclFuncAllGather) return "allgather";
+  if (collType == ncclFuncReduceScatter) return "reducescatter";
+  if (collType == ncclFuncAllReduce) return "allreduce";
+  return "unknown";
+}
+
+// Map an algorithm name from the config file to its NCCL_ALGO_* value.
+// Names that are not recognized fall back to NCCL_ALGO_RING.
+static int parseAlgorithm(const char* str) {
+  static const struct {
+    const char* name;
+    int algo;
+  } kAlgorithms[] = {
+    { "tree",           NCCL_ALGO_TREE },
+    { "ring",           NCCL_ALGO_RING },
+    { "collnet_direct", NCCL_ALGO_COLLNET_DIRECT },
+    { "collnet_chain",  NCCL_ALGO_COLLNET_CHAIN },
+    { "nvls",           NCCL_ALGO_NVLS },
+    { "nvls_tree",      NCCL_ALGO_NVLS_TREE },
+    { "pat",            NCCL_ALGO_PAT },
+  };
+
+  for (size_t k = 0; k < sizeof(kAlgorithms) / sizeof(kAlgorithms[0]); k++) {
+    if (strcmp(str, kAlgorithms[k].name) == 0) return kAlgorithms[k].algo;
+  }
+  return NCCL_ALGO_RING; // default
+}
+
+// Return the config-file spelling of an NCCL_ALGO_* value, or "unknown"
+// for values this plugin does not handle.
+static const char* algorithmToString(int algorithm) {
+  if (algorithm == NCCL_ALGO_TREE) return "tree";
+  if (algorithm == NCCL_ALGO_RING) return "ring";
+  if (algorithm == NCCL_ALGO_COLLNET_DIRECT) return "collnet_direct";
+  if (algorithm == NCCL_ALGO_COLLNET_CHAIN) return "collnet_chain";
+  if (algorithm == NCCL_ALGO_NVLS) return "nvls";
+  if (algorithm == NCCL_ALGO_NVLS_TREE) return "nvls_tree";
+  if (algorithm == NCCL_ALGO_PAT) return "pat";
+  return "unknown";
+}
+
+// Map a protocol name from the config file to its NCCL_PROTO_* value.
+// Names that are not recognized fall back to NCCL_PROTO_SIMPLE.
+static int parseProtocol(const char* str) {
+  int proto = NCCL_PROTO_SIMPLE; // default
+  if (strcmp(str, "ll") == 0) {
+    proto = NCCL_PROTO_LL;
+  } else if (strcmp(str, "ll128") == 0) {
+    proto = NCCL_PROTO_LL128;
+  }
+  return proto;
+}
+
+// Return the config-file spelling of an NCCL_PROTO_* value, or "unknown"
+// for values this plugin does not handle.
+static const char* protocolToString(int protocol) {
+  if (protocol == NCCL_PROTO_LL) return "ll";
+  if (protocol == NCCL_PROTO_LL128) return "ll128";
+  if (protocol == NCCL_PROTO_SIMPLE) return "simple";
+  return "unknown";
+}
+
+// Count the lines of `filename` that are neither comments (first character '#')
+// nor empty. The result is used by the caller to size the rule array.
+// Returns 0 when the file cannot be opened.
+static int countConfigLines(const char* filename) {
+  FILE* fp = fopen(filename, "r");
+  if (fp == NULL) return 0;
+
+  int nLines = 0;
+  char buf[MAX_LINE_LENGTH];
+
+  while (fgets(buf, sizeof(buf), fp) != NULL) {
+    // Comment or blank line: nothing to count
+    if (buf[0] == '#' || buf[0] == '\n') continue;
+
+    // Cut the line at its newline, then count it only if something is left
+    buf[strcspn(buf, "\n")] = '\0';
+    if (buf[0] != '\0') nLines++;
+  }
+
+  fclose(fp);
+  return nLines;
+}
+
+// Load tuning rules from the CSV file `filename` into ctx->configs.
+//
+// Each non-comment line holds 8 to 10 comma-separated fields:
+//   colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks[,numPipeOps[,regBuff]]
+// Lines with fewer than CONFIG_FIELDS_REQUIRED fields are skipped. Optional fields
+// that are omitted are stored as -1, which matches any value.
+//
+// Returns ncclSuccess when the file is missing, contains no entries, or was parsed;
+// returns ncclSystemError when the rule array cannot be allocated. On success
+// ctx->configs (if non-NULL) holds ctx->numConfigs entries and is owned by ctx.
+static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
+  FILE* file = fopen(filename, "r");
+  if (!file) {
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Config file %s not found, using defaults", filename);
+    }
+    return ncclSuccess; // Not finding config file is not an error
+  }
+
+  // First pass: count candidate configuration lines to size the allocation
+  int configCount = countConfigLines(filename);
+  if (configCount == 0) {
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: No valid configurations found in %s", filename);
+    }
+    fclose(file);
+    return ncclSuccess;
+  }
+
+  // Allocate memory for configurations based on actual count
+  ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig));
+  if (!ctx->configs) {
+    if (ctx->logFunction) {
+      ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount);
+    }
+    fclose(file);
+    return ncclSystemError;
+  }
+
+  ctx->maxConfigs = configCount;
+  ctx->numConfigs = 0;
+
+  if (ctx->logFunction) {
+    ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount);
+  }
+
+  // `file` has not been read yet, so parsing starts at the beginning of the file.
+  char line[MAX_LINE_LENGTH];
+
+  while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) {
+    // Skip comments and empty lines
+    if (line[0] == '#' || line[0] == '\n') continue;
+
+    // Remove the line terminator; '\r' is cut as well so CRLF files parse cleanly
+    line[strcspn(line, "\r\n")] = 0;
+
+    // Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+    char* tokens[CONFIG_FIELDS_MAX];
+    int tokenCount = 0;
+
+    // Tokenize in place. strtok_r keeps its state in savePtr, so concurrent
+    // plugin initializations on different threads cannot corrupt each other
+    // (plain strtok uses hidden global state).
+    char* savePtr = NULL;
+    for (char* token = strtok_r(line, ",", &savePtr);
+         token != NULL && tokenCount < CONFIG_FIELDS_MAX;
+         token = strtok_r(NULL, ",", &savePtr)) {
+      // Trim leading, then trailing, spaces and tabs
+      while (*token == ' ' || *token == '\t') token++;
+      char* end = token + strlen(token) - 1;
+      while (end > token && (*end == ' ' || *end == '\t')) {
+        *end = '\0';
+        end--;
+      }
+      tokens[tokenCount++] = token;
+    }
+
+    // Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10)
+    if (tokenCount < CONFIG_FIELDS_REQUIRED || tokenCount > CONFIG_FIELDS_MAX) continue;
+
+    TuningConfig* config = &ctx->configs[ctx->numConfigs];
+    config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]);
+    config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10);
+    config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10);
+    config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]);
+    config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]);
+    config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]);
+    config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]);
+    config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]);
+
+    // numPipeOps is optional (9th field, index 8); -1 means match any numPipeOps
+    config->numPipeOps = (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS)
+                             ? atoi(tokens[CONFIG_FIELD_PIPEOPS]) : -1;
+
+    // regBuff is optional (10th field, index 9); -1 means match any regBuff value
+    config->regBuff = (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF)
+                          ? atoi(tokens[CONFIG_FIELD_REGBUFF]) : -1;
+
+    ctx->numConfigs++;
+
+    if (ctx->logFunction) {
+      // Four message variants so that a wildcard (-1) is printed as "any"
+      if (config->numPipeOps == -1 && config->regBuff == -1) {
+        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any",
+                         tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                         tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                         config->nChannels, config->nNodes, config->nRanks);
+      } else if (config->regBuff == -1) {
+        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any",
+                         tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                         tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                         config->nChannels, config->nNodes, config->nRanks, config->numPipeOps);
+      } else if (config->numPipeOps == -1) {
+        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d",
+                         tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                         tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                         config->nChannels, config->nNodes, config->nRanks, config->regBuff);
+      } else {
+        ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d",
+                         tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
+                         tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
+                         config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
+      }
+    }
+  }
+
+  fclose(file);
+  if (ctx->logFunction) {
+    ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename);
+  }
+  return ncclSuccess;
+}
+
+// Tuner init callback: allocate a TunerContext, remember the communicator
+// size and logger, and load tuning rules from the file named by
+// NCCL_TUNER_CONFIG_FILE (or "nccl_tuner.conf" when the variable is unset).
+// On success *context receives the new context; on failure nothing is stored
+// in *context and all allocations are released.
+__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
+  TunerContext* tuner = (TunerContext*)malloc(sizeof(*tuner));
+  if (tuner == NULL) return ncclSystemError;
+
+  tuner->logFunction = logFunction;
+  tuner->nNodes = nNodes;
+  tuner->nRanks = nRanks;
+  // No rules until loadConfig succeeds
+  tuner->configs = NULL;
+  tuner->numConfigs = 0;
+  tuner->maxConfigs = 0;
+
+  if (logFunction) {
+    logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
+  }
+
+  // The environment variable wins; otherwise look for the default file name
+  const char* path = getenv("NCCL_TUNER_CONFIG_FILE");
+  if (path == NULL) path = "nccl_tuner.conf";
+
+  const ncclResult_t status = loadConfig(tuner, path);
+  if (status == ncclSuccess) {
+    *context = tuner;
+    return ncclSuccess;
+  }
+
+  // Loading failed: release whatever was allocated (free(NULL) is a no-op)
+  free(tuner->configs);
+  free(tuner);
+  return status;
+}
+
+// Tuner callback invoked for a collective: walk the loaded rules in file order
+// and apply the first one that matches the collective type, message size,
+// node/rank counts, numPipeOps and regBuff, and whose algorithm/protocol pair
+// is in bounds and not marked NCCL_ALGO_PROTO_IGNORE. Applying a rule sets
+// that pair's cost to 0.0 and, unless the rule's channel count is -1, stores
+// the rule's channel count in *nChannels. *nChannels is set to 1 up front.
+// Returns ncclInternalError for a NULL context, ncclSuccess otherwise.
+//
+// NOTE(review): collCostTable is indexed here as an array of row pointers
+// (collCostTable[algo][proto]). Confirm the caller really passes that layout
+// rather than a contiguous 2D float array cast to float**.
+__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels) {
+  TunerContext* tuner = (TunerContext*)context;
+  if (tuner == NULL) return ncclInternalError;
+
+  // Channel count reported unless a matching rule overrides it
+  *nChannels = 1;
+
+  if (tuner->logFunction) {
+    tuner->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d",
+                     collTypeToString(collType), nBytes, numPipeOps, regBuff, tuner->numConfigs);
+  }
+
+  for (int idx = 0; idx < tuner->numConfigs; idx++) {
+    const TuningConfig* rule = &tuner->configs[idx];
+
+    if (tuner->logFunction) {
+      tuner->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d",
+                       idx, collTypeToString(rule->collType), rule->minBytes, rule->maxBytes, algorithmToString(rule->algorithm), protocolToString(rule->protocol),
+                       rule->nNodes, rule->nRanks, rule->numPipeOps, rule->regBuff);
+    }
+
+    // Evaluate each criterion once; -1 in a rule field acts as a wildcard
+    const int collOk  = (rule->collType == collType);
+    const int sizeOk  = (nBytes >= rule->minBytes && nBytes <= rule->maxBytes);
+    const int nodesOk = (rule->nNodes == -1 || rule->nNodes == (int)tuner->nNodes);
+    const int ranksOk = (rule->nRanks == -1 || rule->nRanks == (int)tuner->nRanks);
+    const int pipeOk  = (rule->numPipeOps == -1 || rule->numPipeOps == numPipeOps);
+    const int regOk   = (rule->regBuff == -1 || rule->regBuff == regBuff);
+
+    if (!(collOk && sizeOk && nodesOk && ranksOk && pipeOk && regOk)) {
+      if (tuner->logFunction) {
+        tuner->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d",
+                         collOk, sizeOk, nodesOk, ranksOk, pipeOk, regOk);
+      }
+      continue;
+    }
+
+    if (tuner->logFunction) {
+      tuner->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Config matches. Applying algo=%s, proto=%s, channels=%d",
+                       algorithmToString(rule->algorithm), protocolToString(rule->protocol), rule->nChannels);
+    }
+
+    // The rule can only be applied if its table cell exists
+    if (rule->algorithm >= numAlgo || rule->protocol >= numProto) {
+      if (tuner->logFunction) {
+        tuner->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)",
+                         algorithmToString(rule->algorithm), numAlgo, protocolToString(rule->protocol), numProto);
+      }
+      continue;
+    }
+
+    float* cost = &collCostTable[rule->algorithm][rule->protocol];
+
+    // Cells marked IGNORE are left alone; keep looking for another rule
+    if (*cost == NCCL_ALGO_PROTO_IGNORE) {
+      if (tuner->logFunction) {
+        tuner->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE",
+                         algorithmToString(rule->algorithm), protocolToString(rule->protocol));
+      }
+      continue;
+    }
+
+    if (tuner->logFunction) {
+      tuner->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+                       "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)",
+                       algorithmToString(rule->algorithm), protocolToString(rule->protocol),
+                       cost, *cost);
+    }
+    *cost = 0.0; // Set low cost to prefer this configuration
+
+    // A channel count of -1 in the rule leaves the value set above untouched
+    const int overrideChannels = (rule->nChannels != -1);
+    if (overrideChannels) {
+      *nChannels = rule->nChannels;
+    }
+
+    if (tuner->logFunction) {
+      if (overrideChannels) {
+        tuner->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)",
+                         collTypeToString(rule->collType), nBytes, numPipeOps, regBuff, algorithmToString(rule->algorithm), protocolToString(rule->protocol),
+                         rule->nChannels, rule->nNodes, rule->nRanks);
+      } else {
+        tuner->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                         "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)",
+                         collTypeToString(rule->collType), nBytes, numPipeOps, regBuff, algorithmToString(rule->algorithm), protocolToString(rule->protocol),
+                         rule->nNodes, rule->nRanks);
+      }
+    }
+    return ncclSuccess;
+  }
+
+  // No rule was applied: the cost table is returned unmodified
+  if (tuner->logFunction) {
+    tuner->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
+                     "TUNER/ExamplePlugin: No matching config found");
+  }
+
+  return ncclSuccess;
+}
+
+// Tuner destroy callback: release the rule array and the context itself.
+// A NULL context is accepted and ignored. Always returns ncclSuccess.
+__hidden ncclResult_t pluginDestroy(void* context) {
+  TunerContext* tuner = (TunerContext*)context;
+  if (tuner == NULL) return ncclSuccess;
+
+  free(tuner->configs);  // free(NULL) is a no-op, so no separate check is needed
+  free(tuner);
+  return ncclSuccess;
+}
+
+// Name stored in the plugin descriptor's .name field
+#define PLUGIN_NAME "Example"
+
+// v4 tuner plugin descriptor wiring the three callbacks defined above.
+// Unlike the callbacks it is not marked __hidden, so it stays visible outside
+// this object; presumably this is the symbol NCCL resolves when it loads the
+// plugin (confirm against the tuner plugin documentation).
+const ncclTuner_v4_t ncclTunerPlugin_v4 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .getCollInfo = pluginGetCollInfo,
+  .destroy = pluginDestroy
+};
--- a/ext-tuner/example/scripts/README.md
+++ b/ext-tuner/example/scripts/README.md
@ -0,0 +1,106 @@
+# NCCL Tuner Configuration Scripts
+
+This directory contains scripts for optimizing NCCL tuner configurations based on performance data.
+
+## optimize_config.py
+
+A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations.
+
+### Usage
+
+```bash
+python scripts/optimize_config.py [options] <input_csv_file>
+```
+
+### Options
+
+- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`)
+- `-m, --metric METRIC`: Optimization metric (`cost_metric`, `bandwidth_gbps`, `latency_us`)
+- `--no-header`: Don't add header comments to output file
+- `--dry-run`: Print configurations without writing to file
+
+### CSV Input Format
+
+The input CSV file should have the following columns:
+
+```csv
+collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
+```
+
+**Required columns:**
+- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.)
+- `size_bytes`: Message size in bytes
+- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.)
+- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`)
+- `channels`: Number of channels (or `-1` for default)
+- `nodes`: Number of nodes (or `-1` for any)
+- `ranks`: Number of ranks (or `-1` for any)
+- `pipeOps`: Number of pipeline operations (or `-1` for any)
+- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any)
+
+**Optional metrics (must have at least one present):**
+- `bandwidth_gbps`: Bandwidth in GB/s (higher is better)
+- `latency_us`: Latency in microseconds (lower is better)
+
+### Examples
+
+**Basic usage with cost optimization:**
+```bash
+python scripts/optimize_config.py sample_performance_data.csv
+```
+
+**Optimize for bandwidth and write to custom file:**
+```bash
+python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv
+```
+
+**Preview configurations without writing:**
+```bash
+python scripts/optimize_config.py --dry-run performance_data.csv
+```
+
+### How It Works
+
+1. **Data Loading**: Reads CSV performance data and validates format
+2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters
+3. **Size Ranges**: Automatically bins data into size ranges for optimization
+4. **Optimization**: Finds the best performing configuration for each group/size combination
+5. **Output**: Generates NCCL tuner config format and appends to specified file
+
+### Default Size Ranges
+
+By default the script derives size ranges from the data itself; with `--no-auto-ranges` it falls back to these hardcoded size ranges (in bytes):
+- Small: 0 - 1,024
+- Medium: 1,025 - 65,536
+- Large: 65,537 - 1,048,576
+- XLarge: 1,048,577 - 16,777,216
+- XXLarge: 16,777,217 - 4,294,967,295
+
+### Sample Data
+
+See `sample_performance_data.csv` for an example of the expected input format.
+
+### Integration with NCCL
+
+The generated configuration file can be used directly with the NCCL tuner plugin:
+
+```bash
+export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf
+export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner.so
+mpirun -np 8 your_nccl_application
+```
+
+### Performance Data Collection
+
+To collect performance data for optimization, you can:
+
+1. **Use NCCL benchmarks** with different algorithm/protocol combinations
+2. **Profile your applications** with various tuner settings
+3. **Run systematic sweeps** across parameter combinations
+4. **Use NCCL debug output** to collect timing information
+
+The key is to have comprehensive data covering:
+- Different message sizes (small to large)
+- Various topologies (single node, multi-node)
+- All relevant algorithm/protocol combinations
+- Different channel counts and pipeline configurations
--- a/ext-tuner/example/scripts/optimize_config.py
+++ b/ext-tuner/example/scripts/optimize_config.py
@ -0,0 +1,430 @@
+#!/usr/bin/env python3
+"""
+NCCL Tuner Configuration Optimizer
+
+Reads a CSV file containing performance data across different tuning parameters
+and generates optimal NCCL tuner configurations based on the best performing
+combinations.
+
+By default, creates growing size ranges that interpolate between the actual data sizes
+for each unique dimension (node count, rank count combination). This ensures that
+different cluster configurations get their own optimized size boundaries, as
+performance characteristics often vary significantly between topologies.
+
+Each dimension gets its own set of ranges starting from 0 and extending to the maximum
+size for that dimension, with boundaries at midpoints between consecutive data sizes.
+
+CSV Input Format:
+collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us
+
+Output Format (NCCL Tuner Config):
+collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+
+Usage Examples:
+  # Auto-create dimension-specific interpolated ranges (default)
+  python3 optimize_config.py data.csv
+
+  # Use custom size ranges (applied to all topologies)
+  python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576"
+
+  # Use hardcoded default ranges (applied to all topologies)
+  python3 optimize_config.py data.csv --no-auto-ranges
+"""
+
+import csv
+import argparse
+import sys
+import os
+from collections import defaultdict
+from typing import Dict, List, Tuple, Any
+
+class PerformanceData:
+    def __init__(self, row: Dict[str, str]):
+        self.collective = row['collective']
+        self.size_bytes = int(row['size_bytes'])
+        self.algorithm = row['algorithm']
+        self.protocol = row['protocol']
+        self.channels = int(row['channels']) if row['channels'] != '-1' else -1
+        self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1
+        self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1
+        self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1
+        self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1
+
+        # Performance metrics
+        self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0))  # Higher is better
+        self.latency_us = float(row.get('latency_us', 0))  # Lower is better
+
+    def get_config_key(self) -> Tuple:
+        """Generate a key for grouping similar configurations"""
+        return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff)
+
+    def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]:
+        """Find which size range this data point belongs to for its dimension"""
+        topology_key = (self.nodes, self.ranks)
+
+        # Get size ranges for this dimension, or fall back to default
+        if topology_key in topology_size_ranges:
+            size_ranges = topology_size_ranges[topology_key]
+        elif (-1, -1) in topology_size_ranges:
+            size_ranges = topology_size_ranges[(-1, -1)]
+        else:
+            # Fallback to first available dimension ranges
+            size_ranges = next(iter(topology_size_ranges.values()))
+
+        for min_size, max_size in size_ranges:
+            if min_size <= self.size_bytes <= max_size:
+                return (min_size, max_size)
+        # If no range found, create a single-point range
+        return (self.size_bytes, self.size_bytes)
+
+class ConfigOptimizer:
+    def __init__(self, optimization_metric: str = 'latency_us'):
+        self.optimization_metric = optimization_metric
+        # Default size ranges - will be overridden by auto-detection
+        self.size_ranges = [
+            (0, 1024),
+            (1025, 64*1024),
+            (64*1024+1, 1024*1024),
+            (1024*1024+1, 16*1024*1024),
+            (16*1024*1024+1, 4*1024*1024*1024-1)
+        ]
+        self.auto_size_ranges = True
+
+    def set_size_ranges(self, ranges: List[Tuple[int, int]]):
+        """Set custom size ranges for optimization"""
+        self.size_ranges = ranges
+        self.auto_size_ranges = False
+
+    def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
+        """Create growing size ranges for each unique (nodes, ranks) dimension"""
+        if not data:
+            return {(-1, -1): self.size_ranges}
+
+        # Group data by dimension (nodes, ranks)
+        topology_data = defaultdict(list)
+        for item in data:
+            topology_key = (item.nodes, item.ranks)
+            topology_data[topology_key].append(item)
+
+        topology_ranges = {}
+
+        for topology_key, items in topology_data.items():
+            nodes, ranks = topology_key
+
+            # Extract unique sizes for this dimension and sort them
+            unique_sizes = sorted(set(item.size_bytes for item in items))
+
+            if len(unique_sizes) <= 1:
+                # Only one size, create a single range from 0 to that size
+                size = unique_sizes[0] if unique_sizes else 0
+                ranges = [(0, size)]
+            else:
+                # Create growing ranges that interpolate between data points
+                ranges = []
+
+                for i, size in enumerate(unique_sizes):
+                    if i == 0:
+                        # First range: 0 to midpoint between first and second size
+                        if len(unique_sizes) > 1:
+                            next_size = unique_sizes[i + 1]
+                            max_size = (size + next_size) // 2
+                        else:
+                            max_size = size
+                        min_size = 0
+                    elif i == len(unique_sizes) - 1:
+                        # Last range: previous max + 1 to current size (and beyond)
+                        min_size = ranges[-1][1] + 1
+                        max_size = size
+                    else:
+                        # Intermediate ranges: previous max + 1 to midpoint with next size
+                        min_size = ranges[-1][1] + 1
+                        next_size = unique_sizes[i + 1]
+                        max_size = (size + next_size) // 2
+
+                    ranges.append((min_size, max_size))
+
+            topology_ranges[topology_key] = ranges
+
+            print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:")
+            for i, (min_size, max_size) in enumerate(ranges):
+                # Count data points that fall in this range for this dimension
+                count = sum(1 for item in items if min_size <= item.size_bytes <= max_size)
+                actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size))
+                if actual_sizes:
+                    size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3])
+                    if len(actual_sizes) > 3:
+                        size_list += f", ... (+{len(actual_sizes)-3} more)"
+                    print(f"  Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})")
+
+        return topology_ranges
+
+    def load_data(self, csv_file: str) -> List[PerformanceData]:
+        """Load performance data from CSV file"""
+        data = []
+        try:
+            with open(csv_file, 'r') as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    try:
+                        data.append(PerformanceData(row))
+                    except (ValueError, KeyError) as e:
+                        print(f"Warning: Skipping invalid row: {row} - {e}")
+        except FileNotFoundError:
+            print(f"Error: File {csv_file} not found")
+            sys.exit(1)
+        except Exception as e:
+            print(f"Error reading {csv_file}: {e}")
+            sys.exit(1)
+
+        print(f"Loaded {len(data)} performance data points")
+
+        # Auto-determine size ranges if enabled
+        if self.auto_size_ranges and data:
+            self.topology_size_ranges = self.auto_determine_size_ranges(data)
+        else:
+            # Use default ranges for all topologies
+            self.topology_size_ranges = {(-1, -1): self.size_ranges}
+
+        return data
+
+    def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool:
+        """Determine if new_data is better than current_best"""
+        if self.optimization_metric == 'bandwidth_gbps':
+            return new_data.bandwidth_gbps > current_best.bandwidth_gbps
+        elif self.optimization_metric == 'latency_us':
+            return new_data.latency_us < current_best.latency_us
+        else:
+            # Default to latency
+            return new_data.latency_us < current_best.latency_us
+
+    def optimize_configurations(self, data: List[PerformanceData]) -> List[str]:
+        """Find optimal configurations and return as NCCL config strings"""
+        # Group data by configuration key and size range
+        grouped_data = defaultdict(lambda: defaultdict(list))
+
+        for item in data:
+            config_key = item.get_config_key()
+            size_range = item.get_size_range_key(self.topology_size_ranges)
+            grouped_data[config_key][size_range].append(item)
+
+        # Store optimal configurations before combining ranges
+        optimal_configs = []
+
+        for config_key, size_ranges_dict in grouped_data.items():
+            collective, nodes, ranks, pipeOps, regBuff = config_key
+
+            for (min_size, max_size), items in size_ranges_dict.items():
+                if not items:
+                    continue
+
+                # Find the best performing configuration for this size range
+                best_item = items[0]
+                for item in items[1:]:
+                    if self.is_better(item, best_item):
+                        best_item = item
+
+                # Store the optimal configuration with its range
+                optimal_configs.append({
+                    'collective': collective,
+                    'min_size': min_size,
+                    'max_size': max_size,
+                    'algorithm': best_item.algorithm,
+                    'protocol': best_item.protocol,
+                    'channels': best_item.channels,
+                    'nodes': best_item.nodes,
+                    'ranks': best_item.ranks,
+                    'pipeOps': best_item.pipeOps,
+                    'regBuff': best_item.regBuff,
+                    'metric_value': getattr(best_item, self.optimization_metric)
+                })
+
+        # Combine sequential ranges with identical tunings
+        combined_configs = self.combine_sequential_ranges(optimal_configs)
+
+        # Generate config strings
+        configs = []
+        for config in combined_configs:
+            config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}"
+            configs.append(config_str)
+
+            print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: "
+                  f"{config['algorithm']}/{config['protocol']} channels={config['channels']} "
+                  f"({self.optimization_metric}={config['metric_value']:.3f})")
+
+        return configs
+
+    def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
+        """Combine sequential ranges that have identical tuning parameters"""
+        if not configs:
+            return configs
+
+        # Group by collective and topology (nodes, ranks)
+        topology_groups = defaultdict(list)
+        for config in configs:
+            topology_key = (config['collective'], config['nodes'], config['ranks'],
+                          config['pipeOps'], config['regBuff'])
+            topology_groups[topology_key].append(config)
+
+        combined_configs = []
+
+        for topology_key, topology_configs in topology_groups.items():
+            # Sort by min_size to ensure proper ordering
+            topology_configs.sort(key=lambda x: x['min_size'])
+
+            # Group by tuning parameters (algorithm, protocol, channels)
+            tuning_groups = defaultdict(list)
+            for config in topology_configs:
+                tuning_key = (config['algorithm'], config['protocol'], config['channels'])
+                tuning_groups[tuning_key].append(config)
+
+            # For each tuning group, combine sequential ranges
+            for tuning_key, tuning_configs in tuning_groups.items():
+                if not tuning_configs:
+                    continue
+
+                # Sort by min_size
+                tuning_configs.sort(key=lambda x: x['min_size'])
+
+                # Combine sequential ranges
+                current_config = tuning_configs[0].copy()
+
+                for next_config in tuning_configs[1:]:
+                    # Check if ranges are adjacent or overlapping
+                    if current_config['max_size'] + 1 >= next_config['min_size']:
+                        # Extend the current range
+                        current_config['max_size'] = max(current_config['max_size'], next_config['max_size'])
+                        # Update metric value to the better one
+                        if self.optimization_metric == 'bandwidth_gbps':
+                            if next_config['metric_value'] > current_config['metric_value']:
+                                current_config['metric_value'] = next_config['metric_value']
+                        else:  # latency_us or default
+                            if next_config['metric_value'] < current_config['metric_value']:
+                                current_config['metric_value'] = next_config['metric_value']
+                    else:
+                        # Gap between ranges, save current and start new one
+                        combined_configs.append(current_config)
+                        current_config = next_config.copy()
+
+                # Add the last configuration
+                combined_configs.append(current_config)
+
+        # Sort final configs by collective, nodes, ranks, then min_size
+        combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size']))
+
+        original_count = len(configs)
+        combined_count = len(combined_configs)
+        if combined_count < original_count:
+            print(f"Combined {original_count} ranges into {combined_count} ranges "
+                  f"(reduced by {original_count - combined_count})")
+
+        return combined_configs
+
+    def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True):
+        """Append optimized configurations to NCCL tuner config file"""
+        try:
+            # Create directory if it doesn't exist
+            config_dir = os.path.dirname(config_file)
+            if config_dir and not os.path.exists(config_dir):
+                os.makedirs(config_dir)
+                print(f"Created directory: {config_dir}")
+
+            # Check if file exists and has content
+            file_exists = os.path.exists(config_file)
+            add_separator = False
+
+            if file_exists:
+                with open(config_file, 'r') as f:
+                    content = f.read().strip()
+                    add_separator = len(content) > 0
+                print(f"Appending to existing file: {config_file}")
+            else:
+                print(f"Creating new file: {config_file}")
+
+            with open(config_file, 'a') as f:
+                if add_separator:
+                    f.write("\n\n")
+
+                if add_header:
+                    f.write(f"# Optimized configurations generated by optimize_config.py\n")
+                    f.write(f"# Optimization metric: {self.optimization_metric}\n")
+                    f.write(f"# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n")
+
+                for config in configs:
+                    f.write(f"{config}\n")
+
+            if file_exists:
+                print(f"Appended {len(configs)} optimized configurations to {config_file}")
+            else:
+                print(f"Created {config_file} with {len(configs)} optimized configurations")
+
+        except PermissionError:
+            print(f"Error: Permission denied writing to {config_file}")
+            print("Try running with appropriate permissions or choose a different output location")
+            sys.exit(1)
+        except OSError as e:
+            print(f"Error: Cannot create/write to {config_file}: {e}")
+            print("Check that the path is valid and you have write permissions")
+            sys.exit(1)
+        except Exception as e:
+            print(f"Unexpected error writing to {config_file}: {e}")
+            sys.exit(1)
+
+def main():
+    parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data")
+    parser.add_argument("csv_file", help="Input CSV file with performance data")
+    parser.add_argument("-o", "--output", default="nccl_tuner.conf",
+                       help="Output NCCL tuner config file (default: nccl_tuner.conf)")
+    parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'],
+                       default='latency_us', help="Optimization metric (default: latency_us)")
+    parser.add_argument("--no-header", action="store_true",
+                       help="Don't add header comments to output file")
+    parser.add_argument("--dry-run", action="store_true",
+                       help="Print configurations without writing to file")
+    parser.add_argument("--no-auto-ranges", action="store_true",
+                       help="Disable automatic size range determination (use default ranges)")
+    parser.add_argument("--size-ranges", type=str,
+                       help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'")
+
+    args = parser.parse_args()
+
+    optimizer = ConfigOptimizer(args.metric)
+
+    # Handle size range configuration
+    if args.size_ranges:
+        # Parse custom size ranges
+        try:
+            ranges = []
+            for range_str in args.size_ranges.split(','):
+                min_size, max_size = map(int, range_str.split('-'))
+                ranges.append((min_size, max_size))
+            optimizer.set_size_ranges(ranges)
+            print(f"Using custom size ranges: {ranges}")
+        except ValueError:
+            print("Error: Invalid size ranges format. Use 'min1-max1,min2-max2,...'")
+            sys.exit(1)
+    elif args.no_auto_ranges:
+        # Disable auto-ranging
+        optimizer.auto_size_ranges = False
+        print("Using default hardcoded size ranges")
+    else:
+        # Auto-ranging is enabled by default - creates one bucket per unique size
+        optimizer.auto_size_ranges = True
+        print("Auto-ranging enabled: will create one bucket per unique size in data")
+
+    # Load and optimize data
+    data = optimizer.load_data(args.csv_file)
+    if not data:
+        print("No valid data found in CSV file")
+        sys.exit(1)
+
+    configs = optimizer.optimize_configurations(data)
+
+    if args.dry_run:
+        print("\nGenerated configurations:")
+        for config in configs:
+            print(config)
+    else:
+        optimizer.append_to_config_file(configs, args.output, not args.no_header)
+
+if __name__ == "__main__":
+    main()
--- a/ext-tuner/example/scripts/sample_performance_data.csv
+++ b/ext-tuner/example/scripts/sample_performance_data.csv
@ -0,0 +1,24 @@
+collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
+allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5
+allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8
+allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2
+allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1
+allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3
+allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4
+allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2
+allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1
+allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7
+allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1
+broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2
+broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1
+broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5
+broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2
+broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1
+broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3
+allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4
+allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6
+allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1
+allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3
+allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2
+allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1
+allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3
--- a/ext-tuner/example/test/Makefile
+++ b/ext-tuner/example/test/Makefile
@ -0,0 +1,30 @@
+#
+# Makefile for NCCL Tuner Plugin Unit Tests
+#
+
+CC := gcc
+CFLAGS := -Wall -Wextra -g -std=c99 -fPIC
+INC := -I. -I../nccl
+TARGET := test_plugin
+SOURCES := test_plugin.c
+# test_plugin.c compiles the plugin itself via #include "../plugin.c" and uses
+# the headers found through -I../nccl, so the test binary must be rebuilt
+# whenever any of those files change.
+DEPS := ../plugin.c $(wildcard ../nccl/*.h)
+
+# Default target
+all: $(TARGET)
+
+# Build the test executable
+$(TARGET): $(SOURCES) $(DEPS)
+	$(CC) $(CFLAGS) $(INC) -o $@ $(SOURCES)
+
+# Run the tests (TEST_CASE, when set, is passed to the test binary as its argument)
+test: $(TARGET)
+	./$(TARGET) $(TEST_CASE)
+
+# Run tests with verbose output
+test-verbose: $(TARGET)
+	NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)
+
+# Clean build artifacts
+clean:
+	rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf
+
+.PHONY: all test test-verbose clean
--- a/ext-tuner/example/test/README.md
+++ b/ext-tuner/example/test/README.md
@ -0,0 +1,205 @@
+# NCCL Tuner Plugin Unit Tests
+
+This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
+
+## Test Structure
+
+```
+test/
+├── test_plugin.c     # Main unit test file
+├── Makefile          # Build system for tests
+└── README.md         # This file
+```
+
+## Building and Running Tests
+
+### Quick Start
+
+```bash
+# Build and run all tests
+make test
+
+# Or step by step
+make           # Build test executable
+./test_plugin  # Run tests
+```
+
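+### Advanced Testing
+
+> **Note:** The Makefile in this directory currently defines only the `all`, `test`, `test-verbose` and `clean` targets. The `test-memory`, `coverage`, `test-configs` and `install-test` targets shown in this README are not defined there and must be added before the corresponding commands will work.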
+
+```bash
+# Run with memory leak detection (requires valgrind)
+make test-memory
+
+# Run with verbose logging
+make test-verbose
+
+# Generate code coverage report (requires gcov)
+make coverage
+
+# Create sample test configuration files
+make test-configs
+```
+
+## Test Coverage
+
+The unit tests cover the following functionality:
+
+### 1. **Plugin Initialization (`test_plugin_init`)**
+- Tests successful plugin initialization
+- Verifies context allocation
+- Tests cleanup on destroy
+
+### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
+- Valid CSV format parsing
+- Comment and empty line handling
+- Invalid format graceful handling
+- Environment variable configuration
+
+### 3. **Collective Type Matching (`test_collective_matching`)**
+- Correct matching of allreduce, broadcast, etc.
+- Algorithm/protocol selection
+- Channel configuration
+
+### 4. **Size Range Matching (`test_size_matching`)**
+- Small, medium, large message size handling
+- Proper range boundary checking
+- Multiple size-based configurations
+
+### 5. **Topology Matching (`test_topology_matching`)**
+- Single-node vs multi-node configurations
+- Exact nNodes/nRanks matching
+- Wildcard matching (-1 values)
+
+### 6. **Default Channels (`test_default_channels`)**
+- Proper handling of -1 channel specification
+- Preservation of NCCL default behavior
+
+### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
+- Configurations based on regBuff parameter
+- Registered vs non-registered buffer handling
+- Backward compatibility with configs missing regBuff
+
+### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
+- Configurations based on numPipeOps parameter
+- Single vs multiple pipeline operation handling
+- Backward compatibility with configs missing numPipeOps
+
+### 9. **Fallback Behavior (`test_no_match_fallback`)**
+- Default behavior when no config matches
+- Ring/Simple algorithm fallback
+
+## Test Output
+
+Successful test run:
+```
+Running NCCL Tuner Plugin Unit Tests
+=====================================
+PASS: test_plugin_init
+PASS: test_config_parsing_valid
+PASS: test_config_parsing_invalid
+PASS: test_collective_matching
+PASS: test_size_matching
+PASS: test_topology_matching
+PASS: test_default_channels
+PASS: test_regbuff_matching
+PASS: test_pipeops_matching
+PASS: test_no_match_fallback
+
+=====================================
+Test Results: 10/10 tests passed
+All tests PASSED!
+```
+
+Failed test example:
+```
+FAIL: test_collective_matching - Tree/Simple should have low cost
+Test Results: 9/10 tests passed
+Some tests FAILED!
+```
+
+## Mock NCCL Implementation
+
+The tests use the actual NCCL header files from the `../nccl/` directory:
+
+- `tuner.h` - Complete NCCL tuner interface and type definitions
+- `common.h` - Common NCCL types and logging functions
+- `err.h` - NCCL error codes
+
+This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation.
+
+## Integration with CI/CD
+
+```bash
+# Install tests for CI/CD pipeline
+make install-test
+
+# Run as part of automated testing
+make test && echo "Tests passed" || echo "Tests failed"
+```
+
+## Memory Testing
+
+The tests can be run with valgrind for memory leak detection:
+
+```bash
+make test-memory
+```
+
+This will detect:
+- Memory leaks
+- Invalid memory access
+- Use of uninitialized memory
+
+## Code Coverage
+
+Generate code coverage reports to ensure comprehensive testing:
+
+```bash
+make coverage
+# Creates test_plugin.c.gcov with line-by-line coverage
+```
+
+## Adding New Tests
+
+To add a new test:
+
+1. Create a new test function in `test_plugin.c`:
+```c
+int test_new_feature() {
+  // Test setup
+  TEST_ASSERT(condition, "description");
+  // Test cleanup
+  TEST_PASS();
+}
+```
+
+2. Add the test to the main function:
+```c
+total++; passed += test_new_feature();
+```
+
+3. Rebuild and run:
+```bash
+make test
+```
+
+## Debugging Tests
+
+For debugging failed tests:
+
+```bash
+# Compile with debug symbols
+make CFLAGS="-g -O0 -DDEBUG"
+
+# Run with gdb
+gdb ./test_plugin
+```
+
+## Cleaning Up
+
+```bash
+# Remove all build artifacts and temporary files
+make clean
+```
+
+This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
--- a/ext-tuner/example/test/test_plugin.c
+++ b/ext-tuner/example/test/test_plugin.c
@ -0,0 +1,856 @@
+/*************************************************************************
+ * Unit tests for NCCL Tuner Plugin
+ ************************************************************************/
+
+#define _GNU_SOURCE  // Enable setenv/unsetenv and other GNU extensions
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <stdarg.h>
+
+
+// Include NCCL tuner header (which includes common.h and err.h)
+#include "tuner.h"
+
+// Include plugin source for testing
+#include "../plugin.c"
+
+// Test framework macros
+//
+// TEST_ASSERT(condition, message): when `condition` is false, prints
+// "FAIL: <function> - <message>" and returns 0 from the enclosing test
+// function. The early return skips whatever cleanup code follows the
+// assertion in that test.
+#define TEST_ASSERT(condition, message) \
+  do { \
+    if (!(condition)) { \
+      printf("FAIL: %s - %s\n", __func__, message); \
+      return 0; \
+    } \
+  } while(0)
+
+// TEST_PASS(): prints "PASS: <function>" and returns 1 from the enclosing
+// test function.
+#define TEST_PASS() \
+  do { \
+    printf("PASS: %s\n", __func__); \
+    return 1; \
+  } while(0)
+
+// Global test state
+// Incremented by mock_logger() on every call, whether or not it prints.
+static int test_log_count = 0;
+
+// Mock logger function
+//
+// Counts every call in test_log_count and prints the message to stdout only
+// when the NCCL_DEBUG environment variable selects it:
+//   TRACE -> everything, INFO -> levels <= NCCL_LOG_INFO,
+//   WARN  -> levels <= NCCL_LOG_WARN, anything else / unset -> nothing.
+void mock_logger(ncclDebugLogLevel level, unsigned long flags,
+                 const char* file, int line, const char* fmt, ...) {
+  (void)flags; // Suppress unused parameter warning
+  test_log_count++;
+
+  const char* debug_level = getenv("NCCL_DEBUG");
+  if (debug_level == NULL) return;
+
+  int enabled;
+  if (strcmp(debug_level, "TRACE") == 0) {
+    enabled = 1;
+  } else if (strcmp(debug_level, "INFO") == 0) {
+    enabled = (level <= NCCL_LOG_INFO);
+  } else if (strcmp(debug_level, "WARN") == 0) {
+    enabled = (level <= NCCL_LOG_WARN);
+  } else {
+    enabled = 0;
+  }
+  if (!enabled) return;
+
+  // Human-readable name of the log level
+  const char* level_name = "UNKNOWN";
+  switch (level) {
+    case NCCL_LOG_NONE:    level_name = "NONE";    break;
+    case NCCL_LOG_VERSION: level_name = "VERSION"; break;
+    case NCCL_LOG_WARN:    level_name = "WARN";    break;
+    case NCCL_LOG_INFO:    level_name = "INFO";    break;
+    case NCCL_LOG_ABORT:   level_name = "ABORT";   break;
+    case NCCL_LOG_TRACE:   level_name = "TRACE";   break;
+    default: break;
+  }
+
+  // Header, formatted message, then a terminating newline
+  printf("[TUNER:%s:%s:%d] ", level_name, file, line);
+
+  va_list ap;
+  va_start(ap, fmt);
+  vprintf(fmt, ap);
+  va_end(ap);
+
+  putchar('\n');
+}
+
+// Helper function to create test config file
+//
+// Writes `content` verbatim to `filename`, replacing any existing file.
+// Failures are reported on stderr instead of being silently ignored, so that
+// a test that later fails because its config file is missing or incomplete
+// can be diagnosed.
+void create_test_config(const char* filename, const char* content) {
+  FILE* f = fopen(filename, "w");
+  if (f == NULL) {
+    perror(filename);
+    return;
+  }
+
+  int write_failed = (fputs(content, f) == EOF);
+  // Always close the stream, even after a failed write.
+  int close_failed = (fclose(f) == EOF);
+  if (write_failed || close_failed) {
+    fprintf(stderr, "create_test_config: failed to write %s\n", filename);
+  }
+}
+
+// Test 1: Plugin initialization
+// Initializes the plugin with mock_logger, checks that it reports success and
+// hands back a non-NULL context, then destroys that context.
+int test_plugin_init() {
+  void* ctx = NULL;
+
+  ncclResult_t rc = pluginInit(8, 2, mock_logger, &ctx);
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init should succeed");
+  TEST_ASSERT(ctx != NULL, "Context should be allocated");
+
+  pluginDestroy(ctx);
+  TEST_PASS();
+}
+
+// Test 2: Configuration file parsing - valid CSV
+// Writes a config containing comments, an empty line and three well-formed
+// entries, points NCCL_TUNER_CONFIG_FILE at it and expects pluginInit to
+// succeed.
+int test_config_parsing_valid() {
+  static const char config_path[] = "test_valid.conf";
+  static const char config_text[] =
+    "# Test configuration\n"
+    "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"
+    "broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n"
+    "# Comment line\n"
+    "\n"  // Empty line
+    "reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n";
+
+  create_test_config(config_path, config_text);
+
+  // Make the plugin read the file written above
+  setenv("NCCL_TUNER_CONFIG_FILE", config_path, 1);
+
+  void* ctx = NULL;
+  ncclResult_t rc = pluginInit(16, 2, mock_logger, &ctx);
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init with valid config should succeed");
+
+  // Tear down: context, temporary file, environment
+  pluginDestroy(ctx);
+  unlink(config_path);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 3: Configuration file parsing - invalid CSV
+// Feeds the plugin a config whose lines are truncated, name an unknown
+// collective, or carry non-numeric sizes; pluginInit must still succeed.
+int test_config_parsing_invalid() {
+  static const char config_path[] = "test_invalid.conf";
+  static const char config_text[] =
+    "allreduce,0,65536,tree,simple,2,1  # Missing nRanks and other fields\n"
+    "invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n"
+    "broadcast,abc,def,ring,simple,1,1,1,-1,-1\n";  // Invalid numbers
+
+  create_test_config(config_path, config_text);
+  setenv("NCCL_TUNER_CONFIG_FILE", config_path, 1);
+
+  void* ctx = NULL;
+  ncclResult_t rc = pluginInit(8, 1, mock_logger, &ctx);
+  // Initialization is expected to succeed even though no line is usable
+  TEST_ASSERT(rc == ncclSuccess, "Plugin init should succeed even with invalid config");
+
+  // Tear down: context, temporary file, environment
+  pluginDestroy(ctx);
+  unlink(config_path);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 4: Collective type matching
+// Loads one allreduce and one broadcast entry and checks that each collective
+// picks up its own algorithm/protocol (cost forced to 0.0) and channel count.
+int test_collective_matching() {
+  const char* test_config =
+    "allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n"
+    "broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n";
+
+  create_test_config("test_match.conf", test_config);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
+
+  void* context = NULL;
+  // Fail here, with a clear message, rather than on a later cost-table check
+  ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
+  TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
+
+  // Create mock cost table
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0; // Default high cost
+    }
+  }
+
+  // Initialized so the assertions below never read an indeterminate value
+  int nChannels = 0;
+
+  // Test allreduce matching (should match first config)
+  result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                             cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                             0, &nChannels);
+
+  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
+  // %p requires a void* argument, hence the explicit casts
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
+              (void*)&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost");
+  TEST_ASSERT(nChannels == 8, "Should set 8 channels");
+
+  // Test broadcast matching (should match second config)
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0; // Reset costs
+    }
+  }
+
+  result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1,
+                            cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                            0, &nChannels);
+  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
+              (void*)&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost");
+  TEST_ASSERT(nChannels == 4, "Should set 4 channels");
+
+  // Clean up
+  pluginDestroy(context);
+  unlink("test_match.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 5: Size range matching
+// Loads three allreduce entries covering small, medium and large messages and
+// checks that each message size selects the entry whose range contains it.
+int test_size_matching() {
+  const char* test_config =
+    "allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n"
+    "allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n"
+    "allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n";
+
+  create_test_config("test_size.conf", test_config);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
+
+  void* context = NULL;
+  // Fail here, with a clear message, rather than on a later cost-table check
+  ncclResult_t init_result = pluginInit(8, 1, mock_logger, &context);
+  TEST_ASSERT(init_result == ncclSuccess, "Plugin init should succeed");
+
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+  int nChannels = 1;
+
+  // Test small message (should match first config)
+  pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  // %p requires a void* argument, hence the explicit casts below
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
+              (void*)&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost");
+  TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels");
+
+  // Test medium message (should match second config)
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
+              (void*)&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost");
+  TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels");
+
+  // Test large message (should match third config)
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
+              (void*)&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost");
+  TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
+
+  // Clean up
+  pluginDestroy(context);
+  unlink("test_size.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 6: Topology matching
+// Uses one config file with a single-node entry, an exact 4-node/32-rank entry
+// and a wildcard entry, and checks which entry is applied for two plugin
+// contexts created with different rank/node counts.
+int test_topology_matching() {
+  const char* test_config =
+    "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"      // Single node only
+    "allreduce,0,65536,ring,simple,4,4,32,-1,-1\n"      // 4 nodes, 32 ranks exactly
+    "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n";     // Any topology
+
+  create_test_config("test_topo.conf", test_config);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1);
+
+  // Test with single node setup
+  void* context1 = NULL;
+  pluginInit(8, 1, mock_logger, &context1);  // 8 ranks, 1 node
+
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  // Initialized so the assertions below never read an indeterminate value
+  int nChannels = 0;
+  pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
+  TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
+
+  pluginDestroy(context1);
+
+  // Test with 4 nodes, 32 ranks setup
+  void* context2 = NULL;
+  pluginInit(32, 4, mock_logger, &context2);  // 32 ranks, 4 nodes
+
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config");
+  TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels");
+
+  // Clean up (the second context must be released as well)
+  pluginDestroy(context2);
+  unlink("test_topo.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 7: Default channels behavior (-1)
+// A config entry with channels = -1 must still select its algorithm/protocol,
+// and the test expects nChannels to come back as 1 rather than the sentinel
+// value it was seeded with.
+int test_default_channels() {
+  static const char config_path[] = "test_default.conf";
+  static const char config_text[] =
+    "allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n";  // Use default channels
+
+  create_test_config(config_path, config_text);
+  setenv("NCCL_TUNER_CONFIG_FILE", config_path, 1);
+
+  void* ctx = NULL;
+  pluginInit(8, 1, mock_logger, &ctx);
+
+  // Cost table with every entry at 1.0, plus the row-pointer view the API takes
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_rows[NCCL_NUM_ALGORITHMS];
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; algo++) {
+    cost_rows[algo] = costs[algo];
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; proto++) {
+      costs[algo][proto] = 1.0;
+    }
+  }
+
+  int nChannels = 99;  // Sentinel that the call is expected to overwrite
+  pluginGetCollInfo(ctx, ncclFuncAllReduce, 32768, 1,
+                    cost_rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+
+  TEST_ASSERT(costs[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol");
+  TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
+
+  // Tear down: context, temporary file, environment
+  pluginDestroy(ctx);
+  unlink(config_path);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 8: rules are selected according to the regBuff argument
+int test_regbuff_matching() {
+  // Three allreduce rules over the same size range, differing only in the
+  // last (regBuff) column: 1, 0 and -1.
+  const char* config_text =
+    "allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n"      // Registered buffers only
+    "allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n"      // Non-registered buffers only
+    "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n";     // Any buffer type (backward compatible)
+
+  create_test_config("test_regbuff.conf", config_text);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
+
+  void* plugin = NULL;
+  pluginInit(8, 1, mock_logger, &plugin);
+
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* rows[NCCL_NUM_ALGORITHMS];
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    rows[algo] = cost_table[algo];
+  }
+
+  int nChannels;
+
+  // --- regBuff = 1: expect the tree/simple rule with 2 channels ---
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      cost_table[algo][proto] = 1.0;
+    }
+  }
+  pluginGetCollInfo(plugin, ncclFuncAllReduce, 32768, 1,
+                    rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    1, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost");
+  TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels");
+
+  // --- regBuff = 0: expect the ring/simple rule with 4 channels ---
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      cost_table[algo][proto] = 1.0;
+    }
+  }
+  pluginGetCollInfo(plugin, ncclFuncAllReduce, 32768, 1,
+                    rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost");
+  TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels");
+
+  // --- regBuff = 2 (neither 0 nor 1): expect the wildcard ring/ll128 rule with 8 channels ---
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      cost_table[algo][proto] = 1.0;
+    }
+  }
+  pluginGetCollInfo(plugin, ncclFuncAllReduce, 32768, 1,
+                    rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    2, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost");
+  TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
+
+  // Tear down the plugin and remove the temporary config file.
+  pluginDestroy(plugin);
+  unlink("test_regbuff.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 9: rules are selected according to the numPipeOps argument
+int test_pipeops_matching() {
+  // Three allreduce rules over the same size range, differing only in the
+  // ninth (numPipeOps) column: 1, 4 and -1.
+  const char* config_text =
+    "allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n"      // Single pipeline op
+    "allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n"      // Multiple pipeline ops
+    "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n";     // Any pipeline ops (backward compatible)
+
+  create_test_config("test_pipeops.conf", config_text);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
+
+  void* plugin = NULL;
+  pluginInit(8, 1, mock_logger, &plugin);
+
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* rows[NCCL_NUM_ALGORITHMS];
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    rows[algo] = cost_table[algo];
+  }
+
+  int nChannels;
+
+  // --- numPipeOps = 1: expect the tree/simple rule with 2 channels ---
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      cost_table[algo][proto] = 1.0;
+    }
+  }
+  pluginGetCollInfo(plugin, ncclFuncAllReduce, 32768, 1,
+                    rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost");
+  TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels");
+
+  // --- numPipeOps = 4: expect the ring/simple rule with 4 channels ---
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      cost_table[algo][proto] = 1.0;
+    }
+  }
+  pluginGetCollInfo(plugin, ncclFuncAllReduce, 32768, 4,
+                    rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost");
+  TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels");
+
+  // --- numPipeOps = 2 (named by no rule): expect the wildcard ring/ll128 rule with 8 channels ---
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      cost_table[algo][proto] = 1.0;
+    }
+  }
+  pluginGetCollInfo(plugin, ncclFuncAllReduce, 32768, 2,
+                    rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any pipeOps: Ring/LL128 should have low cost");
+  TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
+
+  // Tear down the plugin and remove the temporary config file.
+  pluginDestroy(plugin);
+  unlink("test_pipeops.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 10: No matching configuration (fallback behavior)
+//
+// The config file only contains a broadcast rule, so an allreduce query must
+// not match anything: the cost table is expected to come back unmodified
+// (still 1.0) and nChannels is expected to be 1.
+int test_no_match_fallback() {
+  const char* test_config =
+    "broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n";  // Only broadcast config
+
+  create_test_config("test_fallback.conf", test_config);
+  setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
+
+  void* context = NULL;
+  pluginInit(8, 1, mock_logger, &context);
+
+  // Every (algorithm, protocol) cost starts at 1.0; cost_table_ptr is the
+  // array-of-rows view handed to the plugin.
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0;
+    }
+  }
+
+  // Initialized to a sentinel (as in test_default_channels) so the check below
+  // never reads an indeterminate value if the call leaves it untouched.
+  int nChannels = 99;
+  // Try allreduce (should not match, use fallback)
+  pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                    cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                    0, &nChannels);
+
+  // The logged expectation matches the assertion below (1.0 == untouched);
+  // %p requires a void* argument, hence the cast.
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 1.0)",
+              (void*)&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+  TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should pass cost table through unmodified");
+  TEST_ASSERT(nChannels == 1, "Should use default channels");
+
+  // Clean up
+  pluginDestroy(context);
+  unlink("test_fallback.conf");
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+  TEST_PASS();
+}
+
+// Test 11: Large configuration files (testing dynamic allocation)
+//
+// Writes 500 generated rules to a temporary config file, initializes the
+// plugin from it, checks the counters and array held in the TunerContext,
+// sanity-checks every loaded rule, and finally issues one pluginGetCollInfo
+// query to confirm lookups still succeed.
+int test_large_config() {
+  const char* large_config_file = "test_large.conf";
+
+  // Create a large configuration file with many entries
+  // This tests the dynamic allocation functionality
+  FILE* f = fopen(large_config_file, "w");
+  TEST_ASSERT(f != NULL, "Should be able to create large config file");
+
+  // Write header comment
+  fprintf(f, "# Large configuration file for testing dynamic allocation\n");
+  fprintf(f, "# This file contains many configurations to test memory allocation\n");
+
+  // Generate a large number of configurations (much more than the old MAX_CONFIGS=100)
+  const int num_configs = 500; // 5x the old static limit
+  const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"};
+  const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"};
+  const char* protocols[] = {"simple", "ll", "ll128"};
+
+  for (int i = 0; i < num_configs; i++) {
+    // Vary the configurations to create realistic test data
+    // (the moduli below equal the lengths of the three name arrays above)
+    const char* coll = collectives[i % 5];
+    const char* algo = algorithms[i % 4];
+    const char* proto = protocols[i % 3];
+
+    size_t min_bytes = (i * 1024) % 1048576; // Multiples of 1KB, always below 1MB
+    size_t max_bytes = min_bytes + 65536;    // 64KB range
+    int channels = (i % 8) + 1;              // 1-8 channels
+    int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes
+    int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks
+    int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps
+    int regBuff = (i % 3) == 0 ? -1 : (i % 2); // Mix of -1, 0, 1
+
+    // Column order: collective, minBytes, maxBytes, algorithm, protocol,
+    // channels, nodes, ranks, pipeOps, regBuff
+    fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n",
+            coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff);
+  }
+
+  fclose(f);
+
+  // Set environment to use our large config file
+  setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1);
+
+  // Initialize plugin with large config (16 and 4 are passed where the other
+  // tests document "ranks" and "nodes" respectively)
+  void* context = NULL;
+  ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
+  TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
+  TEST_ASSERT(context != NULL, "Context should be allocated");
+
+  // Verify that configurations were loaded; maxConfigs == num_configs is the
+  // check that the array was sized to exactly the number of rules written.
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file");
+  TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size");
+  TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated");
+
+  // Test that we can access configurations throughout the array
+  // (This would have failed with the old static MAX_CONFIGS=100 limit)
+  for (int i = 0; i < ctx->numConfigs; i++) {
+    TuningConfig* config = &ctx->configs[i];
+    // Basic sanity checks on the loaded configurations
+    // NOTE(review): the range check assumes ncclFuncBroadcast and
+    // ncclFuncAllReduce bound the five collectives generated above - confirm
+    // against the ncclFunc_t enum.
+    TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce,
+                "Collective type should be valid");
+    TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes");
+    TEST_ASSERT(config->nChannels > 0, "nChannels should be positive");
+  }
+
+  // Test specific configuration access at various indices.
+  // These take the address of an array element, so the NULL checks can only
+  // fail if ctx->configs itself were NULL (already asserted above).
+  // Index 0 (first config)
+  TuningConfig* first_config = &ctx->configs[0];
+  TEST_ASSERT(first_config != NULL, "First config should be accessible");
+
+  // Index in middle
+  TuningConfig* mid_config = &ctx->configs[num_configs / 2];
+  TEST_ASSERT(mid_config != NULL, "Middle config should be accessible");
+
+  // Index near end (this would have crashed with static array of 100)
+  TuningConfig* late_config = &ctx->configs[num_configs - 1];
+  TEST_ASSERT(late_config != NULL, "Last config should be accessible");
+
+  // Log how many configurations were loaded and the size of the array;
+  // this is informational only, nothing is asserted here.
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Memory allocated for %d configurations (%zu bytes total)",
+              ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig));
+
+  // Test that the plugin can still find matching configurations from the large set
+  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
+    cost_table_ptr[i] = cost_table[i];
+    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
+      cost_table[i][j] = 1.0; // Default high cost
+    }
+  }
+
+  int nChannels;
+  // Only the return code is checked; which rule (if any) matched is not asserted
+  result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                            cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                            0, &nChannels);
+  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
+
+  // Clean up
+  pluginDestroy(context);
+  unlink(large_config_file);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+
+  TEST_PASS();
+}
+
+// Test 12: stress test with a very large generated configuration file
+int test_very_large_config_stress() {
+  const char* stress_config_file = "test_stress.conf";
+  const int stress_configs = 2000; // 20x the old static limit
+
+  // Write one comment line followed by stress_configs allreduce rules.
+  FILE* f = fopen(stress_config_file, "w");
+  TEST_ASSERT(f != NULL, "Should be able to create stress test config file");
+
+  fputs("# Stress test configuration with very large number of entries\n", f);
+
+  int line = 0;
+  while (line < stress_configs) {
+    // Rule number `line` starts at line*512 bytes and ends 1024 bytes later.
+    const int lower = line * 512;
+    fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n", lower, lower + 1024);
+    ++line;
+  }
+
+  fclose(f);
+
+  setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1);
+
+  // Load the generated file through the plugin.
+  void* context = NULL;
+  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
+
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations");
+  TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated");
+
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Stress test - loaded %d configurations successfully", stress_configs);
+  mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
+              "Memory usage: %zu bytes for configuration array",
+              stress_configs * sizeof(TuningConfig));
+
+  // Spot-check every 100th entry: it must be an allreduce rule whose lower
+  // bound equals the value written for that position.
+  for (int i = 0; i < stress_configs; i += 100) {
+    TuningConfig* config = ctx->configs + i;
+    TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type");
+    TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes");
+  }
+
+  // Tear down the plugin and remove the temporary config file.
+  pluginDestroy(context);
+  unlink(stress_config_file);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+
+  TEST_PASS();
+}
+
+// Test 13: a config file holding only comments and blank lines
+int test_empty_config() {
+  const char* empty_config_file = "test_empty.conf";
+
+  // Two comment lines followed by two blank lines - no rules at all.
+  const char* contents =
+    "# Empty configuration file\n"
+    "# No actual configurations\n"
+    "\n"
+    "\n";
+  create_test_config(empty_config_file, contents);
+
+  setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
+
+  void* context = NULL;
+  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
+
+  // With nothing to load, both counters are expected to be zero and no array
+  // is expected to be allocated.
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations");
+  TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations");
+  TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config");
+
+  // A query must still succeed when there are no rules to consult.
+  float costs[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float* rows[NCCL_NUM_ALGORITHMS];
+  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
+    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+      costs[algo][proto] = 1.0;
+    }
+    rows[algo] = costs[algo];
+  }
+
+  int channels;
+  result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
+                            rows, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
+                            0, &channels);
+  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
+
+  // Tear down the plugin and remove the temporary config file.
+  pluginDestroy(context);
+  unlink(empty_config_file);
+  unsetenv("NCCL_TUNER_CONFIG_FILE");
+
+  TEST_PASS();
+}
+
+// Test runner function pointer type: a test takes no arguments and returns an
+// int that main() adds to its "passed" counter.
+typedef int (*TestFunction)(void);
+
+// Test registry entry
+typedef struct {
+  const char* name;         // Name matched against command-line arguments by find_test()
+  TestFunction func;        // Function to invoke for this test
+  const char* description;  // One-line summary printed by show_help()
+} TestCase;
+
+// All available tests, in the order they run when no test name is given.
+// The array is terminated by an all-NULL entry; every loop over it stops at
+// the first entry whose name is NULL.
+TestCase test_cases[] = {
+  {"init", test_plugin_init, "Plugin initialization"},
+  {"config-valid", test_config_parsing_valid, "Valid configuration parsing"},
+  {"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"},
+  {"collective", test_collective_matching, "Collective type matching"},
+  {"size", test_size_matching, "Size range matching"},
+  {"topology", test_topology_matching, "Topology matching"},
+  {"channels", test_default_channels, "Default channels behavior"},
+  {"regbuff", test_regbuff_matching, "Registered buffer matching"},
+  {"pipeops", test_pipeops_matching, "Pipeline operations matching"},
+  {"fallback", test_no_match_fallback, "Fallback behavior"},
+  {"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
+  {"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
+  {"empty-config", test_empty_config, "Empty configuration file handling"},
+  {NULL, NULL, NULL} // End marker
+};
+
+// Print usage text: the registered test names with their descriptions,
+// followed by a few example invocations of program_name.
+void show_help(const char* program_name) {
+  printf("Usage: %s [test_name ...]\n\n", program_name);
+
+  puts("Available tests:");
+  for (const TestCase* entry = test_cases; entry->name != NULL; ++entry) {
+    printf("  %-15s - %s\n", entry->name, entry->description);
+  }
+
+  puts("\nExamples:");
+  printf("  %s                    # Run all tests\n", program_name);
+  printf("  %s init               # Run only initialization test\n", program_name);
+  printf("  %s init collective    # Run initialization and collective tests\n", program_name);
+  printf("  %s --help             # Show this help\n", program_name);
+}
+
+// Look up a registered test by its exact name.
+// Returns the test's function pointer, or NULL when no entry has that name.
+TestFunction find_test(const char* name) {
+  const TestCase* entry = test_cases;
+  while (entry->name != NULL) {
+    if (!strcmp(entry->name, name)) {
+      return entry->func;
+    }
+    ++entry;
+  }
+  return NULL;
+}
+
+// Test runner entry point.
+//   no arguments      - run every registered test
+//   --help / -h first - print usage and exit 0
+//   test names        - run just those tests; an unknown name aborts with 1
+// Exit status is 0 only when every executed test reported success.
+int main(int argc, char* argv[]) {
+  // Help is only recognized as the first argument.
+  if (argc > 1) {
+    const char* first = argv[1];
+    if (!strcmp(first, "--help") || !strcmp(first, "-h")) {
+      show_help(argv[0]);
+      return 0;
+    }
+  }
+
+  puts("Running NCCL Tuner Plugin Unit Tests");
+  puts("=====================================");
+
+  int total = 0;
+  int passed = 0;
+
+  if (argc != 1) {
+    // Run the tests named on the command line, in the order given.
+    for (int k = 1; k < argc; ++k) {
+      TestFunction selected = find_test(argv[k]);
+      if (selected == NULL) {
+        printf("ERROR: Unknown test '%s'\n", argv[k]);
+        puts("Use --help to see available tests");
+        return 1;
+      }
+      ++total;
+      passed += selected();
+    }
+  } else {
+    // No arguments - walk the whole registry up to its NULL end marker.
+    for (const TestCase* entry = test_cases; entry->name != NULL; ++entry) {
+      ++total;
+      passed += entry->func();
+    }
+  }
+
+  puts("\n=====================================");
+  printf("Test Results: %d/%d tests passed\n", passed, total);
+
+  if (passed != total) {
+    puts("Some tests FAILED!");
+    return 1;
+  }
+  puts("All tests PASSED!");
+  return 0;
+}
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@ -0,0 +1,165 @@
+#
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Build settings. Every "?=" default below can be overridden from the
+# environment or the make command line.
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+KEEP ?= 0
+DEBUG ?= 0
+ASAN ?= 0
+UBSAN ?= 0
+TRACE ?= 0
+WERROR ?= 0
+PROFAPI ?= 1
+NVTX ?= 1
+RDMA_CORE ?= 0
+NET_PROFILER ?= 0
+MLX5DV ?= 0
+MAX_EXT_NET_PLUGINS ?= 0
+
+NVCC = $(CUDA_HOME)/bin/nvcc
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+# Ask nvcc for its release number (the text between "release " and the next
+# comma of `nvcc --version`); the result is empty when nvcc cannot be found.
+# These three are simply expanded (:=) so the shell pipelines run once when
+# this file is parsed instead of on every later reference to CUDA_MAJOR or
+# CUDA_MINOR. CUDA_HOME and NVCC are already defined above, and command-line
+# overrides of any of these variables still take precedence.
+CUDA_VERSION := $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
+CUDA_MAJOR := $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR := $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
+
+# You should define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+# The CUDA<n>_GENCODE variables hold SASS (code=sm_XX) targets and the
+# CUDA<n>_PTX variables hold PTX (code=compute_XX) targets; the conditional
+# chain further down picks a default combination from the detected CUDA version.
+CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61
+ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
+# SM35 is deprecated from CUDA12.0 onwards
+CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
+endif
+CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
+CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
+CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
+CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
+CUDA12_8_GENCODE = -gencode=arch=compute_100,code=sm_100 \
+                   -gencode=arch=compute_120,code=sm_120
+CUDA13_GENCODE = -gencode=arch=compute_110,code=sm_110
+
+CUDA8_PTX     = -gencode=arch=compute_61,code=compute_61
+CUDA9_PTX     = -gencode=arch=compute_70,code=compute_70
+CUDA11_PTX    = -gencode=arch=compute_80,code=compute_80
+CUDA12_PTX    = -gencode=arch=compute_90,code=compute_90
+# NOTE(review): despite its name, CUDA13_PTX (compute_120) is also used by the
+# CUDA 12.8+ branch below - confirm the naming is intentional.
+CUDA13_PTX    = -gencode=arch=compute_120,code=compute_120
+
+# Each test prefixes the version with "0" so an empty CUDA_MAJOR/CUDA_MINOR
+# (nvcc not found) still gives `test` a number to compare. Because every
+# branch uses ?=, a user-supplied NVCC_GENCODE always wins.
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Prior to SM75 is deprecated from CUDA13.0 onwards
+  NVCC_GENCODE ?= $(CUDA10_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
+# Include Hopper support if we're using CUDA11.8 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+# Include sm_80 support if we're using CUDA11 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
+# Include Volta support if we're using CUDA9 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
+else
+# Anything older (or an undetected CUDA version) gets the CUDA8 set only
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
+endif
+$(info NVCC_GENCODE is ${NVCC_GENCODE})
+
+# CUDA 13.0 requires c++17
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+  CXXSTD ?= -std=c++17
+else
+  CXXSTD ?= -std=c++11
+endif
+
+# The project's own flags come first; whatever CXXFLAGS the caller already
+# provided is appended at the end.
+CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
+              -Wall -Wno-unused-function -Wno-sign-compare $(CXXSTD) -Wvla \
+              -I $(CUDA_INC) -I $(CUDA_INC)/cccl \
+              $(CXXFLAGS)
+# maxrregcount must be kept in step with NCCL_MAX_NTHREADS (otherwise kernel launches will fail).
+# NCCL_MAX_NTHREADS : maxrregcount
+# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
+# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all
+# NOTE(review): an earlier comment here mentioned addprefix "so that we can
+# specify more than one path", but the line below passes CUDA_LIB as a single
+# -L path - confirm whether multi-path support is still intended.
+NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+
+########## GCOV ##########
+# Coverage instrumentation is disabled by default.
+# The explanatory comments sit on their own lines: a comment placed after the
+# value on an assignment line leaves the whitespace in front of it inside the
+# variable's value.
+GCOV ?= 0
+# --coverage is enabled only when neither GCOV nor DEBUG is 0, i.e. a coverage
+# build must also be a debug build (for example GCOV=1 DEBUG=1).
+GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage)
+CXXFLAGS  += ${GCOV_FLAGS}
+# nvcc needs host-compiler options forwarded with -Xcompiler.
+NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
+LDFLAGS   += ${GCOV_FLAGS}
+NVLDFLAGS   += ${GCOV_FLAGS:%=-Xcompiler %}
+# $(warning GCOV_FLAGS=${GCOV_FLAGS})
+########## GCOV ##########
+
+# Optimization level: -O3 for DEBUG=0, otherwise unoptimized builds with
+# debug information for both host (-g -ggdb3) and device (-G) code.
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3
+CXXFLAGS  += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+# Make sure to run with ASAN_OPTIONS=protect_shadow_gap=0 otherwise CUDA will fail with OOM
+ifneq ($(ASAN), 0)
+CXXFLAGS += -fsanitize=address
+LDFLAGS += -fsanitize=address -static-libasan
+NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
+endif
+
+# Undefined-behavior sanitizer, wired up the same way as ASAN above
+ifneq ($(UBSAN), 0)
+CXXFLAGS += -fsanitize=undefined
+LDFLAGS += -fsanitize=undefined -static-libubsan
+NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
+endif
+
+# VERBOSE != 0 adds extra warnings and ptxas statistics; VERBOSE=0 instead
+# declares .SILENT so make does not echo recipe commands.
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+CXXFLAGS  += -Wall -Wextra
+else
+.SILENT:
+endif
+
+ifneq ($(TRACE), 0)
+CXXFLAGS  += -DENABLE_TRACE
+endif
+
+# Note the inverted sense: NVTX is on by default and NVTX=0 defines NVTX_DISABLE
+ifeq ($(NVTX), 0)
+CXXFLAGS  += -DNVTX_DISABLE
+endif
+
+ifneq ($(WERROR), 0)
+CXXFLAGS  += -Werror
+endif
+
+# Ask nvcc to keep its intermediate files
+ifneq ($(KEEP), 0)
+NVCUFLAGS += -keep
+endif
+
+ifneq ($(PROFAPI), 0)
+CXXFLAGS += -DPROFAPI
+endif
+
+# NOTE(review): -libverbs and -lmlx5 below are linker options but are appended
+# to CXXFLAGS rather than LDFLAGS - confirm this is intended.
+ifneq ($(RDMA_CORE), 0)
+CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 -libverbs
+endif
+
+ifneq ($(MLX5DV), 0)
+CXXFLAGS += -DNCCL_BUILD_MLX5DV=1 -lmlx5
+endif
+
+ifneq ($(NET_PROFILER), 0)
+CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1
+endif
+
+# A non-zero value is passed through as the NCCL_NET_MAX_PLUGINS macro
+ifneq ($(MAX_EXT_NET_PLUGINS), 0)
+CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS)
+endif
--- a/makefiles/formatting.mk
+++ b/makefiles/formatting.mk
@ -0,0 +1,33 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
+# As this file defines a new target (format), it should be included at least after the definition of the
+# default target.
+#
+# `make format` downloads the astyle sources into $(BUILDDIR)/contrib, builds
+# the astyle binary there, and runs it over $(FILESTOFORMAT).
+
+ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
+ASTYLEDIR := $(BUILDDIR)/contrib
+ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
+ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
+ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
+ASTYLEVER := 3.1
+ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
+
+$(ASTYLEDIR) :
+	@mkdir -p $(ASTYLEDIR)
+
+# The download directory is an order-only prerequisite (after "|"): it has to
+# exist, but its timestamp - which changes whenever something is created
+# inside it, e.g. when the tarball is unpacked - must not make the tarball
+# look out of date and trigger another download.
+# The file is fetched under a temporary name and renamed only on success, so a
+# failed download never leaves a truncated tarball that looks up to date.
+$(ASTYLETAR) : | $(ASTYLEDIR)
+	@wget -q -O $@.tmp $(ASTYLEURL) && mv -f $@.tmp $@
+
+$(ASTYLEBLD) : $(ASTYLETAR)
+	@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
+
+$(ASTYLEBIN) : $(ASTYLEBLD)
+	${MAKE} -C $(ASTYLEBLD)
+
+.PHONY : format
+format : $(ASTYLEBIN)
+	@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@ -0,0 +1,6 @@
+##### version
+# Version components of this NCCL release, one variable per field.
+# Keep any comment on its own line: text after a value on an assignment line
+# would leave trailing whitespace in that value.
+NCCL_MAJOR   := 2
+NCCL_MINOR   := 27
+NCCL_PATCH   := 5
+# Intentionally empty for this release
+NCCL_SUFFIX  :=
+PKG_REVISION := 1
--- a/pkg/Makefile
+++ b/pkg/Makefile
@ -0,0 +1,26 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# default, all, prep, build and clean are command names, not files. Declaring
+# all of them phony keeps them working even if a file or directory with one of
+# those names ever appears in this directory.
+# The <name>.prep/.build/.clean targets are deliberately NOT listed here: they
+# are produced by the pattern rules at the bottom, and phony targets are
+# skipped by make's implicit (pattern) rule search.
+.PHONY : default all prep build clean
+
+# First target in the file, hence what a bare `make` runs
+default : build
+build : debian.build txz.build
+
+BUILDDIR ?= $(abspath ../build)
+# Sub-makes run in a different directory, so always hand them an absolute path
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := debian txz
+all:   ${TARGETS:%=%.build}
+prep:  ${TARGETS:%=%.prep}
+build: ${TARGETS:%=%.build}
+clean: ${TARGETS:%=%.clean}
+
+# <dir>.<action> runs `make <action>` inside the packaging directory <dir>
+%.prep:
+	${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
+
+%.build:
+	${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
+
+%.clean:
+	${MAKE} -C $* clean
--- a/pkg/debian/.gitignore
+++ b/pkg/debian/.gitignore
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@ -0,0 +1,53 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+DEBPREPDIR := $(BUILDDIR)/debian
+PKGDIR  := $(BUILDDIR)/pkg/deb/
+
+# Files to stage in $(DEBPREPDIR): every *.in template (with the suffix
+# dropped) plus a fixed list of files that are copied unchanged.
+DEBGEN_IN  := $(wildcard *.in)
+DEBGEN     := $(DEBGEN_IN:.in=)
+DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
+DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
+
+PKG_TIMESTAMP  := $(shell date -R)
+PKG_ARCH       ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
+PKG_MULTIARCH  ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
+
+# prep, build and clean are commands, not files
+.PHONY : prep build clean
+
+prep : $(DEBTARGETS)
+	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+# "&&" rather than ";" so debuild never runs in the wrong directory when the
+# cd into $(BUILDDIR) fails. debuild writes the .deb files to the parent of
+# $(BUILDDIR); they are then moved into $(PKGDIR).
+build : prep
+	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+	@printf "Building Debian package\n"
+	(cd $(BUILDDIR) && debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
+	mkdir -p $(PKGDIR)
+	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
+
+clean:
+	rm -Rf $(DEBPREPDIR) $(PKGDIR)
+
+# Generate a staged file from its .in template by replacing the ${nccl:*},
+# ${cuda:*} and ${pkg:*} placeholders. "\$$" reaches sed as "\$", i.e. a
+# literal dollar sign.
+$(DEBPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(DEBPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
+	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
+	    $< > $@
+
+# Files without a template are copied as they are
+$(DEBPREPDIR)/% : %
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(DEBPREPDIR)
+	cp -f $< $@
--- a/pkg/debian/changelog.in
+++ b/pkg/debian/changelog.in
@ -0,0 +1,5 @@
+nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
+
+  * Automatic Debian package from build
+
+ -- cudatools <cudatools@nvidia.com>  ${pkg:Timestamp}
--- a/pkg/debian/compat
+++ b/pkg/debian/compat
--- a/pkg/debian/control.in
+++ b/pkg/debian/control.in
@ -0,0 +1,30 @@
+Source: nccl
+Section: libs
+Maintainer: cudatools <cudatools@nvidia.com>
+Priority: optional
+Build-depends: debhelper(>=9)
+Standards-Version: 3.9.5
+
+Package: libnccl${nccl:Major}
+Section: libs
+Architecture: ${pkg:Arch}
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: NVIDIA Collective Communication Library (NCCL) Runtime
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+ communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+ broadcast, and reduce-scatter.
+ It has been optimized to achieve high bandwidth on any platform using PCIe,
+ NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP
+ sockets.
+
+Package: libnccl-dev
+Section: libdevel
+Architecture: ${pkg:Arch}
+Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
+Description: NVIDIA Collective Communication Library (NCCL) Development Files
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+ communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+ broadcast, and reduce-scatter.
+ It has been optimized to achieve high bandwidth on any platform using PCIe,
+ NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP
+ sockets.
--- a/pkg/debian/copyright
+++ b/pkg/debian/copyright
@ -0,0 +1 @@
+../../LICENSE.txt
--- a/pkg/debian/gbp.conf
+++ b/pkg/debian/gbp.conf
@ -0,0 +1,9 @@
+[DEFAULT]
+debian-branch   = master
+upstream-branch = master
+
+ignore-new = True
+
+[git-buildpackage]
+
+no-purge = True
--- a/pkg/debian/libnccl-dev.install.in
+++ b/pkg/debian/libnccl-dev.install.in
@ -0,0 +1,4 @@
+bin/ncclras /usr/bin
+include/nccl.h /usr/include
+lib/libnccl.so /usr/lib/${pkg:MultiArch}
+lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
--- a/pkg/debian/libnccl2.install.in
+++ b/pkg/debian/libnccl2.install.in
@ -0,0 +1,2 @@
+lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
+lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
--- a/pkg/debian/rules
+++ b/pkg/debian/rules
@ -11,3 +11,6 @@ override_dh_auto_test:

 override_dh_auto_clean:
 	# Do not make clean
+
+override_dh_builddeb:
+	dh_builddeb -- -Zxz
--- a/pkg/debian/source/format
+++ b/pkg/debian/source/format
--- a/pkg/redhat/Makefile
+++ b/pkg/redhat/Makefile
@ -0,0 +1,62 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+
+# Directory layout: templates are instantiated into RPMPREPDIR, rpmbuild
+# output goes below PKGDIR.
+BUILDDIR   ?= $(abspath ../../build)
+RPMPREPDIR := $(BUILDDIR)/redhat
+PKGDIR     := $(BUILDDIR)/pkg/rpm/
+
+# Every "<name>.in" template in this directory yields "$(RPMPREPDIR)/<name>".
+RPMGEN_IN  := $(wildcard *.in)
+RPMGEN     := $(patsubst %.in,%,$(RPMGEN_IN))
+RPMFILES   := $(RPMGEN)
+RPMTARGETS := $(addprefix $(RPMPREPDIR)/,$(RPMFILES))
+
+# Values substituted into the templates.
+ARCH           := $(shell uname -m)
+PKG_TIMESTAMP  := $(shell date -R)
+PKG_ARCH       ?= $(shell uname -m)
+PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
+ifeq (,$(PKG_MULTIARCH))
+# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
+PKG_MULTIARCH  := $(ARCH)-linux-gnu
+endif
+
+# None of these goals names a file; declaring them phony keeps a stray file
+# called "prep", "build" or "clean" from masking them.
+.PHONY : prep build clean
+
+# Instantiate the packaging templates, then run the top-level "lic" target.
+prep : $(RPMTARGETS)
+	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+# Build the library, produce the binary tarball that rpmbuild uses as its
+# source directory, then build the binary RPMs from the generated spec file.
+build : prep
+	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+	$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
+	@printf "Building Redhat package\n"
+	mkdir -p $(PKGDIR)
+	rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
+                 --define "_rpmdir $(PKGDIR)" \
+                 --define "_builddir $(PKGDIR)/build/" \
+                 --define "_buildrootdir $(PKGDIR)/buildroot/" \
+                 -bb $(RPMPREPDIR)/nccl.spec
+
+# Remove the instantiated templates and everything rpmbuild produced.
+clean:
+	rm -Rf $(RPMPREPDIR) $(PKGDIR)
+
+# Instantiate a "<name>.in" template by replacing the nccl:*, cuda:* and pkg:*
+# placeholders with their build-time values.
+# The result is written to a temporary file and renamed into place so that a
+# failing sed never leaves a truncated target that looks up to date.
+$(RPMPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(RPMPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
+	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
+	    $< > $@.tmp
+	mv -f $@.tmp $@
+
+# Copy a file from this directory into RPMPREPDIR unchanged.
+$(RPMPREPDIR)/% : %
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(RPMPREPDIR)
+	cp -f $< $@
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@ -0,0 +1,84 @@
+Name:           libnccl
+Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
+Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
+
+Group:          Development/Libraries
+License:        BSD
+URL:            http://developer.nvidia.com/nccl
+Source0:        nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
+# The post-install and post-uninstall scriptlets of this package run ldconfig,
+# so the dependency is declared for those two phases.
+Requires(post,postun): /sbin/ldconfig
+
+%description
+NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+broadcast, and reduce-scatter.
+It has been optimized to achieve high bandwidth on any platform using PCIe,
+NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+sockets.
+
+%package devel
+# Header, unversioned library symlink and the ncclras tool.
+Summary:        NVIDIA Collective Communication Library (NCCL) Development Files
+Group:          Development/Libraries
+Requires:       libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+%description devel
+NCCL development files
+
+%package static
+# Static archive only.
+Summary:        NVIDIA Collective Communication Library (NCCL) Static Library
+Group:          Development/Libraries
+%description static
+NCCL static library
+
+%define debug_package %{nil}
+
+%prep
+%setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
+
+%build
+
+%install
+rm -rf $RPM_BUILD_ROOT
+
+# directory skeleton
+install -m 755 -d $RPM_BUILD_ROOT
+install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
+install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
+install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
+
+# runtime: fully versioned shared library and its soname link
+install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
+ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
+
+# devel: tool, public header and the unversioned link used at link time
+install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
+install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
+ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
+
+# static: archive
+install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
+
+
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+
+%post devel -p /sbin/ldconfig
+%postun devel -p /sbin/ldconfig
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files devel
+# Development package: ncclras tool, public header, unversioned library link.
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_bindir}/ncclras
+%{_includedir}/nccl.h
+%{_libdir}/libnccl.so
+
+%files static
+# Static package: the static archive only.
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_libdir}/libnccl_static.a
+
+%files
+# Runtime package: soname link and the fully versioned shared library.
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_libdir}/libnccl.so.${nccl:Major}
+%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+
+%changelog
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@ -0,0 +1,40 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+
+# Directory layout: templates are instantiated into TXZPREPDIR, the finished
+# source tarball is moved into PKGDIR.
+BUILDDIR   ?= $(abspath ../../build)
+TXZPREPDIR := $(BUILDDIR)/srctxz
+PKGDIR     := $(BUILDDIR)/pkg/srctxz/
+
+# Every "<name>.in" template in this directory yields "$(TXZPREPDIR)/<name>".
+TXZGEN_IN  := $(wildcard *.in)
+TXZGEN     := $(patsubst %.in,%,$(TXZGEN_IN))
+TXZTARGETS := $(addprefix $(TXZPREPDIR)/,$(TXZGEN))
+
+PKG_ARCH       := $(shell uname -m)
+PKG_REVISION   ?= 3
+
+# None of these goals names a file.
+.PHONY: prep build clean
+
+# Instantiate the script template(s).
+prep: $(TXZTARGETS)
+
+# Clean the source tree, run the generated packaging script from BUILDDIR and
+# collect the tarball it writes two levels above BUILDDIR.
+build: prep
+	$(MAKE) -C ../../src clean
+	@printf "Building source tar.xz package\n"
+	# "&&" so the script never runs from the wrong directory if cd fails
+	(cd $(BUILDDIR) && bash srctxz/create_srctxz.sh)
+	mkdir -p $(PKGDIR)
+	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
+
+# Remove the instantiated templates and the produced package.
+clean:
+	rm -Rf $(TXZPREPDIR) $(PKGDIR)
+
+# Instantiate a "<name>.in" template by replacing the nccl:* and pkg:*
+# placeholders with their build-time values.
+# The result is written to a temporary file and renamed into place so that a
+# failing sed never leaves a truncated target that looks up to date.
+$(TXZPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(TXZPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    $< > $@.tmp
+	mv -f $@.tmp $@
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Packs the directory above the current one into
+# nccl-src_<major>.<minor>.<patch><suffix>-<revision>.txz, written next to
+# that directory. The nccl:* and pkg:* placeholders below are filled in when
+# this template is instantiated.
+
+# To run from $BUILDDIR/
+
+# Stop if the directory change fails: git clean must not run elsewhere.
+cd .. || exit 1
+NCCLDIR=$(basename "$PWD")
+
+echo "Checking for unclean directory ..."
+git clean -x -i
+echo "Clean done."
+echo "Checking for uncommited files ..."
+# Test for non-empty output rather than comparing a "wc -l" count as a string:
+# some wc implementations pad the number with blanks.
+if [ -n "$(git status -s)" ]; then
+  git status -s
+  echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
+  read
+fi
+
+cd .. || exit 1
+NCCL_MAJOR=${nccl:Major}
+NCCL_MINOR=${nccl:Minor}
+NCCL_PATCH=${nccl:Patch}
+NCCL_SUFFIX=${nccl:Suffix}
+NCCL_BUILD=${pkg:Revision}
+
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
+
+# Rename the top-level directory inside the archive to the package name and
+# leave out build output, git metadata and this packaging directory.
+tar --exclude build \
+    --exclude ".git*" \
+    --exclude pkg/srctxz \
+    --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$NCCLDIR"
--- a/pkg/txz/Makefile
+++ b/pkg/txz/Makefile
@ -0,0 +1,43 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+
+# Directory layout: templates are instantiated into TXZPREPDIR, the finished
+# binary tarball is moved into PKGDIR.
+BUILDDIR   ?= $(abspath ../../build)
+TXZPREPDIR := $(BUILDDIR)/txz
+PKGDIR     := $(BUILDDIR)/pkg/txz/
+
+# Every "<name>.in" template in this directory yields "$(TXZPREPDIR)/<name>".
+TXZGEN_IN  := $(wildcard *.in)
+TXZGEN     := $(patsubst %.in,%,$(TXZGEN_IN))
+TXZTARGETS := $(addprefix $(TXZPREPDIR)/,$(TXZGEN))
+
+PKG_ARCH   := $(shell uname -m)
+
+# None of these goals names a file.
+.PHONY: prep build clean
+
+# Instantiate the script template(s), then run the top-level "lic" target.
+prep: $(TXZTARGETS)
+	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+# Build the library, run the generated packaging script from BUILDDIR and
+# collect the tarball it writes one level above BUILDDIR.
+build: prep
+	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+	@printf "Building tar.xz package\n"
+	# "&&" so the script never runs from the wrong directory if cd fails
+	(cd $(BUILDDIR) && bash txz/create_txz.sh)
+	mkdir -p $(PKGDIR)
+	mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
+
+# Remove the instantiated templates and the produced package.
+clean:
+	rm -Rf $(TXZPREPDIR) $(PKGDIR)
+
+# Instantiate a "<name>.in" template by replacing the nccl:*, cuda:* and pkg:*
+# placeholders with their build-time values.
+# The result is written to a temporary file and renamed into place so that a
+# failing sed never leaves a truncated target that looks up to date.
+$(TXZPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(TXZPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+	    $< > $@.tmp
+	mv -f $@.tmp $@
--- a/pkg/txz/create_txz.sh.in
+++ b/pkg/txz/create_txz.sh.in
@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Packs bin/, include/, lib/ and the *.txt files of the current directory into
+# nccl_<version>-<revision>+cuda<major>.<minor>_<arch>.txz, written to the
+# parent directory. The nccl:*, cuda:* and pkg:* placeholders below are filled
+# in when this template is instantiated.
+
+# To run from $BUILDDIR/
+
+BUILDDIR=$(basename "$PWD")
+
+# Stop if the directory change fails: the relative paths below depend on it.
+cd .. || exit 1
+NCCL_MAJOR=${nccl:Major}
+NCCL_MINOR=${nccl:Minor}
+NCCL_PATCH=${nccl:Patch}
+NCCL_SUFFIX=${nccl:Suffix}
+CUDA_MAJOR=${cuda:Major}
+CUDA_MINOR=${cuda:Minor}
+PKG_REVISION=${pkg:Revision}
+PKG_ARCH=${pkg:Arch}
+
+NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
+
+# Rename the top-level directory inside the archive to the package name.
+# Paths are quoted so a build directory containing blanks still works; the
+# glob itself stays outside the quotes.
+tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$BUILDDIR/bin" "$BUILDDIR/include" "$BUILDDIR/lib" "$BUILDDIR"/*.txt
--- a/src/Makefile
+++ b/src/Makefile
@ -0,0 +1,159 @@
+#
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+include ../makefiles/common.mk
+include ../makefiles/version.mk
+
+##### src files
+# INCEXPORTS:  headers exported to $(INCDIR).
+# LIBSRCFILES: sources compiled into the library objects; ras/client.cc is
+#              filtered out because it is listed in BINSRCFILES instead.
+# BINSRCFILES: sources compiled into the binary objects.
+INCEXPORTS  := nccl.h
+LIBSRCFILES := \
+	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
+	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
+	$(wildcard graph/*.cc) \
+	$(wildcard misc/*.cc) \
+	$(wildcard transport/*.cc) \
+	$(wildcard register/*.cc) \
+	$(wildcard plugin/*.cc) \
+	$(wildcard plugin/net/*.cc) \
+	$(wildcard plugin/tuner/*.cc) \
+	$(wildcard plugin/profiler/*.cc) \
+	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
+BINSRCFILES := ras/client.cc
+
+##### lib files
+LIBNAME     := libnccl.so
+STATICLIBNAME := libnccl_static.a
+##### binaries
+BINNAME := ncclras
+##### pkgconfig files
+PKGCONFIGFILE := nccl.pc
+##### dirs
+BUILDDIR ?= $(abspath ../build)
+INCDIR := $(BUILDDIR)/include
+LIBDIR := $(BUILDDIR)/lib
+OBJDIR := $(BUILDDIR)/obj
+PKGDIR := $(BUILDDIR)/lib/pkgconfig
+BINDIR := $(BUILDDIR)/bin
+##### target files
+CUDARTLIB  ?= cudart_static
+
+# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658
+# The assignment is indented with spaces, not a tab: a tab-led line is taken
+# as a recipe line whenever a rule is in scope (e.g. one ending an included
+# makefile).
+ifeq ($(CUDARTLIB), cudart_static)
+  LIBSRCFILES += enhcompat.cc
+endif
+
+# Names derived from the lists above: installed headers, versioned library
+# names, object files and their dependency files.
+INCTARGETS      := $(addprefix $(INCDIR)/,$(INCEXPORTS))
+LIBSONAME       := $(addsuffix .$(NCCL_MAJOR),$(LIBNAME))
+LIBTARGET       := $(addsuffix .$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
+STATICLIBTARGET := $(STATICLIBNAME)
+PKGTARGET       := $(PKGCONFIGFILE)
+LIBOBJ          := $(patsubst %.cc,$(OBJDIR)/%.o,$(LIBSRCFILES))
+BINOBJ          := $(patsubst %.cc,$(OBJDIR)/%.o,$(BINSRCFILES))
+DEPFILES        := $(patsubst %.o,%.d,$(LIBOBJ)) $(patsubst %.o,%.d,$(BINOBJ))
+LDFLAGS         += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
+INCPLUGIN       := include/plugin
+
+# File read with cat on the library link and archive command lines.
+DEVMANIFEST := $(BUILDDIR)/obj/device/manifest
+
+##### rules
+# These goals are names, not files; declaring them phony keeps a file or
+# directory of the same name (e.g. "build" or "lib") from masking them.
+.PHONY : build lib staticlib binary ALWAYS_REBUILD
+
+# Default goal: shared library, static library and the binary.
+build : lib staticlib binary
+
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
+
+staticlib : $(LIBDIR)/$(STATICLIBTARGET)
+
+binary : $(BINDIR)/$(BINNAME)
+
+# The device sub-make is always entered; it decides itself what is stale.
+$(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS)
+	$(MAKE) -C ./device
+
+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
+# Pull in the per-object dependency files if they exist already.
+-include $(DEPFILES)
+$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
+
+# Generate the public header from its template. NCCL_VERSION is the version
+# code computed from major, minor and patch (formula on the next comment line).
+# The result is written to a temporary file and renamed into place so that a
+# failing sed never leaves a truncated header that looks up to date.
+$(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
+# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
+	@$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
+	mkdir -p $(INCDIR)
+	@printf "Generating %-35s > %s\n" $< $@
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
+	    $< > $@.tmp
+	mv -f $@.tmp $@
+
+# Shared library: host objects plus the files listed in the device manifest.
+# The soname link and the unversioned link are refreshed next to it.
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
+	@printf "Linking    %-35s > %s\n" $(@F) $@
+	mkdir -p $(@D)
+	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS)
+	ln -sf $(LIBSONAME) $(@D)/$(LIBNAME)
+	ln -sf $(LIBTARGET) $(@D)/$(LIBSONAME)
+
+# Static library: same inputs, archived with ar.
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
+	@printf "Archiving  %-35s > %s\n" $(@F) $@
+	mkdir -p $(@D)
+	ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST))
+
+# Binary: linked from its own objects only.
+$(BINDIR)/$(BINNAME): $(BINOBJ)
+	@printf "Linking    %-35s > %s\n" $(@F) $@
+	mkdir -p $(@D)
+	$(CXX) $(CXXFLAGS) $^ -o $@
+
+# Generate the pkg-config file from its template, filling in the install
+# prefix and the version numbers.
+# The result is written to a temporary file and renamed into place so that a
+# failing sed never leaves a truncated target that looks up to date.
+$(PKGDIR)/nccl.pc : nccl.pc.in
+	mkdir -p $(PKGDIR)
+	@printf "Generating %-35s > %s\n" $< $@
+	sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
+	    -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    $< > $@.tmp
+	mv -f $@.tmp $@
+
+# Copy a header from this directory into INCDIR with mode 644.
+$(INCDIR)/%.h : %.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)
+	install -m 644 $< $@
+
+# Same, for nccl_*.h headers that live under include/.
+$(INCDIR)/nccl_%.h : include/nccl_%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)
+	install -m 644 $< $@
+
+# Copy a ready-made .pc file from this directory into PKGDIR with mode 644.
+$(PKGDIR)/%.pc : %.pc
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(PKGDIR)
+	install -m 644 $< $@
+
+# Compile one .cc file into an object and write its .d dependency file.
+# Steps of the recipe, in order:
+#   1. compile the source;
+#   2. run the compiler again with -M and store the raw dependency list in
+#      <object>.d.tmp;
+#   3. first sed: replace the target name at the start of that list with the
+#      full object path and write the result to <object>.d;
+#   4. second pipeline: drop the target part and the line continuations, put
+#      one prerequisite per line (fmt -1) and append ":" to each, so every
+#      prerequisite also appears as a target without recipe (presumably so a
+#      removed header does not stop the build);
+#   5. delete the temporary file.
+$(OBJDIR)/%.o : %.cc $(INCTARGETS)
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp)
+	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
+	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
+                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
+	@rm -f $(@:%.o=%.d.tmp)
+
+# "clean" and "install" are commands, not files.
+.PHONY : clean install
+
+# Clean the device sub-build, then remove every output directory of this one.
+clean :
+	$(MAKE) -C device clean
+	rm -rf ${BINDIR} ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
+
+# Copy the build results below PREFIX; -P keeps the library symlinks as links.
+install : build
+	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/lib/pkgconfig
+	mkdir -p $(PREFIX)/include
+	mkdir -p $(PREFIX)/bin
+	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
+	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+	cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/
+
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
+# Note that formatting.mk defines a new target so in order to not overwrite the default target,
+# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
+# as the BUILDDIR variable.
+include ../makefiles/formatting.mk
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@ -1,203 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void AllGatherKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ T* sharedNextOutput;
-  __shared__ DevRing<T> ring;
-  bool pushrecv = args.pushrecv;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-    if (pushrecv) {
-      *ring.sendPtrToPrev = (T*)args.ThisOutput;
-      Wait([=] {
-        return *ring.recvPtrFromNext != nullptr;
-      });
-      sharedNextOutput = *ring.recvPtrFromNext;
-      *ring.recvPtrFromNext = nullptr;
-    }
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
-
-  const int size = args.N;
-  const int nranks = args.nRanks;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int poffset, noffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
-    /////////////// begin AllGather steps ///////////////
-    int offset;
-    int maxOffset = size-chunkOffset;
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring.userRank[0];
-    offset = chunkOffset + rankDest * size;
-
-    if (thisInput == thisOutput) {
-      Prims::Copy(
-          thisInput  + offset,
-          pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-    } else {
-      Prims::DoubleCopy(
-          thisInput  + chunkOffset,
-          thisOutput + offset,
-          pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-    }
-
-    NEXT_STEP; // Increases step, poffset, noffset
-
-    // k-2 steps: copy to next GPU
-    if (pushrecv) {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring.userRank[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::Copy(
-            thisOutput + offset,
-            sharedNextOutput + offset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring.userRank[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::DoubleCopy(
-            prevInput + poffset,
-            thisOutput + offset,
-            nextOutput + noffset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ring.userRank[1];
-      offset = chunkOffset + rankDest * size;
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    // Wait for last update from next then reset the flag
-    waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-    *ring.recvFlagFromNext = 0;
-
-    // Wait for last update from prev then reset the flag
-    waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
-    *ring.recvFlagFromPrev = 0;
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 384
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingAllGather(const void* sendbuff, void* recvbuff,
-    const int count, ncclComm* comm, cudaStream_t stream) {
-  if (count == 0)
-    return ncclSuccess;
-
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream));
-  } else {
-    KernelArgs<T> args;
-    ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
-    LAUNCH_KERNEL(AllGatherKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class AllGather {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
-    return RingAllGather<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, int count, ncclDataType_t datatype,
-    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
-    void* recvbuff, ncclComm_t comm, cudaStream_t stream) {
-  return enqueue<AllGather, FuncNull>(sendbuff, recvbuff, count, datatype, 0, comm, stream);
-}
-
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@ -1,233 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void AllReduceKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ T* sharedNextOutput;
-  __shared__ DevRing<T> ring;
-  bool pushrecv = args.pushrecv;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-    if (pushrecv) {
-      *ring.sendPtrToPrev = (T*)args.ThisOutput;
-      Wait([=] {
-        return *ring.recvPtrFromNext != nullptr;
-      });
-      sharedNextOutput = *ring.recvPtrFromNext;
-      *ring.recvPtrFromNext = nullptr;
-    }
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
-  const int size = args.N;
-  const int nranks = args.nRanks;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int poffset, noffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int chunkOffset = 0; chunkOffset < size; chunkOffset += nranks*sliceSize) {
-    /////////////// begin AllReduce steps ///////////////
-    int offset;
-    int maxOffset;
-    int slice;
-
-    // step 0: push data to next GPU
-    slice = ring.userRank[nranks-1];
-    offset = chunkOffset + slice * sliceSize;
-    maxOffset = size-offset;
-
-    Prims::Copy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP; // Increases step, poffset, noffset
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      slice = ring.userRank[nranks-j];
-      offset = chunkOffset + slice * sliceSize;
-      maxOffset = size-offset;
-
-      Prims::Reduce(
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-
-    // step k - 1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    slice = ring.userRank[0];
-    offset = chunkOffset + slice * sliceSize;
-    maxOffset = size-offset;
-
-    Prims::ReduceCopy(
-        prevInput  + poffset,
-        thisInput  + offset,
-        pushrecv ? (sharedNextOutput + offset) : (nextOutput + noffset),
-        thisOutput + offset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
-
-    if (pushrecv) {
-      // k-2 steps: copy result to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring.userRank[nranks - j];
-        offset = chunkOffset + slice * sliceSize;
-	maxOffset = size-offset;
-
-        Prims::Copy(
-            thisOutput + offset,
-            sharedNextOutput + offset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-    } else {
-      // k-2 steps: copy result to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring.userRank[nranks - j];
-        offset = chunkOffset + slice * sliceSize;
-	maxOffset = size-offset;
-
-        Prims::DoubleCopy(
-            prevInput + poffset,
-            thisOutput + offset,
-            nextOutput + noffset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      slice = ring.userRank[1];
-      offset = chunkOffset + slice * sliceSize;
-      maxOffset = size-offset;
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    // Wait for last update from next then reset the flag
-    waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-    *ring.recvFlagFromNext = 0;
-
-    // Wait for last update from prev then reset the flag
-    waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
-    *ring.recvFlagFromPrev = 0;
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingAllReduce(const void* sendbuff, void* recvbuff,
-    const int count, ncclComm* comm, cudaStream_t stream) {
-  if (count == 0)
-    return ncclSuccess;
-
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream));
-  } else {
-    KernelArgs<T> args;
-    ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
-    LAUNCH_KERNEL(AllReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template <typename> class RedOp>
-class AllReduce {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
-    return RingAllReduce<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
-  return enqueue<AllReduce>(sendbuff, recvbuff, count, datatype, op, 0, comm, stream);
-}
-
--- a/src/allocator.cc
+++ b/src/allocator.cc
@ -0,0 +1,196 @@
+/*************************************************************************
+ * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "transport.h"
+#include "group.h"
+
+// ncclMemAlloc: allocate `size` bytes of device memory for the current CUDA device and
+// return the address through `ptr`.
+//
+// When compiled against a CUDA runtime >= 12.1 and ncclCuMemEnable() reports true, the
+// buffer is built with the CUDA driver virtual-memory API (cuMemCreate,
+// cuMemAddressReserve, cuMemMap, cuMemSetAccess). In every other case -- older runtime,
+// NULL `ptr`, zero `size`, ncclCudaLibraryInit() failure, or cuMem disabled -- the request
+// is forwarded unchanged to cudaMalloc().
+//
+// Returns `ret`, which starts as ncclSuccess.
+// NOTE(review): CUDACHECK / CUCHECK are defined elsewhere and presumably return an error
+// code straight from this function (bypassing the exit label) -- confirm.
+NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
+ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+
+#if CUDART_VERSION >= 12010
+  size_t memGran = 0;
+  CUdevice currentDev;
+  CUmemAllocationProp memprop = {};
+  CUmemAccessDesc accessDesc = {};
+  CUmemGenericAllocationHandle handle = (CUmemGenericAllocationHandle)-1;
+  int cudaDev;
+  int flag;
+  int dcnt;
+
+  // Invalid arguments are not rejected here: they are handed to cudaMalloc() below.
+  if (ptr == NULL || size == 0) goto fallback;
+
+  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
+
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+
+  if (ncclCuMemEnable()) {
+    // handleSize is the request size after alignment to the allocation granularity.
+    size_t handleSize = size;
+    int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+    // Query device to see if FABRIC handle support is available
+    // (best effort: the result of the query call itself is deliberately ignored).
+    flag = 0;
+    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));
+    if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
+    memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
+    memprop.location.id = currentDev;
+    // Query device to see if RDMA support is available
+    flag = 0;
+    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
+    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
+    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+    CUDACHECK(cudaGetDeviceCount(&dcnt));
+    // ALIGN_SIZE is defined elsewhere; presumably rounds handleSize up to a multiple of memGran.
+    ALIGN_SIZE(handleSize, memGran);
+
+    if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
+      /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
+      CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0));
+      if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
+        requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
+        memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
+        /* Allocate the physical memory on the device */
+        CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
+      } else if (err != CUDA_SUCCESS) {
+        // Catch and report any error from above
+        // (the call is re-issued under CUCHECK so the failure goes through the usual check macro).
+        CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
+      }
+    } else {
+      /* Allocate the physical memory on the device */
+      CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
+    }
+    /* Reserve a virtual address range */
+    CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0));
+    /* Map the virtual address range to the physical allocation */
+    CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0));
+    /* Now allow RW access to the newly mapped memory */
+    // Access is granted to the current device and to every device i for which
+    // cudaDeviceCanAccessPeer(&p2p, i, cudaDev) succeeds and sets p2p.
+    for (int i = 0; i < dcnt; ++i) {
+      int p2p = 0;
+      if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, i, cudaDev) == cudaSuccess) && p2p)) {
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = i;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1));
+      }
+      if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
+    }
+    // NOTE(review): `handle` is not released in this function; presumably the matching
+    // free path drops that reference -- confirm against ncclCuMemFree.
+    goto exit;
+  }
+
+fallback:
+#endif
+  // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc.  That's deliberate though:
+  // we want CUDA to return an error to the caller.
+  // coverity[var_deref_model]
+  CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// ncclMemFree: release a buffer identified by `ptr`.
+//
+// The calling thread's current CUDA device is saved on entry and restored at the exit
+// label. With a CUDA runtime >= 12.1, a non-NULL `ptr` and a successful
+// ncclCudaLibraryInit(), the device ordinal owning `ptr` is queried with
+// cuPointerGetAttribute() and made current; if ncclCuMemEnable() is true the buffer is
+// then freed with ncclCuMemFree(). In every other case the pointer is passed to cudaFree().
+//
+// Returns `ret`, which starts as ncclSuccess.
+// NOTE(review): the *GOTO check macros are defined elsewhere and presumably store the
+// failure code in `ret` before jumping to `fail` -- confirm.
+NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
+ncclResult_t  ncclMemFree(void *ptr) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+  int saveDevice;
+
+  CUDACHECK(cudaGetDevice(&saveDevice));
+#if CUDART_VERSION >= 12010
+  CUdevice ptrDev = 0;
+
+  // A NULL pointer, or an unavailable CUDA driver library, goes straight to cudaFree().
+  if (ptr == NULL) goto fallback;
+  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
+
+  // Switch to the device that owns the allocation before freeing it.
+  CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
+  CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail);
+  if (ncclCuMemEnable()) {
+    NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
+    goto exit;
+  }
+
+fallback:
+#endif
+  CUDACHECKGOTO(cudaFree(ptr), ret, fail);
+
+exit:
+  // Both the success and the failure path restore the caller's device.
+  CUDACHECK(cudaSetDevice(saveDevice));
+  return ret;
+fail:
+  goto exit;
+}
+
+// Create a device allocation of at least `size` bytes and map it at offset
+// comm->symAllocHead through ncclIpcSymmetricMap() and ncclNvlsSymmetricMap(), returning
+// the mapped address through `symPtr`.
+//
+// This is a collective function and should be called by all ranks in the communicator.
+//
+//   comm      - communicator; comm->symAllocHead is aligned to `alignment` and, on success,
+//               advanced by the (granularity-aligned) allocation size.
+//   size      - requested size in bytes; aligned with ALIGN_SIZE to the recommended
+//               allocation granularity of the communicator's device.
+//   alignment - alignment of the allocation offset. Must be a non-zero power of 2; it is
+//               further aligned with ALIGN_SIZE to NCCL_REC_PAGE_SIZE.
+//   symPtr    - receives the mapped address on success and NULL on failure.
+//
+// Returns ncclSuccess on success, ncclInvalidArgument when `alignment` is zero or not a
+// power of 2, or the error produced by the first failing CUDA / NCCL call.
+ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
+  ncclResult_t ret = ncclSuccess;
+  void* regSymAddr = NULL;
+  size_t allocSize = size;
+  size_t granularity;
+  CUdevice cuDev;
+  CUmemAllocationProp memprop = {};
+  CUmemGenericAllocationHandle memHandle;
+
+  // alignment must be a power of 2 as an input: exactly one bit set, i.e. non-zero and
+  // (alignment & (alignment - 1)) == 0. Zero is rejected as well because it would later be
+  // used as the ALIGN_SIZE alignment for comm->symAllocHead.
+  if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
+    WARN("rank %d alignment %zu is not power of 2", comm->rank, alignment);
+    // Report the failure to the caller; leaving ret untouched would return ncclSuccess
+    // together with a NULL *symPtr.
+    ret = ncclInvalidArgument;
+    goto fail;
+  }
+  // temporarily align the alignment to NCCL_REC_PAGE_SIZE
+  ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
+
+  CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
+  memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  memprop.requestedHandleTypes = ncclCuMemHandleType;
+  memprop.location.id = cuDev;
+  CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
+  ALIGN_SIZE(allocSize, granularity);
+
+  CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
+  // NOTE(review): memHandle is not released if one of the mapping steps below fails --
+  // confirm whether the map helpers take ownership of it.
+  ALIGN_SIZE(comm->symAllocHead, alignment);
+  NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, &regSymAddr), ret, fail);
+  NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
+  // Local ranks synchronize before the allocation head is advanced.
+  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
+  comm->symAllocHead += allocSize;
+  *symPtr = regSymAddr;
+
+exit:
+  return ret;
+fail:
+  *symPtr = NULL;
+  goto exit;
+}
+
+// Tear down a symmetric allocation identified by `symPtr`.
+//
+// Nothing is released unless ncclCuMemEnable() is true. Otherwise the communicator's device
+// is made current, the allocation handle behind `symPtr` is looked up and released, the
+// mapped range size is queried, the NVLS and IPC symmetric mappings are freed, and the
+// handle is released a second time. The caller's CUDA device is restored before returning.
+ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
+  ncclResult_t ret = ncclSuccess;
+  CUmemGenericAllocationHandle memHandle;
+  size_t rangeSize = 0;
+  // Falls back to the communicator's device if the query below fails.
+  int callerDev = comm->cudaDev;
+
+  CUDACHECKGOTO(cudaGetDevice(&callerDev), ret, fail);
+  if (!ncclCuMemEnable()) goto exit;
+
+  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
+  // Retaining the handle adds a reference, which is dropped again right away.
+  CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, symPtr), ret, fail);
+  CUCHECKGOTO(cuMemRelease(memHandle), ret, fail);
+  CUCHECKGOTO(cuMemGetAddressRange(NULL, &rangeSize, (CUdeviceptr)symPtr), ret, fail);
+  NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, rangeSize, symPtr), ret, fail);
+  NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, rangeSize, symPtr), ret, fail);
+  // Second release, issued once the mappings have been freed.
+  CUCHECKGOTO(cuMemRelease(memHandle), ret, fail);
+
+exit:
+  CUDACHECK(cudaSetDevice(callerDev));
+  return ret;
+fail:
+  goto exit;
+}
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@ -1,165 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void BroadcastKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ T* sharedNextOutput;
-  __shared__ DevRing<T> ring;
-  bool pushrecv = args.pushrecv;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-    if (pushrecv) {
-      *ring.sendPtrToPrev = (T*)args.ThisOutput;
-      Wait([=] {
-        return *ring.recvPtrFromNext != nullptr;
-      });
-      sharedNextOutput = *ring.recvPtrFromNext;
-      *ring.recvPtrFromNext = nullptr;
-    }
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
-
-  const int size = args.N;
-  const int rank = ring.userRank[0];
-  const int nextRank = ring.userRank[1];
-  const int root = args.root;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int boffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int offset = 0; offset < size; offset += sliceSize) {
-    int maxOffset = size-offset;
-    if (rank == root) {
-      Prims::Copy(
-          thisInput + offset,
-          pushrecv ? sharedNextOutput + offset : nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
-    } else if (nextRank == root) {
-      if (pushrecv) maxOffset = 0; // Only wait for signals
-      Prims::Copy(
-          prevInput  + boffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      if (pushrecv) {
-        Prims::Copy(
-            thisOutput + offset,
-            sharedNextOutput + offset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      } else {
-        Prims::DoubleCopy(
-            prevInput + boffset,
-            thisOutput + offset,
-            nextOutput + boffset,
-	    sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      }
-    }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    if (nextRank != root) {
-      // Wait for last update from next then reset the flag
-      waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-      *ring.recvFlagFromNext = 0;
-    }
-
-    if (rank != root) {
-      // reset the flag
-      *ring.recvFlagFromPrev = 0;
-    }
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 256
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingBroadcast(void* buff, const int count, const int root,
-    ncclComm* comm, cudaStream_t stream) {
-  if (count == 0)
-    return ncclSuccess;
-
-  if (comm->nRanks != 1) {
-    KernelArgs<T> args;
-    ArgsSetup(&args, buff, buff, root, count, comm);
-    LAUNCH_KERNEL(BroadcastKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class Broadcast {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int root, ncclComm* comm, cudaStream_t stream) {
-    return RingBroadcast<RedOp<T>, T>(recvbuff, count, root, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclBcast, void* buff, int count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  return enqueue<Broadcast, FuncNull>(nullptr, buff, count, datatype, root, comm, stream);
-}
-
--- a/src/channel.cc
+++ b/src/channel.cc
@ -0,0 +1,177 @@
+/*************************************************************************
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+#include "gdrwrap.h"
+#include "transport.h"
+
+// Initialize channel `channelId` of `comm`; a no-op when the channel id is already set
+// (channel->id != -1).
+//
+// The channel gets nPeers = nRanks + 1 (CollNet) + comm->localRanks (NVLS) peer slots.
+// Only the first nRanks slots are filled here: host-side they point into the per-channel
+// array held by comm->sharedRes (indexed through comm->topParentRanks), device-side the
+// matching addresses are copied into channel->devPeers and mirrored in
+// channel->devPeersHostPtr. Ring user-rank storage is allocated as well.
+//
+// Returns ncclSuccess, or whatever a failing NCCLCHECK yields.
+// NOTE(review): NCCLCHECK is defined elsewhere and presumably returns immediately on
+// error; a failure after ncclStrongStreamAcquire() would then skip
+// ncclStrongStreamRelease() -- confirm.
+ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
+  struct ncclChannel* channel = &comm->channels[channelId];
+  if (channel->id != -1) return ncclSuccess;
+
+  int nRanks = comm->nRanks;
+  int nvlsRanks = comm->localRanks;
+  int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
+  channel->id = channelId;
+  channel->workFifoProduced = 0;
+
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+  cudaStream_t deviceStream;
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+
+  if (channel->peers == NULL) {
+    // The extra on nRanks+1 is for collnet root (i.e. network)
+    // Allocate everything related to sharedRes with ncclCalloc as this can be
+    // shared between communicators hence should not be tied to comm.
+    if (sharedRes->peers[channelId] == NULL) {
+      NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks));
+    }
+    channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
+    // Each rank's slot aliases the shared peer entry and takes a reference on it.
+    for (int r = 0; r < nRanks; r++) {
+      channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r];
+      ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
+    }
+  }
+
+  if (channel->devPeers == NULL) {
+    if (sharedRes->devPeers[channelId] == NULL) {
+      NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream));
+    }
+    /* channel->devPeers is not shared, so just free it when calling commFree() */
+    NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream));
+    ncclCommPushCudaFree(comm, channel->devPeers);
+    NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers));
+    // Publish the address of each shared device peer entry into the device-side table
+    // and keep a host-side copy of the same address.
+    for (int r = 0; r < nRanks; r++) {
+      uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream));
+      channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr;
+    }
+  }
+
+  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+  NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream));
+  ncclCommPushCudaFree(comm, channel->devRingUserRanks);
+
+  /* guarantee addr has been copied into channel->devPeers */
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
+  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+  return ncclSuccess;
+}
+
+// Set up the NVLS peer slots of channel `channelId`; a no-op when channel->nvlsPeers is
+// already set. initChannel() is run first if the channel has no id yet.
+//
+// The NVLS peers occupy slots [nRanks + 1, nRanks + 1 + comm->localRanks) of
+// channel->peers / channel->devPeers / channel->devPeersHostPtr.
+//   share == true  : the slots alias `parent`'s NVLS peer arrays (indexed through
+//                    comm->topParentLocalRanks) and take a reference on each parent entry.
+//   share == false : fresh host and device NVLS peer arrays are allocated for this channel.
+//
+// Returns ncclSuccess, or whatever a failing NCCLCHECK yields.
+// NOTE(review): NCCLCHECK presumably returns immediately on error, which would skip the
+// stream release below -- confirm.
+ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
+  struct ncclChannel* channel = &comm->channels[channelId];
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+  cudaStream_t deviceStream;
+
+  if (channel->nvlsPeers != NULL)
+    return ncclSuccess;
+
+  if (channel->id == -1)
+    NCCLCHECK(initChannel(comm, channelId));
+
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+
+  int nvlsRanks = comm->localRanks;
+
+  if (share) {
+    channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
+    channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
+    for (int r = 0; r < nvlsRanks; ++r) {
+      // tr: index of local rank r inside the parent's NVLS peer arrays.
+      int tr = comm->topParentLocalRanks[r];
+      uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
+      channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream));
+      channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
+      ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
+    }
+  } else {
+    NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream));
+    for (int r = 0; r < nvlsRanks; ++r) {
+      uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
+      channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream));
+      channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
+      ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount);
+    }
+  }
+
+  // Release the stream, then wait for it so the asynchronous address copies above have
+  // completed before returning.
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
+  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+
+  return ncclSuccess;
+}
+
+// Set up the CollNet peer slot of channel `channelId`; a no-op when
+// channel->collnetPeers is already set. initChannel() is run first if the channel has no
+// id yet.
+//
+// The CollNet peer lives in slot comm->nRanks of channel->peers / channel->devPeers /
+// channel->devPeersHostPtr.
+//   share == true  : the slot aliases `parent`'s CollNet peer and takes a reference on it.
+//   share == false : a fresh host and device CollNet peer is allocated for this channel.
+//
+// Returns ncclSuccess, or whatever a failing NCCLCHECK yields.
+// NOTE(review): NCCLCHECK presumably returns immediately on error, which would skip the
+// stream release below -- confirm.
+ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
+  struct ncclChannel* channel = &comm->channels[channelId];
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+  uintptr_t addr;
+  cudaStream_t deviceStream;
+
+  if (channel->collnetPeers != NULL)
+    return ncclSuccess;
+
+  if (channel->id == -1)
+    NCCLCHECK(initChannel(comm, channelId));
+
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+
+  if (share) {
+    channel->collnetPeers = parent->channels[channelId].collnetPeers;
+    channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers;
+    addr = (uintptr_t)parent->channels[channelId].collnetDevPeers;
+    channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers;
+    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream));
+    channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
+    ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount);
+  } else {
+    NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream));
+    addr = (uintptr_t)channel->collnetDevPeers;
+    channel->peers[comm->nRanks] = channel->collnetPeers;
+    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream));
+    channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
+    ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount);
+  }
+
+  // Release the stream, then wait for it so the asynchronous address copy above has
+  // completed before returning.
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
+  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+
+  return ncclSuccess;
+}
+
+// Drop this channel's references on its peers and free what becomes unreferenced.
+//
+// The peer table holds nRanks + collnetNRanks + nvlsNRanks slots. For every non-NULL slot
+// the peer's reference count is decremented; when it reaches zero, the transport
+// resources of each connection are freed, and the CollNet arrays (slot nRanks) or the NVLS
+// arrays (last slot) are released. The host copy of the device peer table is freed last.
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
+  /* Peers only exist once the async init thread has completed commAlloc() and
+   * initChannel() has run for this channel; otherwise there is nothing to free. */
+  if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;
+
+  const int slotCount = nRanks + collnetNRanks + nvlsNRanks;
+
+  // Free transport proxy resources.
+  // Within each connection the send side is freed before the recv side (CollNet arrangement).
+  for (int slot = 0; slot < slotCount; slot++) {
+    struct ncclChannelPeer* entry = channel->peers[slot];
+    if (!entry) continue;
+    // Another holder still references this peer: leave it alone.
+    if (ncclAtomicRefCountDecrement(&entry->refCount) != 0) continue;
+
+    for (int conn = 0; conn < NCCL_MAX_CONNS; conn++) {
+      if (entry->send[conn].transportComm) {
+        NCCLCHECK(entry->send[conn].transportComm->free(entry->send+conn));
+      }
+      if (entry->recv[conn].transportComm) {
+        NCCLCHECK(entry->recv[conn].transportComm->free(entry->recv+conn));
+      }
+    }
+
+    if (slot == nRanks) {
+      free(channel->collnetPeers);
+      ncclCudaFree(channel->collnetDevPeers);
+    } else if (slot == slotCount - 1) {
+      free(channel->nvlsPeers);
+      ncclCudaFree(channel->nvlsDevPeers);
+    }
+  }
+
+  free(channel->devPeersHostPtr);
+  return ncclSuccess;
+}
--- a/src/collectives.cc
+++ b/src/collectives.cc
@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h" // Need some checks here since we access comm
+#include "collectives.h"
+#include "enqueue.h"
+#include "nccl.h"
+#include "nvtx_payload_schemas.h"
+
+// Human-readable name of a collective / point-to-point function id.
+// Values without a dedicated case yield "Invalid".
+const char* ncclFuncToString(ncclFunc_t fn) {
+  const char* label = "Invalid";
+  switch (fn) {
+  case ncclFuncAllGather:     label = "AllGather"; break;
+  case ncclFuncAllReduce:     label = "AllReduce"; break;
+  case ncclFuncBroadcast:     label = "Broadcast"; break;
+  case ncclFuncRecv:          label = "Recv"; break;
+  case ncclFuncReduce:        label = "Reduce"; break;
+  case ncclFuncReduceScatter: label = "ReduceScatter"; break;
+  case ncclFuncSendRecv:      label = "SendRecv"; break;
+  case ncclFuncSend:          label = "Send"; break;
+  default: break;
+  }
+  return label;
+}
+
+// Human-readable name of a device-side reduction operator.
+// Values without a dedicated case yield "Unknown".
+const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
+  const char* label = "Unknown";
+  switch (op) {
+  case ncclDevSum:        label = "Sum"; break;
+  case ncclDevProd:       label = "Prod"; break;
+  case ncclDevMinMax:     label = "MinMax"; break;
+  case ncclDevPreMulSum:  label = "PreMulSum"; break;
+  case ncclDevSumPostDiv: label = "SumPostDiv"; break;
+  default: break;
+  }
+  return label;
+}
+
+// Human-readable name of an NCCL datatype, spelled like the enumerator itself
+// (e.g. ncclFloat32 -> "ncclFloat32"). Values without a dedicated case yield "Unknown".
+const char* ncclDatatypeToString(ncclDataType_t type) {
+  switch (type) {
+  case ncclInt8: return "ncclInt8";
+  // ncclUint8 is a valid public datatype and must not be reported as "Unknown".
+  case ncclUint8: return "ncclUint8";
+  case ncclInt32: return "ncclInt32";
+  case ncclUint32: return "ncclUint32";
+  case ncclInt64: return "ncclInt64";
+  case ncclUint64: return "ncclUint64";
+  case ncclFloat16: return "ncclFloat16";
+  case ncclFloat32: return "ncclFloat32";
+  case ncclFloat64: return "ncclFloat64";
+  case ncclBfloat16: return "ncclBfloat16";
+  case ncclFloat8e4m3: return "ncclFloat8e4m3";
+  case ncclFloat8e5m2: return "ncclFloat8e5m2";
+  default: return "Unknown";
+  }
+}
+
+// Human-readable name of an NCCL_ALGO_* identifier.
+// Values without a dedicated case yield "Unknown".
+const char* ncclAlgoToString(int algo) {
+  const char* label = "Unknown";
+  switch (algo) {
+  case NCCL_ALGO_TREE:           label = "TREE"; break;
+  case NCCL_ALGO_RING:           label = "RING"; break;
+  case NCCL_ALGO_COLLNET_DIRECT: label = "COLLNET_DIRECT"; break;
+  case NCCL_ALGO_COLLNET_CHAIN:  label = "COLLNET_CHAIN"; break;
+  case NCCL_ALGO_NVLS:           label = "NVLS"; break;
+  case NCCL_ALGO_NVLS_TREE:      label = "NVLS_TREE"; break;
+  case NCCL_ALGO_PAT:            label = "PAT"; break;
+  default: break;
+  }
+  return label;
+}
+
+// Human-readable name of an NCCL_PROTO_* identifier.
+// Values without a dedicated case yield "Unknown".
+const char* ncclProtoToString(int proto) {
+  const char* label = "Unknown";
+  switch (proto) {
+  case NCCL_PROTO_LL:     label = "LL"; break;
+  case NCCL_PROTO_LL128:  label = "LL128"; break;
+  case NCCL_PROTO_SIMPLE: label = "SIMPLE"; break;
+  default: break;
+  }
+  return label;
+}
+
+// Public AllGather entry point: packs the arguments into an ncclInfo record and submits
+// it through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+  // The NVTX payload carries the size of one rank's message, not the total bytes
+  // sent/received; the communicator hash is reported as 0 when comm is NULL.
+  NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype)));
+
+  struct ncclInfo collInfo = {
+    ncclFuncAllGather, "AllGather",
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream,
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS
+  };
+  return ncclEnqueueCheck(&collInfo);
+}
+
+// Public AllReduce entry point: packs the arguments into an ncclInfo record and submits
+// it through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  // NVTX payload: communicator hash (0 when comm is NULL), message size, reduction op.
+  NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op));
+
+  struct ncclInfo collInfo = {
+    ncclFuncAllReduce, "AllReduce",
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream,
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS
+  };
+  return ncclEnqueueCheck(&collInfo);
+}
+
+// Public Broadcast entry point: packs the arguments into an ncclInfo record and submits
+// it through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  // NVTX payload: communicator hash (0 when comm is NULL), message size, root rank.
+  NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));
+
+  struct ncclInfo collInfo = {
+    ncclFuncBroadcast, "Broadcast",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream,
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS
+  };
+  return ncclEnqueueCheck(&collInfo);
+}
+/* Deprecated in-place variant (MPI-like): the single buffer is used as both the send and
+ * the receive buffer of ncclBroadcast(). */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  const void* sendSide = buff;
+  void* recvSide = buff;
+  return ncclBroadcast(sendSide, recvSide, count, datatype, root, comm, stream);
+}
+
+// Public Reduce entry point: packs the arguments into an ncclInfo record and submits it
+// through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  // NVTX payload: communicator hash (0 when comm is NULL), message size, root rank, op.
+  NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op));
+
+  struct ncclInfo collInfo = {
+    ncclFuncReduce, "Reduce",
+    sendbuff, recvbuff, count, datatype, op, root, comm, stream,
+    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS
+  };
+  return ncclEnqueueCheck(&collInfo);
+}
+
+// Public ReduceScatter entry point: packs the arguments into an ncclInfo record and
+// submits it through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  // NVTX payload: communicator hash (0 when comm is NULL), per-rank receive size, op.
+  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op));
+
+  struct ncclInfo collInfo = {
+    ncclFuncReduceScatter, "ReduceScatter",
+    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream,
+    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS
+  };
+  return ncclEnqueueCheck(&collInfo);
+}
+
+// Public point-to-point Send entry point: packs the arguments into an ncclInfo record and
+// submits it through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream) {
+  // NVTX payload: communicator hash (0 when comm is NULL), message size, peer rank.
+  NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer));
+
+  // The record's first buffer slot stays NULL; the user buffer (constness cast away)
+  // goes into the second slot and the peer rank into the slot collectives use for root.
+  struct ncclInfo p2pInfo = {
+    ncclFuncSend, "Send",
+    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream,
+    1, 1
+  };
+  return ncclEnqueueCheck(&p2pInfo);
+}
+
+// Public point-to-point Recv entry point: packs the arguments into an ncclInfo record and
+// submits it through ncclEnqueueCheck(), whose result is returned.
+NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream) {
+  // NVTX payload: communicator hash (0 when comm is NULL), message size, peer rank.
+  NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer));
+
+  // The record's first buffer slot stays NULL; the user buffer goes into the second slot
+  // and the peer rank into the slot collectives use for root.
+  struct ncclInfo p2pInfo = {
+    ncclFuncRecv, "Recv",
+    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream,
+    1, 1
+  };
+  return ncclEnqueueCheck(&p2pInfo);
+}
--- a/src/common_kernel.h
+++ b/src/common_kernel.h
@ -1,530 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COMMON_KERNEL_H_
-#define COMMON_KERNEL_H_
-
-#include <cstdio>
-#include <cstdint>
-
-#include <cuda_runtime.h>
-
-// BAR macro and helpers: WARP_SIZE, ROUNDUP (rounds x up to a multiple of y), and the BAR_EXEC/BAR_EXPAND pair that builds the asm string.
-#define WARP_SIZE 32
-#define ROUNDUP(x, y)                                                           \
-    (((((x) + (y) - 1) / (y))) * (y))
-#define BAR_EXEC(type, barid, nthreads) \
-    asm("bar." #type " " #barid ", " #nthreads ";\n\t")
-#define BAR_EXPAND(type, barid, nthreads) \
-    BAR_EXEC(type, barid, (nthreads))
-
-// Named barrier macro.
-// Expands to asm("bar.type barid, nthreads") where nthreads has been rounded up to a
-// multiple of WARP_SIZE. BAR_EXPAND is the extra macro level that lets ROUNDUP expand before BAR_EXEC stringizes it.
-#define BAR(type, barid, nthreads) \
-    BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
-
-__device__ unsigned int spinct; // device-global counter that Wait() bumps on every failed poll
-
-// Spin wait until func evaluates to true; func() is re-evaluated on every iteration.
-template<typename FUNC>
-__device__ inline void Wait(const FUNC& func) {
-  while (!func()) {
-    // waste time: atomicInc wraps spinct back to 0 once it has reached 10
-    atomicInc(&spinct, 10);
-  }
-}
-
-typedef uint64_t PackType; // 64-bit word in which the MULTI functors receive and return their packed operands
-
-// unpack x and y to elements of type T and apply FUNC to each element; the primary template only declares operator(), the per-type specializations define it
-template<class FUNC, typename T>
-struct MULTI {
-  __device__ PackType operator()(const PackType x, const PackType y) const;
-};
-
-template<class FUNC>
-struct MULTI<FUNC, char> { // char: the 64-bit pack is handled as two uint32_t halves
-  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
-      "PackType must be twice the size of uint32_t.");
-  union converter { // reinterprets one PackType as the pair (a, b)
-    PackType storage;
-    struct {
-      uint32_t a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    // for char, we do these as vector ops: FUNC receives each 32-bit half as a single uint32_t
-    cr.a = FUNC()(cx.a, cy.a);
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage; // both halves re-packed into one 64-bit word
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, int> { // int: the 64-bit pack holds two ints
-  static_assert(sizeof(PackType) == 2 * sizeof(int),
-      "PackType must be twice the size of int.");
-  union converter { // reinterprets one PackType as the pair (a, b)
-    PackType storage;
-    struct {
-      int a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    cr.a = FUNC()(cx.a, cy.a); // FUNC applied to each int independently
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage; // both results re-packed into one 64-bit word
-  }
-};
-
-#ifdef CUDA_HAS_HALF
-template<class FUNC>
-struct MULTI<FUNC, half> { // half: the 64-bit pack is handled as two half2 values
-  static_assert(sizeof(PackType) == 2 * sizeof(float),
-      "PackType must be twice the size of float.");
-  union converter { // reinterprets one PackType as the pair (a, b)
-    PackType storage;
-    struct {
-      half2 a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    cr.a = FUNC()(cx.a, cy.a); // FUNC is invoked with half2 operands here, so it needs a half2 overload
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage; // both results re-packed into one 64-bit word
-  }
-};
-#endif
-
-template<class FUNC>
-struct MULTI<FUNC, float> { // float: the 64-bit pack holds two floats
-  static_assert(sizeof(PackType) == 2 * sizeof(float),
-      "PackType must be twice the size of float.");
-  union converter { // reinterprets one PackType as the pair (a, b)
-    PackType storage;
-    struct {
-      float a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    cr.a = FUNC()(cx.a, cy.a); // FUNC applied to each float independently
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage; // both results re-packed into one 64-bit word
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, double> { // double: one element per 64-bit pack
-  static_assert(sizeof(PackType) == sizeof(double),
-      "PackType must be the same size as double.");
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y)); // operands converted with __longlong_as_double before FUNC is applied
-    return __double_as_longlong(rv); // result converted back with the inverse intrinsic
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, unsigned long long> { // unsigned long long: one element per 64-bit pack
-  static_assert(sizeof(PackType) == sizeof(unsigned long long),
-      "PackType must be the same size as unsigned long long.");
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    unsigned long long rv = FUNC()(x, y); // operands are passed to FUNC as-is, without a cast
-    return rv;
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, long long> { // long long: one element per 64-bit pack
-  static_assert(sizeof(PackType) == sizeof(long long),
-      "PackType must be the same size as long long.");
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    long long rv = FUNC()((long long)x, (long long)y); // operands cast to signed before FUNC is applied
-    return rv; // implicitly converted back to PackType on return
-  }
-};
-
-template<typename T, bool FETCHTWO> // Loads one PackType word from src0 into s0 and, only when FETCHTWO, one from src1 into s1.
-__device__ inline void FetchOneOrTwo64b(PackType& s0,
-    const volatile T * __restrict__ const src0, PackType& s1,
-    const volatile T * __restrict__ const src1, const int idx) {
-  s0 = (reinterpret_cast<const volatile PackType *>(src0))[idx]; // idx counts PackType words, not T elements
-  if (FETCHTWO) {
-    s1 = (reinterpret_cast<const volatile PackType *>(src1))[idx]; // s1 is left untouched when FETCHTWO is false
-  }
-}
-
-template<typename T, bool STORETWO> // Writes val as one PackType word to dest0 and, only when STORETWO, also to dest1.
-__device__ inline void StoreOneOrTwo64b(volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1, PackType val, const int idx) {
-  (reinterpret_cast<volatile PackType *>(dest0))[idx] = val; // idx counts PackType words, not T elements
-  if (STORETWO) {
-    (reinterpret_cast<volatile PackType *>(dest1))[idx] = val; // dest1 is not dereferenced when STORETWO is false
-  }
-}
-
-template<class FUNC, typename T, bool ISREDUCE> // Returns MULTI<FUNC, T>()(s0, s1) when ISREDUCE, otherwise s0 unchanged.
-__device__ inline PackType ReduceOrCopy64b(const PackType s0,
-    const PackType s1) {
-  if (ISREDUCE) {
-    return MULTI<FUNC, T>()(s0, s1); // element-wise FUNC over the packed operands
-  } else {
-    return s0; // plain copy: s1 is ignored
-  }
-}
-
-#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a)) // rounds x up to a multiple of a; the mask arithmetic assumes a is a power of two
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { // Returns ptr rounded up to the next align-byte boundary (ptr itself if already aligned).
-  size_t ptrval = reinterpret_cast<size_t>(ptr); // the rounding is done on the numeric address
-  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
-
-template<typename T> inline __device__
-T vFetch(const volatile T* ptr) { // Reads one element of type T through a volatile pointer.
-  return *ptr;
-}
-
-#ifdef CUDA_HAS_HALF
-template<> inline __device__
-half vFetch<half>(const volatile half* ptr) { // half specialization: copies the x member instead of the whole object
-  half r;
-  r.x = ptr->x; // NOTE(review): presumably because half cannot be copied from a volatile object directly - confirm
-  return r;
-}
-#endif
-
-template<typename T> inline __device__
-void vStore(volatile T* ptr, const T val) { // Writes one element of type T through a volatile pointer.
-  *ptr = val;
-}
-
-#ifdef CUDA_HAS_HALF
-template<> inline __device__
-void vStore<half>(volatile half* ptr, const half val) { // half specialization: assigns the x member instead of the whole object
-  ptr->x = val.x; // NOTE(review): presumably because half cannot be assigned to a volatile object directly - confirm
-}
-#endif
-
-// Copies N elements src0 -> dest0 (and dest1 when HAS_DEST1), or stores FUNC(src0, src1) when HAS_SRC1. Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
-    bool HAS_SRC1>
-__device__ inline void ReduceOrCopy(const int tid,
-    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
-    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
-    int N) {
-  if (N<=0) {
-    return;
-  }
-
-  const int UNROLL2 = (UNROLL >= 2) ? (UNROLL / 2) : 1; // size of one half-batch in the main loop (at least 1)
-  const bool NOUNROLL2 = ((UNROLL / 2) == 0); // true when UNROLL / 2 is 0 (e.g. UNROLL == 1): the second half-batch is skipped
-
-  int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0; // elements before dest0 reaches PackType alignment, or all of N when N is small
-
-  // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
-  // If not, we'll just use the slow preamble path for the whole operation
-  bool alignable = (((AlignUp(src0,  alignof(PackType)) == src0  + Npreamble)) &&
-      (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
-      (!HAS_SRC1  || (AlignUp(src1,  alignof(PackType)) == src1  + Npreamble)));
-
-  if (!alignable) {
-    Npreamble = N; // the element-wise loop below then handles every element
-  }
-
-/*
-  if (threadIdx.x == 0) {
-    printf("** alignable: %s", (alignable ? "YES" : " NO"));
-    printf(", dest0 = 0x%08X", dest0);
-    printf(", src0 = 0x%08X", src0);
-    if (HAS_DEST1) printf(", dest1 = 0x%08X", dest1);
-    if (HAS_SRC1) printf(", src1 = 0x%08X", src1);
-    printf("\n");
-  }
-*/
-
-  // stage 1: preamble: handle any elements up to the point of everything coming
-  // into alignment
-  for (int idx = tid; idx < Npreamble; idx += THREADS) {
-    // ought to be no way this is ever more than one iteration, except when
-    // alignable is false
-    T val = vFetch(src0+idx);
-    if (HAS_SRC1) {
-      val = FUNC()(val, vFetch(src1+idx));
-    }
-
-    vStore(dest0+idx, val);
-    if (HAS_DEST1) {
-      vStore(dest1+idx, val);
-    }
-  }
-
-  // reduce N by however many elements we've handled already
-  int Ndone = Npreamble;
-  int Nrem = N - Ndone;
-
-  // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
-  // assuming the pointers we have are all 64-bit alignable.
-  if (alignable) {
-    if (Ndone > 0) {
-      // align up pointers
-      dest0 += Ndone; if (HAS_DEST1) { dest1 += Ndone; }
-      src0  += Ndone; if (HAS_SRC1)  { src1  += Ndone; }
-    }
-
-    // stage 2a: main loop
-    int Nalign = (Nrem / (sizeof(PackType) / sizeof(T)) / (UNROLL * THREADS))
-        * (UNROLL * THREADS); // round down
-
-    #pragma unroll 1 // don't unroll this loop
-    for (int idx = tid; idx < Nalign; idx += UNROLL * THREADS) { // idx and Nalign count PackType words here
-      PackType t0[UNROLL2];
-      PackType t1[UNROLL2];
-      PackType t2[UNROLL2];
-
-      #pragma unroll
-      for (int j = 0; j < UNROLL2; ++j) // fetch the first half-batch
-        FetchOneOrTwo64b<T, HAS_SRC1>(t0[j], src0, t1[j], src1,
-            idx + j * THREADS);
-
-      #pragma unroll
-      for (int j = 0; j < UNROLL2; ++j) // reduce/copy the first half-batch into t2
-        t2[j] = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0[j], t1[j]);
-
-      if (!NOUNROLL2) {
-        #pragma unroll
-        for (int j = 0; j < UNROLL2; ++j) // fetch the second half-batch, reusing t0/t1, before the first is stored
-          FetchOneOrTwo64b<T, HAS_SRC1>(t0[j], src0, t1[j], src1,
-              idx + (UNROLL2 + j) * THREADS);
-      }
-
-      #pragma unroll
-      for (int j = 0; j < UNROLL2; ++j) // store the first half-batch
-        StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2[j], idx + j * THREADS);
-
-      if (!NOUNROLL2) {
-        #pragma unroll
-        for (int j = 0; j < UNROLL2; ++j) // reduce/copy the second half-batch
-          t2[j] = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0[j], t1[j]);
-
-        #pragma unroll
-        for (int j = 0; j < UNROLL2; ++j) // store the second half-batch
-          StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2[j],
-              idx + (UNROLL2 + j) * THREADS);
-      }
-    }
-
-    // stage 2b: slightly less optimized for section when we don't have full
-    // UNROLLs
-    int Ndone2a = Nalign * (sizeof(PackType)/sizeof(T)); // words handled in stage 2a, converted back to T elements
-    Ndone += Ndone2a;
-    Nrem = N - Ndone;
-
-    // TODO: This kind of pointer update arithmetic is expensive.  Should
-    // probably find a better way.
-    if (Nrem > 0) {
-      dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
-      src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
-    }
-
-    Nalign = Nrem / (sizeof(PackType)/sizeof(T)); // whole PackType words still remaining
-
-    #pragma unroll 4
-    for (int idx = tid; idx < Nalign; idx += THREADS) {
-      PackType t0, t1, t2;
-
-      FetchOneOrTwo64b<T, HAS_SRC1>(t0, src0, t1, src1, idx);
-      t2 = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0, t1);
-      StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2, idx);
-    }
-
-    // stage 2c: tail
-    int Ndone2b = Nalign * (sizeof(PackType)/sizeof(T)); // words handled in stage 2b, converted back to T elements
-    Ndone += Nalign * (sizeof(PackType)/sizeof(T)); // same quantity as Ndone2b
-    Nrem = N - Ndone;
-
-    if (Nrem > 0) {
-      dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
-      src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
-    }
-
-    for (int idx = tid; idx < Nrem; idx += THREADS) {
-      // never ought to make it more than one time through this loop.  only a
-      // few threads should even participate
-      T val = vFetch(src0+idx);
-      if (HAS_SRC1) {
-        val = FUNC()(val, vFetch(src1+idx));
-      }
-
-      vStore(dest0+idx, val);
-      if (HAS_DEST1) {
-        vStore(dest1+idx, val);
-      }
-    }
-  } // done fast path
-}
-
-template<int THREADS, int UNROLL, typename T> // Splits the last chunk's elements into big slices, small slices and one final slice; *numSlices is read and then rewritten.
-__device__ inline void CalcLastChunk(int * const bigSliceN,
-    int * const smallSliceN, int * const lastSliceN, int * const numSlices,
-    int * const numBigSlices, int * const numSmallSlices, const int N,
-    const int numChunks, const int chunkSize) {
-  int Nleft = N - ((numChunks - 1) * chunkSize); // elements left after numChunks-1 chunks of chunkSize
-  // semi-equally split up the remaining work into numslices slices.
-  // it's "semi"-equal because we want the divisions to land as neatly as we
-  // can on alignable boundaries
-  int NperTile = UNROLL * THREADS * (sizeof(PackType)/sizeof(T)); // elements in UNROLL * THREADS PackType words
-  int numTiles = (Nleft + NperTile - 1) / NperTile; // rounded up
-  int numTilesPerBigSlice = (numTiles + *numSlices - 1)
-      / *numSlices; // rounded up
-  int numTilesPerSmallSlice = numTiles / *numSlices; // rounded down
-
-  *bigSliceN   = NperTile * numTilesPerBigSlice;
-  *smallSliceN = NperTile * numTilesPerSmallSlice;
-  *numBigSlices = numTiles % *numSlices;
-  *numSmallSlices = (*smallSliceN > 0) ?
-      *numSlices - *numBigSlices : 0;
-
-  // the lastSlice will take the place of one of the small slices unless
-  // there are no small slices (because this is a very small reduction), in
-  // which case we replace one of the big slices and leave the small slices
-  // as 0.
-  if (*numSmallSlices > 0) {
-    --*numSmallSlices;
-    if (*numSmallSlices == 0)
-      *smallSliceN = 0;
-  }
-  else {
-    --*numBigSlices;
-    if (*numBigSlices == 0)
-      *bigSliceN = 0;
-  }
-
-  *lastSliceN = Nleft -
-      (*numBigSlices * *bigSliceN
-          + *numSmallSlices * *smallSliceN); // whatever the big and small slices do not cover
-
-  // in cases where args.N % numSlices is pretty small, we'd rather have one
-  // slightly big last slice than one big slice, a bunch of small slices,
-  // and one smaller last slice
-  if ((*numBigSlices == 1) &&
-      (*numSmallSlices == *numSlices - 2) &&
-      (*lastSliceN < *smallSliceN)) {
-    *numBigSlices += *numSmallSlices; // every slice except the last becomes a "big" slice of the old small size
-    *numSmallSlices = 0;
-    *bigSliceN = *smallSliceN;
-    *smallSliceN = 0;
-    *lastSliceN = Nleft -
-        *numBigSlices * *bigSliceN;
-  }
-
-  // done recalculating
-  *numSlices = *numBigSlices +
-      *numSmallSlices + 1; // the +1 is the last slice
-}
-
-// Kernel launch: per-call argument bundle; ArgsSetup() fills every field except the pre-computed sizes
-template<typename T>
-struct KernelArgs {
-  // general parameters
-  int nRanks;
-  int root;
-  int buffSize;
-  int N;
-  int opIndex;
-  volatile int * __restrict__ opCounter; // incrementOpCounter() writes opIndex+1 through this pointer
-  bool pushrecv;
-
-  // some pre-computed sizes (not set by ArgsSetup)
-  int SliceSize;
-  int SliceOffset;
-  int ChunkSize;
-  int NumChunks;
-
-  // local and remote input, output, and buffer
-  const T * __restrict__ ThisInput;
-  T * __restrict__ ThisOutput;
-
-  DevRing<char>* ring;
-};
-
-template<typename T> // Fills *args from the call arguments and from fields of comm; the slice/chunk sizes are left unset.
-void ArgsSetup(KernelArgs<T> *args, const void* sendbuff, void* recvbuff,
-		const int root, const int count, ncclComm *comm) {
-  args->nRanks = comm->nRanks;
-  args->root = root;
-  args->buffSize = comm->buffSize;
-  args->N = count;
-  args->opIndex = comm->opSched;
-  args->opCounter = comm->opCounter;
-  args->ThisInput = (const T*)sendbuff; // untyped buffers are reinterpreted as arrays of T
-  args->ThisOutput = (T*)recvbuff;
-  args->ring = comm->devRing;
-  args->pushrecv = comm->globalMemSpace;
-}
-
-#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \
-		args, stream) do { \
-  dim3 grid(1, 1, 1); \
-  dim3 block(THREADS+1, 1, 1); \
-  void* argptrs[] = {&args}; \
-  CUDACHECK(cudaLaunchKernel( \
-            (void*)K<THREADS, UNROLL, FUNC, T>, \
-            grid, block, argptrs, 0, stream)); \
-} while (0) // launches K<THREADS, UNROLL, FUNC, T> on stream as one block of THREADS+1 threads with args as its only parameter
-
-template <typename T> // Publishes completion of the current operation by storing opIndex+1 through args->opCounter.
-__device__ inline void incrementOpCounter(const KernelArgs<T> *args) {
-  // increment comm's operation counts
-  __threadfence_system(); // Technically need to ensure that cleared flags
-  // are visible before incrementing op counter.
-  *args->opCounter = args->opIndex+1; // a plain store of opIndex+1, not a read-modify-write
-}
-
-template <int THREADS, typename T> __device__ __forceinline__ // Copies one DevRing from src to dst as long long words, one word per thread.
-void LoadRing(const DevRing<char>* src, DevRing<T>* dst) {
-  enum { NUM_WORDS = sizeof(DevRing<char>) / sizeof(long long) }; // number of long long words in one DevRing
-  static_assert(sizeof(DevRing<char>) % sizeof(long long) == 0, "Bad alignment");
-  static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing");
-  static_assert(sizeof(DevRing<char>) == sizeof(DevRing<T>), "DevRing size mismatch");
-  long long* lldst = reinterpret_cast<long long*>(dst);
-  const long long* llsrc = reinterpret_cast<const long long*>(src);
-  if (threadIdx.x < NUM_WORDS) { // threads with threadIdx.x >= NUM_WORDS copy nothing
-    lldst[threadIdx.x] = llsrc[threadIdx.x];
-  }
-}
-
-
-#endif // COMMON_KERNEL_H_
--- a/src/copy_kernel.h
+++ b/src/copy_kernel.h
@ -1,57 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COPY_KERNEL_H_
-#define COPY_KERNEL_H_
-
-#include "common_kernel.h"
-
-template<typename T>
-struct FuncPassA { // binary functor that returns its first argument and ignores the second
-  __device__ T operator()(const T x, const T y) const {
-    return x;
-  }
-};
-
-#ifdef CUDA_HAS_HALF
-template <>
-struct FuncPassA<half> { // half specialization: provides both a half2 and a half overload
-  __device__ half2 operator()(const half2 x, const half2 y) const {
-    return x;
-  }
-  __device__ half operator()(const half x, const half y) const {
-    half r;
-    r.x = x.x; // copies the x member rather than the whole object
-    return r;
-  }
-};
-#endif
-
-// Copies N elements from src to dest by calling ReduceOrCopy with one source and one destination. Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void Copy(volatile T * __restrict__ const dest,
-    const volatile T * __restrict__ const src, const int N) {
-  ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, false, false>(threadIdx.x, // HAS_DEST1 = false, HAS_SRC1 = false
-      dest, nullptr, src, nullptr, N);
-}
-
-// Copies N elements from src to both dest0 and dest1 by calling ReduceOrCopy with one source and two destinations. Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1,
-    const volatile T * __restrict__ const src, const int N) {
-  ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, true, false>(threadIdx.x, // HAS_DEST1 = true, HAS_SRC1 = false
-      dest0, dest1, src, nullptr, N);
-}
-
-#endif // COPY_KERNEL_H_
--- a/src/core.cu
+++ b/src/core.cu
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`libcudart ${cuda:Major}.${cuda:Minor} cuda-cudart-${cuda:Major}-${cuda:Minor}`