diff --git a/ext-net/example/Makefile b/ext-net/example/Makefile index e0a6aa6..9cc623e 100644 --- a/ext-net/example/Makefile +++ b/ext-net/example/Makefile @@ -3,15 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-net.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-net-example.so + +${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-net-example.so diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index f5cc9f1..777ff5b 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -3,14 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME := ../../build -INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO := libnccl-profiler.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c event.c print_event.c - $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-profiler-example.so + +${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-profiler-example.so diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index e3f707a..b89cd46 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "event.h" #include "print_event.h" @@ -41,22 +41,10 @@ static struct proxyOp* detachPool; ncclDebugLogger_t logFn; #define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -static double freq = -1; -__hidden void calibrate() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t timeCycles = __rdtsc(); - double time = - tv.tv_sec*1e6 - tv.tv_usec; - uint64_t total = 0ULL; - for (int i = 0; i < 10000; i++) total += __rdtsc(); - gettimeofday(&tv, NULL); - timeCycles = __rdtsc() - timeCycles; - time += tv.tv_sec*1e6 + tv.tv_usec; - freq = timeCycles / time; -} - __hidden double gettime(void) { - return __rdtsc() / freq; + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return (t.tv_sec*1e6 + (t.tv_nsec*1e-3)); } static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; @@ -98,8 +86,6 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, // process address space. pid = getpid(); - // calibrate and start timer - calibrate(); startTime = gettime(); } pthread_mutex_unlock(&lock); diff --git a/ext-tuner/basic/Makefile b/ext-tuner/basic/Makefile new file mode 100644 index 0000000..50edd23 --- /dev/null +++ b/ext-tuner/basic/Makefile @@ -0,0 +1,23 @@ +# +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . 
+NCCLDIR := $(BUILDDIR) + +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +build: ${BUILDDIR}/libnccl-tuner-basic.so + +${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ + +clean: + rm -f ${BUILDDIR}/libnccl-tuner-basic.so diff --git a/ext-tuner/basic/nccl/common.h b/ext-tuner/basic/nccl/common.h new file mode 100644 index 0000000..9129252 --- /dev/null +++ b/ext-tuner/basic/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-tuner/basic/nccl/err.h b/ext-tuner/basic/nccl/err.h new file mode 100644 index 0000000..bb92e83 --- /dev/null +++ b/ext-tuner/basic/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-tuner/basic/nccl/tuner.h b/ext-tuner/basic/nccl/tuner.h new file mode 100644 index 0000000..77b543d --- /dev/null +++ b/ext-tuner/basic/nccl/tuner.h @@ -0,0 +1,97 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include +#include + +#include "common.h" +#include "err.h" + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#define NCCL_ALGO_PROTO_IGNORE -1.0 + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. 
Each communicator initializes its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - context: tuner context object + // - collType: collective type, e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/ext-tuner/basic/plugin.c b/ext-tuner/basic/plugin.c new file mode 100644 index 0000000..a17fd00 --- /dev/null +++ b/ext-tuner/basic/plugin.c @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "tuner.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } + +__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) { + // Update NCCL core generated cost table.
Updated table will be evaluated by NCCL to pick the best algo/proto combo + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + } + *nChannels = 1; + return ncclSuccess; +} + +__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } + +#define PLUGIN_NAME "Basic" + +const ncclTuner_v4_t ncclTunerPlugin_v4 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .getCollInfo = pluginGetCollInfo, + .destroy = pluginDestroy +}; diff --git a/ext-tuner/example/Makefile b/ext-tuner/example/Makefile index 9d9ace4..76c16b6 100644 --- a/ext-tuner/example/Makefile +++ b/ext-tuner/example/Makefile @@ -3,15 +3,53 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-tuner.so -default: $(PLUGIN_SO) +.DEFAULT_GOAL: build +PLUGIN_SO:=libnccl-tuner-example.so +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +default: ${BUILDDIR}/$(PLUGIN_SO) + +build: ${BUILDDIR}/$(PLUGIN_SO) + +${BUILDDIR}/$(PLUGIN_SO): plugin.c + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +# Test targets - delegate to test directory +test: + $(MAKE) -C test test TEST_CASE=$(TEST_CASE) + +test-verbose: + $(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE) + +# Build tests +test-build: + $(MAKE) -C test all + +# Optimize configurations from performance data +optimize-config: + @if [ -z "$(CSV_FILE)" ]; then \ + echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \ + echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \ + exit 1; \ + fi + python3 scripts/optimize_config.py $(CSV_FILE) \ + $(if $(OUTPUT),-o $(OUTPUT)) \ + $(if $(METRIC),-m $(METRIC)) \ + $(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \ + $(if $(DRY_RUN),--dry-run) \ + $(if $(NO_HEADER),--no-header) clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/$(PLUGIN_SO) + $(MAKE) -C test clean + +.PHONY: test test-verbose test-build optimize-config clean diff --git a/ext-tuner/example/README.md b/ext-tuner/example/README.md new file mode 100644 index 0000000..7f472ae --- /dev/null +++ b/ext-tuner/example/README.md @@ -0,0 +1,164 @@ +# NCCL Example Tuner Plugin + +This example plugin shows a practical example of a CSV file-based tuning approach, allowing selective overrides for tuning parameters based on all tuning inputs without recompiling. + +## Features + +- **File-based Configuration**: Read tuning parameters from a CSV configuration file +- **Size-based Tuning**: Specify different configurations based on message size ranges +- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks +- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default +- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE` +- **Fallback Behavior**: Gracefully handles missing config files and invalid entries + +## Building + +```bash +make +``` + +This will create `libnccl-tuner-example.so` that can be loaded by NCCL. 
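+ +For reference, a minimal build sketch (assumptions: the plugin is built from inside the NCCL source tree so that `../../makefiles/common.mk` resolves, and `BUILDDIR` is the variable defined in the Makefile above): + +```bash +# Build in place (default BUILDDIR is the current directory) +make + +# Or stage the shared object in a separate directory +make BUILDDIR=$PWD/build   # produces build/libnccl-tuner-example.so +```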
+ +## Configuration File Format + +The configuration file uses CSV (Comma-Separated Values) format with one configuration per line: + +``` +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +``` + +### Parameters + +- **collective_type**: The collective operation type + - `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce` + +- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies + - Use `0` for minimum and `4294967295` for maximum (covers all sizes) + +- **algorithm**: The NCCL algorithm to use + - `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat` + +- **protocol**: The NCCL protocol to use + - `ll`, `ll128`, `simple` + +- **channels**: Number of channels (SMs) to use + - Use a positive integer to specify exact channel count + - Use `-1` to keep NCCL's default channel selection + +- **nNodes**: Number of nodes to match + - Use a positive integer to match specific node count + - Use `-1` to match any number of nodes + +- **nRanks**: Number of ranks to match + - Use a positive integer to match specific rank count + - Use `-1` to match any number of ranks + +- **numPipeOps**: Number of pipeline operations to match (optional) + - Use a positive integer to match specific pipeline operation count + - Use `-1` to match any number of pipeline operations + - If omitted, configuration will match any numPipeOps value + +- **regBuff**: Whether user buffer can be registered (optional) + - Use `0` to match only non-registered buffers + - Use `1` to match only registered buffers + - Use `-1` to match either registered or non-registered buffers + - If omitted, configuration will match any regBuff value + +### Example Configuration + +```csv +# Single-node, small allreduce: use tree algorithm, registered buffers only +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Single-node broadcast: prefer tree, any pipeOps, registered buffers (backward compatible) +broadcast,0,32768,tree,simple,-1,1,-1 + +# Multi-node broadcast: optimized for non-registered buffers, single pipeline op +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 +``` + +Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc. + +### Backward Compatibility + +Configurations without the numPipeOps and/or regBuff parameters are fully supported: +- 8 fields: matches any numPipeOps and regBuff values +- 9 fields: matches any regBuff value +- 10 fields: full parameter specification + +This ensures existing configuration files continue to work without modification. + +## Usage + +### Method 1: Default Config File +Place your configuration in `nccl_tuner.conf` in the current working directory. + +### Method 2: Environment Variable +Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf +export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH +mpirun -np 4 your_nccl_application +``` + +## Editing Configuration Files + +### Generating Configuration Files from Raw Data + +A python script to generate valid CSV configs has been provided. 
[Using optimize_config.py](scripts/README.md). + +### Spreadsheet Tips: +- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff` +- Save as CSV format (not Excel format) for the plugin to read +- Use data validation to prevent typos in algorithm/protocol names + +## Logging + +The plugin uses NCCL's logging system. To see tuner-related messages: + +```bash +export NCCL_DEBUG=INFO +``` + +This will show when configurations are loaded and applied, including the topology information. + +For detailed debugging output during tuning decisions: + +```bash +export NCCL_DEBUG=TRACE +``` + +This will show verbose information about which configurations are being evaluated and matched. + +## Dimension Matching + +Configurations are only applied when the topology matches: + +- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks +- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks +- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node +- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology + +This allows you to create specialized configurations for different cluster setups while maintaining flexibility. + +## Default Behavior + +If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic. + +When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size. + +## Troubleshooting + +1. **Config file not found**: Check the file path and permissions +2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters +3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory +4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO` +5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards +6. 
**CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces diff --git a/ext-tuner/example/nccl_tuner.conf b/ext-tuner/example/nccl_tuner.conf new file mode 100644 index 0000000..13eb2f0 --- /dev/null +++ b/ext-tuner/example/nccl_tuner.conf @@ -0,0 +1,45 @@ +# NCCL Tuner Configuration File (CSV Format) +# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +# +# Collective types: broadcast, reduce, allgather, reducescatter, allreduce +# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat +# Protocols: ll, ll128, simple +# Channels: number of channels to use, or -1 to keep default +# nNodes: number of nodes to match, or -1 for any number of nodes +# nRanks: number of ranks to match, or -1 for any number of ranks +# numPipeOps: number of pipeline operations to match, or -1 for any number (optional) +# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional) +# +# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value +# +# Examples: + +# For single-node configurations with registered buffers +# Small allreduce operations on single node - use tree algorithm, registered buffers +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers +# Medium allreduce operations - use ring algorithm +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types +# Single node broadcast - prefer tree, any pipeOps, registered buffers only +broadcast,0,32768,tree,simple,-1,1,-1,-1,1 + +# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 + +# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type +allgather,0,4294967295,ring,simple,4,2,-1 + +# ReduceScatter operations +# Small messages on single node, single pipeline op, registered buffers +reducescatter,0,131072,tree,simple,2,1,-1,1,1 +# Large messages on any topology, multiple pipeline ops, non-registered buffers +reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0 + +# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type +reduce,0,4294967295,tree,simple,-1,-1,-1 diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 7925dcf..1b8031e 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -5,24 +5,443 @@ ************************************************************************/ #include "tuner.h" +#include +#include +#include #define __hidden __attribute__ ((visibility("hidden"))) +#define MAX_LINE_LENGTH 256 -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } +// CSV field indices for configuration parsing +// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +#define CONFIG_FIELD_COLLTYPE 0 +#define CONFIG_FIELD_MINBYTES 1 +#define CONFIG_FIELD_MAXBYTES 2 +#define CONFIG_FIELD_ALGORITHM 3 +#define CONFIG_FIELD_PROTOCOL 4 +#define CONFIG_FIELD_CHANNELS 5 +#define CONFIG_FIELD_NNODES 6 +#define CONFIG_FIELD_NRANKS 
7 +#define CONFIG_FIELD_PIPEOPS 8 // Optional field +#define CONFIG_FIELD_REGBUFF 9 // Optional field + +// Field count constants +#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks) +#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps +#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff +#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported + +typedef struct { + ncclFunc_t collType; + size_t minBytes; + size_t maxBytes; + int algorithm; + int protocol; + int nChannels; + int nNodes; + int nRanks; + int numPipeOps; + int regBuff; +} TuningConfig; + +typedef struct { + TuningConfig* configs; // Changed from static array to dynamic pointer + int numConfigs; + int maxConfigs; // Added to track allocated size + size_t nRanks; + size_t nNodes; + ncclDebugLogger_t logFunction; +} TunerContext; + +// Parse collective type from string +static ncclFunc_t parseCollType(const char* str) { + if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast; + if (strcmp(str, "reduce") == 0) return ncclFuncReduce; + if (strcmp(str, "allgather") == 0) return ncclFuncAllGather; + if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter; + if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce; + return ncclFuncAllReduce; // default +} + +// Convert collective type to string +static const char* collTypeToString(ncclFunc_t collType) { + switch (collType) { + case ncclFuncBroadcast: return "broadcast"; + case ncclFuncReduce: return "reduce"; + case ncclFuncAllGather: return "allgather"; + case ncclFuncReduceScatter: return "reducescatter"; + case ncclFuncAllReduce: return "allreduce"; + default: return "unknown"; + } +} + +// Parse algorithm from string +static int parseAlgorithm(const char* str) { + if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE; + if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING; + if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT; + if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN; + if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS; + if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE; + if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT; + return NCCL_ALGO_RING; // default +} + +// Convert algorithm to string +static const char* algorithmToString(int algorithm) { + switch (algorithm) { + case NCCL_ALGO_TREE: return "tree"; + case NCCL_ALGO_RING: return "ring"; + case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct"; + case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain"; + case NCCL_ALGO_NVLS: return "nvls"; + case NCCL_ALGO_NVLS_TREE: return "nvls_tree"; + case NCCL_ALGO_PAT: return "pat"; + default: return "unknown"; + } +} + +// Parse protocol from string +static int parseProtocol(const char* str) { + if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL; + if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128; + if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE; + return NCCL_PROTO_SIMPLE; // default +} + +// Convert protocol to string +static const char* protocolToString(int protocol) { + switch (protocol) { + case NCCL_PROTO_LL: return "ll"; + case NCCL_PROTO_LL128: return "ll128"; + case NCCL_PROTO_SIMPLE: return "simple"; + default: return "unknown"; + } +} + +// Helper function to count valid configuration lines in file +static int countConfigLines(const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + return 0; + } + + char line[MAX_LINE_LENGTH]; + int count = 0; + + while (fgets(line, 
sizeof(line), file)) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Check if line has content + if (strlen(line) > 0) { + count++; + } + } + + fclose(file); + return count; +} + +// Load configuration from file +static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config file %s not found, using defaults", filename); + } + return ncclSuccess; // Not finding config file is not an error + } + + // First pass: count valid configuration lines + int configCount = countConfigLines(filename); + if (configCount == 0) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No valid configurations found in %s", filename); + } + fclose(file); + return ncclSuccess; + } + + // Allocate memory for configurations based on actual count + ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig)); + if (!ctx->configs) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount); + } + fclose(file); + return ncclSystemError; + } + + ctx->maxConfigs = configCount; + ctx->numConfigs = 0; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount); + } + + // Reset file pointer to beginning + fseek(file, 0, SEEK_SET); + + char line[MAX_LINE_LENGTH]; + + while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + char* token; + char* tokens[CONFIG_FIELDS_MAX]; + int tokenCount = 0; + + // Make a copy of the line for tokenizing + char lineCopy[MAX_LINE_LENGTH]; + strncpy(lineCopy, line, sizeof(lineCopy)); + lineCopy[sizeof(lineCopy) - 1] = '\0'; + + // Tokenize by comma + token = strtok(lineCopy, ","); + while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) { + // Trim whitespace + while (*token == ' ' || *token == '\t') token++; + char* end = token + strlen(token) - 1; + while (end > token && (*end == ' ' || *end == '\t')) { + *end = '\0'; + end--; + } + tokens[tokenCount++] = token; + token = strtok(NULL, ","); + } + + // Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10) + if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) { + TuningConfig* config = &ctx->configs[ctx->numConfigs]; + config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]); + config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10); + config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10); + config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]); + config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]); + config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]); + config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]); + config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]); + + // numPipeOps is optional (9th field, index 8) + if 
(tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) { + config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]); + } else { + config->numPipeOps = -1; // -1 means match any numPipeOps + } + + // regBuff is optional (10th field, index 9) + if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) { + config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]); + } else { + config->regBuff = -1; // -1 means match any regBuff value + } + + ctx->numConfigs++; + + if (ctx->logFunction) { + if (config->numPipeOps == -1 && config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks); + } else if (config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps); + } else if (config->numPipeOps == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->regBuff); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + } + } + } + + fclose(file); + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename); + } + return ncclSuccess; +} + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); + if (!ctx) return ncclSystemError; + + ctx->configs = NULL; // Initialize to NULL + ctx->numConfigs = 0; + ctx->maxConfigs = 0; // Initialize to 0 + ctx->nRanks = nRanks; + ctx->nNodes = nNodes; + ctx->logFunction = logFunction; + + if (logFunction) { + logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks); + } + + // Try to load config file from environment variable or default location + const char* configFile = getenv("NCCL_TUNER_CONFIG_FILE"); + if (!configFile) { + configFile = "nccl_tuner.conf"; // default config file name + } + + ncclResult_t result = loadConfig(ctx, configFile); + if (result != ncclSuccess) { + if (ctx->configs) { + free(ctx->configs); // Clean up allocated memory on error + } + free(ctx); + return result; + } + + *context = ctx; + return ncclSuccess; +} __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t 
nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff, int* nChannels) { - // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { - table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; - } + TunerContext* ctx = (TunerContext*)context; + if (!ctx) return ncclInternalError; + + // Default channels *nChannels = 1; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d", + collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs); + } + + // Look for matching configuration + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d", + i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + + // Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff + if (config->collType == collType && + nBytes >= config->minBytes && + nBytes <= config->maxBytes && + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) && + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) && + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps) && + (config->regBuff == -1 || config->regBuff == regBuff)) { + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config matches. 
Applying algo=%s, proto=%s, channels=%d", + algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels); + } + + // Check bounds + if (config->algorithm < numAlgo && config->protocol < numProto) { + if (collCostTable[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)", + algorithmToString(config->algorithm), protocolToString(config->protocol), + &collCostTable[config->algorithm][config->protocol], collCostTable[config->algorithm][config->protocol]); + } + collCostTable[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration + + // Only override channels if not set to -1 (keep default) + if (config->nChannels != -1) { + *nChannels = config->nChannels; + } + + if (ctx->logFunction) { + if (config->nChannels == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nChannels, config->nNodes, config->nRanks); + } + } + return ncclSuccess; + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE", + algorithmToString(config->algorithm), protocolToString(config->protocol)); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)", + algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d", + config->collType == collType, + (nBytes >= config->minBytes && nBytes <= config->maxBytes), + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes), + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks), + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps), + (config->regBuff == -1 || config->regBuff == regBuff)); + } + } + } + + // If no specific config found, apply default behavior + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No matching config found"); + } + return ncclSuccess; } -__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } +__hidden ncclResult_t pluginDestroy(void* context) { + if (context) { + TunerContext* ctx = (TunerContext*)context; + if (ctx->configs) { + free(ctx->configs); // Free dynamically allocated configs array + } + free(context); + } + return 
ncclSuccess; +} #define PLUGIN_NAME "Example" diff --git a/ext-tuner/example/scripts/README.md b/ext-tuner/example/scripts/README.md new file mode 100644 index 0000000..d31de43 --- /dev/null +++ b/ext-tuner/example/scripts/README.md @@ -0,0 +1,106 @@ +# NCCL Tuner Configuration Scripts + +This directory contains scripts for optimizing NCCL tuner configurations based on performance data. + +## optimize_config.py + +A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations. + +### Usage + +```bash +python scripts/optimize_config.py [options] +``` + +### Options + +- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`) +- `-m, --metric METRIC`: Optimization metric, either `bandwidth_gbps` or `latency_us` (default: `latency_us`) +- `--no-header`: Don't add header comments to output file +- `--dry-run`: Print configurations without writing to file + +### CSV Input Format + +The input CSV file should have the following columns: + +```csv +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +``` + +**Required columns:** +- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.) +- `size_bytes`: Message size in bytes +- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.) +- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`) +- `channels`: Number of channels (or `-1` for default) +- `nodes`: Number of nodes (or `-1` for any) +- `ranks`: Number of ranks (or `-1` for any) +- `pipeOps`: Number of pipeline operations (or `-1` for any) +- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any) + +**Optional metrics (must have at least one present):** +- `bandwidth_gbps`: Bandwidth in GB/s (higher is better) +- `latency_us`: Latency in microseconds (lower is better) + +### Examples + +**Basic usage with the default metric (latency):** +```bash +python scripts/optimize_config.py sample_performance_data.csv +``` + +**Optimize for bandwidth and write to custom file:** +```bash +python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv +``` + +**Preview configurations without writing:** +```bash +python scripts/optimize_config.py --dry-run performance_data.csv +``` + +### How It Works + +1. **Data Loading**: Reads CSV performance data and validates format +2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters +3. **Size Ranges**: Automatically bins data into size ranges for optimization +4. **Optimization**: Finds the best performing configuration for each group/size combination +5. **Output**: Generates NCCL tuner config format and appends to specified file + +### Default Size Ranges + +The script uses these default size ranges (in bytes): +- Small: 0 - 1,024 +- Medium: 1,025 - 65,536 +- Large: 65,537 - 1,048,576 +- XLarge: 1,048,577 - 16,777,216 +- XXLarge: 16,777,217 - 4,294,967,295 + +### Sample Data + +See `sample_performance_data.csv` for an example of the expected input format. + +### Integration with NCCL + +The generated configuration file can be used directly with the NCCL tuner plugin: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf +export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner-example.so +mpirun -np 8 your_nccl_application +``` + +### Performance Data Collection + +To collect performance data for optimization, you can: + +1. **Use NCCL benchmarks** with different algorithm/protocol combinations +2. **Profile your applications** with various tuner settings +3.
**Run systematic sweeps** across parameter combinations +4. **Use NCCL debug output** to collect timing information + +The key is to have comprehensive data covering: +- Different message sizes (small to large) +- Various topologies (single node, multi-node) +- All relevant algorithm/protocol combinations +- Different channel counts and pipeline configurations diff --git a/ext-tuner/example/scripts/optimize_config.py b/ext-tuner/example/scripts/optimize_config.py new file mode 100644 index 0000000..c5c9b70 --- /dev/null +++ b/ext-tuner/example/scripts/optimize_config.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +""" +NCCL Tuner Configuration Optimizer + +Reads a CSV file containing performance data across different tuning parameters +and generates optimal NCCL tuner configurations based on the best performing +combinations. + +By default, creates growing size ranges that interpolate between the actual data sizes +for each unique dimension (node count, rank count combination). This ensures that +different cluster configurations get their own optimized size boundaries, as +performance characteristics often vary significantly between topologies. + +Each dimension gets its own set of ranges starting from 0 and extending to the maximum +size for that dimension, with boundaries at midpoints between consecutive data sizes. + +CSV Input Format: +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us + +Output Format (NCCL Tuner Config): +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + +Usage Examples: + # Auto-create dimension-specific interpolated ranges (default) + python3 optimize_config.py data.csv + + # Use custom size ranges (applied to all topologies) + python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576" + + # Use hardcoded default ranges (applied to all topologies) + python3 optimize_config.py data.csv --no-auto-ranges +""" + +import csv +import argparse +import sys +import os +from collections import defaultdict +from typing import Dict, List, Tuple, Any + +class PerformanceData: + def __init__(self, row: Dict[str, str]): + self.collective = row['collective'] + self.size_bytes = int(row['size_bytes']) + self.algorithm = row['algorithm'] + self.protocol = row['protocol'] + self.channels = int(row['channels']) if row['channels'] != '-1' else -1 + self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1 + self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1 + self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1 + self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1 + + # Performance metrics + self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0)) # Higher is better + self.latency_us = float(row.get('latency_us', 0)) # Lower is better + + def get_config_key(self) -> Tuple: + """Generate a key for grouping similar configurations""" + return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff) + + def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]: + """Find which size range this data point belongs to for its dimension""" + topology_key = (self.nodes, self.ranks) + + # Get size ranges for this dimension, or fall back to default + if topology_key in topology_size_ranges: + size_ranges = topology_size_ranges[topology_key] + elif (-1, -1) in topology_size_ranges: + size_ranges = topology_size_ranges[(-1, -1)] + else: + # Fallback to 
first available dimension ranges + size_ranges = next(iter(topology_size_ranges.values())) + + for min_size, max_size in size_ranges: + if min_size <= self.size_bytes <= max_size: + return (min_size, max_size) + # If no range found, create a single-point range + return (self.size_bytes, self.size_bytes) + +class ConfigOptimizer: + def __init__(self, optimization_metric: str = 'latency_us'): + self.optimization_metric = optimization_metric + # Default size ranges - will be overridden by auto-detection + self.size_ranges = [ + (0, 1024), + (1025, 64*1024), + (64*1024+1, 1024*1024), + (1024*1024+1, 16*1024*1024), + (16*1024*1024+1, 4*1024*1024*1024-1) + ] + self.auto_size_ranges = True + + def set_size_ranges(self, ranges: List[Tuple[int, int]]): + """Set custom size ranges for optimization""" + self.size_ranges = ranges + self.auto_size_ranges = False + + def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]: + """Create growing size ranges for each unique (nodes, ranks) dimension""" + if not data: + return {(-1, -1): self.size_ranges} + + # Group data by dimension (nodes, ranks) + topology_data = defaultdict(list) + for item in data: + topology_key = (item.nodes, item.ranks) + topology_data[topology_key].append(item) + + topology_ranges = {} + + for topology_key, items in topology_data.items(): + nodes, ranks = topology_key + + # Extract unique sizes for this dimension and sort them + unique_sizes = sorted(set(item.size_bytes for item in items)) + + if len(unique_sizes) <= 1: + # Only one size, create a single range from 0 to that size + size = unique_sizes[0] if unique_sizes else 0 + ranges = [(0, size)] + else: + # Create growing ranges that interpolate between data points + ranges = [] + + for i, size in enumerate(unique_sizes): + if i == 0: + # First range: 0 to midpoint between first and second size + if len(unique_sizes) > 1: + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + else: + max_size = size + min_size = 0 + elif i == len(unique_sizes) - 1: + # Last range: previous max + 1 to current size (and beyond) + min_size = ranges[-1][1] + 1 + max_size = size + else: + # Intermediate ranges: previous max + 1 to midpoint with next size + min_size = ranges[-1][1] + 1 + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + + ranges.append((min_size, max_size)) + + topology_ranges[topology_key] = ranges + + print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:") + for i, (min_size, max_size) in enumerate(ranges): + # Count data points that fall in this range for this dimension + count = sum(1 for item in items if min_size <= item.size_bytes <= max_size) + actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size)) + if actual_sizes: + size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3]) + if len(actual_sizes) > 3: + size_list += f", ... 
(+{len(actual_sizes)-3} more)" + print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})") + + return topology_ranges + + def load_data(self, csv_file: str) -> List[PerformanceData]: + """Load performance data from CSV file""" + data = [] + try: + with open(csv_file, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + try: + data.append(PerformanceData(row)) + except (ValueError, KeyError) as e: + print(f"Warning: Skipping invalid row: {row} - {e}") + except FileNotFoundError: + print(f"Error: File {csv_file} not found") + sys.exit(1) + except Exception as e: + print(f"Error reading {csv_file}: {e}") + sys.exit(1) + + print(f"Loaded {len(data)} performance data points") + + # Auto-determine size ranges if enabled + if self.auto_size_ranges and data: + self.topology_size_ranges = self.auto_determine_size_ranges(data) + else: + # Use default ranges for all topologies + self.topology_size_ranges = {(-1, -1): self.size_ranges} + + return data + + def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool: + """Determine if new_data is better than current_best""" + if self.optimization_metric == 'bandwidth_gbps': + return new_data.bandwidth_gbps > current_best.bandwidth_gbps + elif self.optimization_metric == 'latency_us': + return new_data.latency_us < current_best.latency_us + else: + # Default to latency + return new_data.latency_us < current_best.latency_us + + def optimize_configurations(self, data: List[PerformanceData]) -> List[str]: + """Find optimal configurations and return as NCCL config strings""" + # Group data by configuration key and size range + grouped_data = defaultdict(lambda: defaultdict(list)) + + for item in data: + config_key = item.get_config_key() + size_range = item.get_size_range_key(self.topology_size_ranges) + grouped_data[config_key][size_range].append(item) + + # Store optimal configurations before combining ranges + optimal_configs = [] + + for config_key, size_ranges_dict in grouped_data.items(): + collective, nodes, ranks, pipeOps, regBuff = config_key + + for (min_size, max_size), items in size_ranges_dict.items(): + if not items: + continue + + # Find the best performing configuration for this size range + best_item = items[0] + for item in items[1:]: + if self.is_better(item, best_item): + best_item = item + + # Store the optimal configuration with its range + optimal_configs.append({ + 'collective': collective, + 'min_size': min_size, + 'max_size': max_size, + 'algorithm': best_item.algorithm, + 'protocol': best_item.protocol, + 'channels': best_item.channels, + 'nodes': best_item.nodes, + 'ranks': best_item.ranks, + 'pipeOps': best_item.pipeOps, + 'regBuff': best_item.regBuff, + 'metric_value': getattr(best_item, self.optimization_metric) + }) + + # Combine sequential ranges with identical tunings + combined_configs = self.combine_sequential_ranges(optimal_configs) + + # Generate config strings + configs = [] + for config in combined_configs: + config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}" + configs.append(config_str) + + print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: " + f"{config['algorithm']}/{config['protocol']} channels={config['channels']} " + f"({self.optimization_metric}={config['metric_value']:.3f})") + + 
return configs + + def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]: + """Combine sequential ranges that have identical tuning parameters""" + if not configs: + return configs + + # Group by collective and topology (nodes, ranks) + topology_groups = defaultdict(list) + for config in configs: + topology_key = (config['collective'], config['nodes'], config['ranks'], + config['pipeOps'], config['regBuff']) + topology_groups[topology_key].append(config) + + combined_configs = [] + + for topology_key, topology_configs in topology_groups.items(): + # Sort by min_size to ensure proper ordering + topology_configs.sort(key=lambda x: x['min_size']) + + # Group by tuning parameters (algorithm, protocol, channels) + tuning_groups = defaultdict(list) + for config in topology_configs: + tuning_key = (config['algorithm'], config['protocol'], config['channels']) + tuning_groups[tuning_key].append(config) + + # For each tuning group, combine sequential ranges + for tuning_key, tuning_configs in tuning_groups.items(): + if not tuning_configs: + continue + + # Sort by min_size + tuning_configs.sort(key=lambda x: x['min_size']) + + # Combine sequential ranges + current_config = tuning_configs[0].copy() + + for next_config in tuning_configs[1:]: + # Check if ranges are adjacent or overlapping + if current_config['max_size'] + 1 >= next_config['min_size']: + # Extend the current range + current_config['max_size'] = max(current_config['max_size'], next_config['max_size']) + # Update metric value to the better one + if self.optimization_metric == 'bandwidth_gbps': + if next_config['metric_value'] > current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: # latency_us or default + if next_config['metric_value'] < current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: + # Gap between ranges, save current and start new one + combined_configs.append(current_config) + current_config = next_config.copy() + + # Add the last configuration + combined_configs.append(current_config) + + # Sort final configs by collective, nodes, ranks, then min_size + combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size'])) + + original_count = len(configs) + combined_count = len(combined_configs) + if combined_count < original_count: + print(f"Combined {original_count} ranges into {combined_count} ranges " + f"(reduced by {original_count - combined_count})") + + return combined_configs + + def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True): + """Append optimized configurations to NCCL tuner config file""" + try: + # Create directory if it doesn't exist + config_dir = os.path.dirname(config_file) + if config_dir and not os.path.exists(config_dir): + os.makedirs(config_dir) + print(f"Created directory: {config_dir}") + + # Check if file exists and has content + file_exists = os.path.exists(config_file) + add_separator = False + + if file_exists: + with open(config_file, 'r') as f: + content = f.read().strip() + add_separator = len(content) > 0 + print(f"Appending to existing file: {config_file}") + else: + print(f"Creating new file: {config_file}") + + with open(config_file, 'a') as f: + if add_separator: + f.write("\n\n") + + if add_header: + f.write(f"# Optimized configurations generated by optimize_config.py\n") + f.write(f"# Optimization metric: {self.optimization_metric}\n") + f.write(f"# Format: 
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n") + + for config in configs: + f.write(f"{config}\n") + + if file_exists: + print(f"Appended {len(configs)} optimized configurations to {config_file}") + else: + print(f"Created {config_file} with {len(configs)} optimized configurations") + + except PermissionError: + print(f"Error: Permission denied writing to {config_file}") + print("Try running with appropriate permissions or choose a different output location") + sys.exit(1) + except OSError as e: + print(f"Error: Cannot create/write to {config_file}: {e}") + print("Check that the path is valid and you have write permissions") + sys.exit(1) + except Exception as e: + print(f"Unexpected error writing to {config_file}: {e}") + sys.exit(1) + +def main(): + parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data") + parser.add_argument("csv_file", help="Input CSV file with performance data") + parser.add_argument("-o", "--output", default="nccl_tuner.conf", + help="Output NCCL tuner config file (default: nccl_tuner.conf)") + parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'], + default='latency_us', help="Optimization metric (default: latency_us)") + parser.add_argument("--no-header", action="store_true", + help="Don't add header comments to output file") + parser.add_argument("--dry-run", action="store_true", + help="Print configurations without writing to file") + parser.add_argument("--no-auto-ranges", action="store_true", + help="Disable automatic size range determination (use default ranges)") + parser.add_argument("--size-ranges", type=str, + help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'") + + args = parser.parse_args() + + optimizer = ConfigOptimizer(args.metric) + + # Handle size range configuration + if args.size_ranges: + # Parse custom size ranges + try: + ranges = [] + for range_str in args.size_ranges.split(','): + min_size, max_size = map(int, range_str.split('-')) + ranges.append((min_size, max_size)) + optimizer.set_size_ranges(ranges) + print(f"Using custom size ranges: {ranges}") + except ValueError: + print("Error: Invalid size ranges format. 
Use 'min1-max1,min2-max2,...'") + sys.exit(1) + elif args.no_auto_ranges: + # Disable auto-ranging + optimizer.auto_size_ranges = False + print("Using default hardcoded size ranges") + else: + # Auto-ranging is enabled by default - creates one bucket per unique size + optimizer.auto_size_ranges = True + print("Auto-ranging enabled: will create one bucket per unique size in data") + + # Load and optimize data + data = optimizer.load_data(args.csv_file) + if not data: + print("No valid data found in CSV file") + sys.exit(1) + + configs = optimizer.optimize_configurations(data) + + if args.dry_run: + print("\nGenerated configurations:") + for config in configs: + print(config) + else: + optimizer.append_to_config_file(configs, args.output, not args.no_header) + +if __name__ == "__main__": + main() diff --git a/ext-tuner/example/scripts/sample_performance_data.csv b/ext-tuner/example/scripts/sample_performance_data.csv new file mode 100644 index 0000000..7b96403 --- /dev/null +++ b/ext-tuner/example/scripts/sample_performance_data.csv @@ -0,0 +1,24 @@ +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5 +allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8 +allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2 +allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1 +allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3 +allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4 +allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2 +allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1 +allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7 +allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1 +broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2 +broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1 +broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5 +broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2 +broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1 +broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3 +allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4 +allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6 +allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1 +allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3 +allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2 +allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1 +allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3 diff --git a/ext-tuner/example/test/Makefile b/ext-tuner/example/test/Makefile new file mode 100644 index 0000000..d675cbe --- /dev/null +++ b/ext-tuner/example/test/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for NCCL Tuner Plugin Unit Tests +# + +CC := gcc +CFLAGS := -Wall -Wextra -g -std=c99 -fPIC +INC := -I. 
-I../nccl
+TARGET := test_plugin
+SOURCES := test_plugin.c
+
+# Default target
+all: $(TARGET)
+
+# Build the test executable
+$(TARGET): $(SOURCES)
+	$(CC) $(CFLAGS) $(INC) -o $(TARGET) $(SOURCES)
+
+# Run the tests
+test: $(TARGET)
+	./$(TARGET) $(TEST_CASE)
+
+# Run tests with verbose output
+test-verbose: $(TARGET)
+	NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)
+
+# Clean build artifacts
+clean:
+	rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf
+
+.PHONY: all test test-verbose clean
diff --git a/ext-tuner/example/test/README.md b/ext-tuner/example/test/README.md
new file mode 100644
index 0000000..8203c65
--- /dev/null
+++ b/ext-tuner/example/test/README.md
@@ -0,0 +1,205 @@
+# NCCL Tuner Plugin Unit Tests
+
+This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
+
+## Test Structure
+
+```
+test/
+├── test_plugin.c   # Main unit test file
+├── Makefile        # Build system for tests
+└── README.md       # This file
+```
+
+## Building and Running Tests
+
+### Quick Start
+
+```bash
+# Build and run all tests
+make test
+
+# Or step by step
+make            # Build test executable
+./test_plugin   # Run tests
+```
+
+### Advanced Testing
+
+```bash
+# Run with memory leak detection (requires valgrind)
+make test-memory
+
+# Run with verbose logging
+make test-verbose
+
+# Generate code coverage report (requires gcov)
+make coverage
+
+# Create sample test configuration files
+make test-configs
+```
+
+## Test Coverage
+
+The unit tests cover the following functionality:
+
+### 1. **Plugin Initialization (`test_plugin_init`)**
+- Tests successful plugin initialization
+- Verifies context allocation
+- Tests cleanup on destroy
+
+### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
+- Valid CSV format parsing
+- Comment and empty line handling
+- Invalid format graceful handling
+- Environment variable configuration
+
+### 3. **Collective Type Matching (`test_collective_matching`)**
+- Correct matching of allreduce, broadcast, etc.
+- Algorithm/protocol selection
+- Channel configuration
+
+### 4. **Size Range Matching (`test_size_matching`)**
+- Small, medium, large message size handling
+- Proper range boundary checking
+- Multiple size-based configurations
+
+### 5. **Topology Matching (`test_topology_matching`)**
+- Single-node vs multi-node configurations
+- Exact nNodes/nRanks matching
+- Wildcard matching (-1 values)
+
+### 6. **Default Channels (`test_default_channels`)**
+- Proper handling of -1 channel specification
+- Preservation of NCCL default behavior
+
+### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
+- Configurations based on regBuff parameter
+- Registered vs non-registered buffer handling
+- Backward compatibility with configs missing regBuff
+
+### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
+- Configurations based on numPipeOps parameter
+- Single vs multiple pipeline operation handling
+- Backward compatibility with configs missing numPipeOps
+
+### 9. **Fallback Behavior (`test_no_match_fallback`)**
+- Default behavior when no config matches
+- Ring/Simple algorithm fallback
+
+## Test Output
+
+Successful test run:
+```
+Running NCCL Tuner Plugin Unit Tests
+=====================================
+PASS: test_plugin_init
+PASS: test_config_parsing_valid
+PASS: test_config_parsing_invalid
+PASS: test_collective_matching
+PASS: test_size_matching
+PASS: test_topology_matching
+PASS: test_default_channels
+PASS: test_regbuff_matching
+PASS: test_pipeops_matching
+PASS: test_no_match_fallback
+PASS: test_large_config
+PASS: test_very_large_config_stress
+PASS: test_empty_config
+
+=====================================
+Test Results: 13/13 tests passed
+All tests PASSED!
+```
+
+Failed test example:
+```
+FAIL: test_collective_matching - Tree/Simple should have low cost
+Test Results: 12/13 tests passed
+Some tests FAILED!
+```
+
+## Mock NCCL Implementation
+
+The tests use the actual NCCL header files from the `../nccl/` directory:
+
+- `tuner.h` - Complete NCCL tuner interface and type definitions
+- `common.h` - Common NCCL types and logging functions
+- `err.h` - NCCL error codes
+
+This allows testing with the real NCCL interface definitions while still being able to run the tests without a full NCCL library installation.
+
+## Integration with CI/CD
+
+```bash
+# Install tests for CI/CD pipeline
+make install-test
+
+# Run as part of automated testing
+make test && echo "Tests passed" || echo "Tests failed"
+```
+
+## Memory Testing
+
+The tests can be run with valgrind for memory leak detection:
+
+```bash
+make test-memory
+```
+
+This will detect:
+- Memory leaks
+- Invalid memory access
+- Use of uninitialized memory
+
+## Code Coverage
+
+Generate code coverage reports to ensure comprehensive testing:
+
+```bash
+make coverage
+# Creates test_plugin.c.gcov with line-by-line coverage
+```
+
+## Adding New Tests
+
+To add a new test:
+
+1. Create a new test function in `test_plugin.c`:
+```c
+int test_new_feature(void) {
+  // Test setup
+  TEST_ASSERT(condition, "description");
+  // Test cleanup
+  TEST_PASS();
+}
+```
+
+2. Register the test in the `test_cases[]` table (before the `{NULL, NULL, NULL}` end marker) so it runs with `make test` and can be selected by name:
+```c
+{"new-feature", test_new_feature, "New feature description"},
+```
+
+3. Rebuild and run:
+```bash
+make test
+```
+
+## Debugging Tests
+
+For debugging failed tests:
+
+```bash
+# Compile with debug symbols
+make CFLAGS="-g -O0 -DDEBUG"
+
+# Run with gdb
+gdb ./test_plugin
+```
+
+## Cleaning Up
+
+```bash
+# Remove all build artifacts and temporary files
+make clean
+```
+
+This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
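+
+## Appendix: Example Tuner Configuration
+
+For quick reference, the tests exercise config lines in the CSV layout
+`collective,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`,
+where `-1` means "match anything" (or, for channels, "keep NCCL's default"). The snippet below is
+only an illustrative sketch assembled from values used in the tests above, not a tuned recommendation:
+
+```
+# allreduce up to 64KB on a single node: tree/simple with 2 channels
+allreduce,0,65536,tree,simple,2,1,-1,-1,-1
+# broadcast up to 32KB on any topology: ring/ll128, keep NCCL's default channel count
+broadcast,0,32768,ring,ll128,-1,-1,-1,-1,-1
+```
+
+Individual tests can also be selected by name via the test registry in `test_plugin.c`, for example:
+
+```bash
+# Run only the collective and size matching tests
+./test_plugin collective size
+```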
diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c new file mode 100644 index 0000000..28897c4 --- /dev/null +++ b/ext-tuner/example/test/test_plugin.c @@ -0,0 +1,856 @@ +/************************************************************************* + * Unit tests for NCCL Tuner Plugin + ************************************************************************/ + +#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions + +#include +#include +#include +#include +#include +#include +#include + + +// Include NCCL tuner header (which includes common.h and err.h) +#include "tuner.h" + +// Include plugin source for testing +#include "../plugin.c" + +// Test framework macros +#define TEST_ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + printf("FAIL: %s - %s\n", __func__, message); \ + return 0; \ + } \ + } while(0) + +#define TEST_PASS() \ + do { \ + printf("PASS: %s\n", __func__); \ + return 1; \ + } while(0) + +// Global test state +static int test_log_count = 0; + +// Mock logger function +void mock_logger(ncclDebugLogLevel level, unsigned long flags, + const char* file, int line, const char* fmt, ...) { + (void)flags; // Suppress unused parameter warning + test_log_count++; + + // Check if we should print based on NCCL_DEBUG level + const char* debug_level = getenv("NCCL_DEBUG"); + int should_print = 0; + + if (debug_level) { + if (strcmp(debug_level, "TRACE") == 0) { + should_print = 1; // Print everything + } else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) { + should_print = 1; // Print INFO and below + } else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) { + should_print = 1; // Print WARN and below + } + } + + if (!should_print) return; + + // Convert log level to string + const char* level_str; + switch(level) { + case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_VERSION: level_str = "VERSION"; break; + case NCCL_LOG_WARN: level_str = "WARN"; break; + case NCCL_LOG_INFO: level_str = "INFO"; break; + case NCCL_LOG_ABORT: level_str = "ABORT"; break; + case NCCL_LOG_TRACE: level_str = "TRACE"; break; + default: level_str = "UNKNOWN"; break; + } + + // Print log header + printf("[TUNER:%s:%s:%d] ", level_str, file, line); + + // Print formatted message + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} + +// Helper function to create test config file +void create_test_config(const char* filename, const char* content) { + FILE* f = fopen(filename, "w"); + if (f) { + fprintf(f, "%s", content); + fclose(f); + } +} + +// Test 1: Plugin initialization +int test_plugin_init() { + void* context = NULL; + + // Test successful initialization + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Clean up + pluginDestroy(context); + TEST_PASS(); +} + +// Test 2: Configuration file parsing - valid CSV +int test_config_parsing_valid() { + const char* test_config = + "# Test configuration\n" + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n" + "# Comment line\n" + "\n" // Empty line + "reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n"; + + create_test_config("test_valid.conf", test_config); + + // Set environment variable to use our test config + setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1); + + void* context = NULL; + ncclResult_t result = 
pluginInit(16, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed"); + + // Clean up + pluginDestroy(context); + unlink("test_valid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 3: Configuration file parsing - invalid CSV +int test_config_parsing_invalid() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n" + "invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n" + "broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers + + create_test_config("test_invalid.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 1, mock_logger, &context); + // Should still succeed but with no valid configs loaded + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config"); + + // Clean up + pluginDestroy(context); + unlink("test_invalid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 4: Collective type matching +int test_collective_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n"; + + create_test_config("test_match.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + // Create mock cost table + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + + // Test allreduce matching (should match first config) + ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 8, "Should set 8 channels"); + + // Test broadcast matching (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Reset costs + } + } + + result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 4, "Should set 4 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_match.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 5: Size range matching +int test_size_matching() { + const char* test_config = + "allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n" + 
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n" + "allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n"; + + create_test_config("test_size.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + int nChannels = 1; + + pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels"); + + // Test medium message (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels"); + + // Test large message (should match third config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_size.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 6: Topology matching +int test_topology_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only + "allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology + + create_test_config("test_topo.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1); + + // Test with single node setup + void* context1 = NULL; + pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + 
cost_table[i][j] = 1.0; + } + } + + int nChannels; + pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config"); + TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels"); + + pluginDestroy(context1); + + // Test with 4 nodes, 32 ranks setup + void* context2 = NULL; + pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes + + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config"); + TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels"); + + // Clean up + unlink("test_topo.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 7: Default channels behavior (-1) +int test_default_channels() { + const char* test_config = + "allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels + + create_test_config("test_default.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels = 99; // Set to known value + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol"); + TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1"); + + // Clean up + pluginDestroy(context); + unlink("test_default.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 8: regBuff matching +int test_regbuff_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only + "allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible) + + create_test_config("test_regbuff.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test registered buffer (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 1, &nChannels); // regBuff = 1 (registered) + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels"); + + // Test non-registered 
buffer (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); // regBuff = 0 (non-registered) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels"); + + // Test backward compatibility - config without regBuff should match any regBuff value + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + // First try with regBuff=2 (unusual value, should match third config) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 2, &nChannels); // regBuff = 2 (only third config should match) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_regbuff.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 9: numPipeOps matching +int test_pipeops_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op + "allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible) + + create_test_config("test_pipeops.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test single pipeline op (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels"); + + // Test multiple pipeline ops (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels"); + + // Test different number of pipeline ops (should match third config - backward compatible) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 
0.0, "Any pipeOps: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_pipeops.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 10: No matching configuration (fallback behavior) +int test_no_match_fallback() { + const char* test_config = + "broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config + + create_test_config("test_fallback.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + // Try allreduce (should not match, use fallback) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified"); + TEST_ASSERT(nChannels == 1, "Should use default channels"); + + // Clean up + pluginDestroy(context); + unlink("test_fallback.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 11: Large configuration files (testing dynamic allocation) +int test_large_config() { + const char* large_config_file = "test_large.conf"; + + // Create a large configuration file with many entries + // This tests the dynamic allocation functionality + FILE* f = fopen(large_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create large config file"); + + // Write header comment + fprintf(f, "# Large configuration file for testing dynamic allocation\n"); + fprintf(f, "# This file contains many configurations to test memory allocation\n"); + + // Generate a large number of configurations (much more than the old MAX_CONFIGS=100) + const int num_configs = 500; // 5x the old static limit + const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"}; + const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"}; + const char* protocols[] = {"simple", "ll", "ll128"}; + + for (int i = 0; i < num_configs; i++) { + // Vary the configurations to create realistic test data + const char* coll = collectives[i % 5]; + const char* algo = algorithms[i % 4]; + const char* proto = protocols[i % 3]; + + size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB + size_t max_bytes = min_bytes + 65536; // 64KB range + int channels = (i % 8) + 1; // 1-8 channels + int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes + int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks + int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps + int regBuff = (i % 3) == 0 ? 
-1 : (i % 2); // Mix of -1, 0, 1 + + fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n", + coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff); + } + + fclose(f); + + // Set environment to use our large config file + setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1); + + // Initialize plugin with large config + void* context = NULL; + ncclResult_t result = pluginInit(16, 4, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Verify that configurations were loaded + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file"); + TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size"); + TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated"); + + // Test that we can access configurations throughout the array + // (This would have failed with the old static MAX_CONFIGS=100 limit) + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + // Basic sanity checks on the loaded configurations + TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce, + "Collective type should be valid"); + TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes"); + TEST_ASSERT(config->nChannels > 0, "nChannels should be positive"); + } + + // Test specific configuration access at various indices + // Index 0 (first config) + TuningConfig* first_config = &ctx->configs[0]; + TEST_ASSERT(first_config != NULL, "First config should be accessible"); + + // Index in middle + TuningConfig* mid_config = &ctx->configs[num_configs / 2]; + TEST_ASSERT(mid_config != NULL, "Middle config should be accessible"); + + // Index near end (this would have crashed with static array of 100) + TuningConfig* late_config = &ctx->configs[num_configs - 1]; + TEST_ASSERT(late_config != NULL, "Last config should be accessible"); + + // Test memory allocation size - verify we didn't over-allocate + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory allocated for %d configurations (%zu bytes total)", + ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig)); + + // Test that the plugin can still find matching configurations from the large set + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + // Try to find a matching configuration - should work with large config set + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set"); + + // Clean up + pluginDestroy(context); + unlink(large_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 12: Very large configuration stress test +int test_very_large_config_stress() { + const char* stress_config_file = "test_stress.conf"; + + // Create an even larger configuration file to stress test the 
implementation + FILE* f = fopen(stress_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create stress test config file"); + + fprintf(f, "# Stress test configuration with very large number of entries\n"); + + // Generate an extremely large number of configurations + const int stress_configs = 2000; // 20x the old static limit + + for (int i = 0; i < stress_configs; i++) { + // Create varied but valid configurations + fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n", + i * 512, (i * 512) + 1024); + } + + fclose(f); + + setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1); + + // Test initialization with stress config + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations"); + TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated"); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Stress test - loaded %d configurations successfully", stress_configs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory usage: %zu bytes for configuration array", + stress_configs * sizeof(TuningConfig)); + + // Verify we can access configurations throughout the entire range + for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config + TuningConfig* config = &ctx->configs[i]; + TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type"); + TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes"); + } + + // Clean up + pluginDestroy(context); + unlink(stress_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 13: Edge case - empty config file +int test_empty_config() { + const char* empty_config_file = "test_empty.conf"; + + // Create empty config file (only comments) + create_test_config(empty_config_file, + "# Empty configuration file\n" + "# No actual configurations\n" + "\n" + "\n"); + + setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations"); + TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations"); + TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config"); + + // Test that plugin still works with no configurations (fallback behavior) + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config"); + + // Clean up + pluginDestroy(context); + unlink(empty_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test runner function pointer type +typedef int (*TestFunction)(void); + +// Test registry +typedef struct { + const char* name; + TestFunction func; + 
const char* description; +} TestCase; + +// All available tests +TestCase test_cases[] = { + {"init", test_plugin_init, "Plugin initialization"}, + {"config-valid", test_config_parsing_valid, "Valid configuration parsing"}, + {"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"}, + {"collective", test_collective_matching, "Collective type matching"}, + {"size", test_size_matching, "Size range matching"}, + {"topology", test_topology_matching, "Topology matching"}, + {"channels", test_default_channels, "Default channels behavior"}, + {"regbuff", test_regbuff_matching, "Registered buffer matching"}, + {"pipeops", test_pipeops_matching, "Pipeline operations matching"}, + {"fallback", test_no_match_fallback, "Fallback behavior"}, + {"large-config", test_large_config, "Large configuration files (dynamic allocation)"}, + {"stress-config", test_very_large_config_stress, "Very large configuration stress test"}, + {"empty-config", test_empty_config, "Empty configuration file handling"}, + {NULL, NULL, NULL} // End marker +}; + +// Show help/usage information +void show_help(const char* program_name) { + printf("Usage: %s [test_name ...]\n\n", program_name); + printf("Available tests:\n"); + for (int i = 0; test_cases[i].name != NULL; i++) { + printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description); + } + printf("\nExamples:\n"); + printf(" %s # Run all tests\n", program_name); + printf(" %s init # Run only initialization test\n", program_name); + printf(" %s init collective # Run initialization and collective tests\n", program_name); + printf(" %s --help # Show this help\n", program_name); +} + +// Find test by name +TestFunction find_test(const char* name) { + for (int i = 0; test_cases[i].name != NULL; i++) { + if (strcmp(test_cases[i].name, name) == 0) { + return test_cases[i].func; + } + } + return NULL; +} + +// Main test runner +int main(int argc, char* argv[]) { + int passed = 0, total = 0; + + // Check for help + if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) { + show_help(argv[0]); + return 0; + } + + printf("Running NCCL Tuner Plugin Unit Tests\n"); + printf("=====================================\n"); + + if (argc == 1) { + // No arguments - run all tests + for (int i = 0; test_cases[i].name != NULL; i++) { + total++; + passed += test_cases[i].func(); + } + } else { + // Run specific tests + for (int arg = 1; arg < argc; arg++) { + TestFunction test_func = find_test(argv[arg]); + if (test_func) { + total++; + passed += test_func(); + } else { + printf("ERROR: Unknown test '%s'\n", argv[arg]); + printf("Use --help to see available tests\n"); + return 1; + } + } + } + + printf("\n=====================================\n"); + printf("Test Results: %d/%d tests passed\n", passed, total); + + if (passed == total) { + printf("All tests PASSED!\n"); + return 0; + } else { + printf("Some tests FAILED!\n"); + return 1; + } +} diff --git a/makefiles/common.mk b/makefiles/common.mk index 8a35a8f..6ba9bbf 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -40,10 +40,12 @@ ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 +CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 -CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ - -gencode=arch=compute_120,code=sm_120 +CUDA12_8_GENCODE = 
-gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 +CUDA13_GENCODE = -gencode=arch=compute_110,code=sm_110 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 @@ -53,10 +55,10 @@ CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) # Prior to SM75 is deprecated from CUDA13.0 onwards - NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA10_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) diff --git a/makefiles/version.mk b/makefiles/version.mk index f41e7a7..013e972 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/device/Makefile b/src/device/Makefile index df58489..67ab176 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -36,9 +36,8 @@ define COMPILE $(call COMPILE$(or $3,$(suffix $2)),$1,$2) endef -ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1) - NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \ - -gencode=arch=compute_120a,code=sm_120a +ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12090))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100f,code=sm_100f else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1) NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a else diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index 0d054bb..d36dfe5 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -1009,7 +1009,7 @@ struct Apply_LoadMultimem { DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4) #endif - #if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 + #if NCCL_CUDA_ARCH_SPECIFIC == 1000 || NCCL_CUDA_ARCH_SPECIFIC == 1010 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index f630ff0..8fcb9a4 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -108,7 +108,7 @@ def required_cuda(k): if k.algo in ldmc_algos: cudart = 12070 arch = None - specific_sms = [100, 120] + specific_sms = ["100a", "101a", "100f", "101f", "120a", "121a"] return 
(cudart, arch, specific_sms) ################################################################################ @@ -145,7 +145,7 @@ def kernel_conds(k): if not specific_sms: arch_cond = "__CUDA_ARCH__ >= %d"%arch else: - arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms]) + arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_%sSPECIFIC==%d"%("FAMILY_" if sm[-1] == "f" else "", 10*int(sm.replace('a', '').replace('f', ''))) for sm in specific_sms]) return cudart_cond, arch_cond def instantiate(k): diff --git a/src/graph/paths.cc b/src/graph/paths.cc index bc5cc75..4b44abd 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -175,6 +175,13 @@ ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu return ncclSuccess; } +static int mergePathType(int type0, int type1){ + int max = std::max(type0,type1); + int min = std::min(type0,type1); + if(max == PATH_PHB && min == PATH_C2C) return PATH_P2C; + else return max; +} + static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; @@ -187,7 +194,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); + srcNode->paths[t2][i2].type = mergePathType(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; @@ -674,9 +681,9 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm int c; NCCLCHECK(ncclGetLocalCpu(system, g, &c)); if (c == -1) continue; - if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) { - gpuNode->paths[NET][n].type = PATH_P2C; - netNode->paths[GPU][g].type = PATH_P2C; + if (mergePathType(gpuNode->paths[CPU][c].type, netNode->paths[CPU][c].type) == PATH_P2C) { + gpuNode->paths[NET][n].type = std::min(PATH_P2C, gpuNode->paths[NET][n].type); + netNode->paths[GPU][g].type = std::min(PATH_P2C, netNode->paths[GPU][g].type); } } } @@ -695,16 +702,15 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (/* (1) is either connected to the NIC with PXB*/ - (peerNode->paths[NET][n].type <= PATH_PXB || - /* or with P2C and PxN over C2C is enabled */ - (ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) && + int pxnType = ncclParamPxnC2c() ? PATH_P2C : PATH_PXB; + if (/* (1) is connected to the NIC with PxN type*/ + peerNode->paths[NET][n].type <= pxnType && /* and (2) is connected to us through NVLink */ peerNode->paths[GPU][g].type <= PATH_NVL && /* and (3) is on the same node as us */ NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB)) + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType)) // We can use that GPU as relay to communicate with that NIC. 
// Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) @@ -725,6 +731,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } } + + // Pre-compute NET local gpus to accelerate search + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &net->net.localGpu)); + } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9d8ad3f..67e6009 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -437,6 +437,65 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop return ncclSuccess; } +// Add the preferred NICs ordered by GPU first +static ncclResult_t ncclTopoPrefNetsGpuFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + const int nGpus = (gpu == -1) ? system->nodes[GPU].count : 1; + int gpuCount = nGpus; + int gpuIds[NCCL_TOPO_MAX_NODES] = {gpu}; + int firstNets[NCCL_TOPO_MAX_NODES]; + if (gpu == -1) + for (int g = 0; g < nGpus; g++) gpuIds[g] = g; + + for (int c = 0; c < MAXCHANNELS; c++) { + for (int g = 0; g < nGpus; g++) { + if (gpuIds[g] == -1) continue; + int localNet; + int64_t netId; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + gpuIds[g]; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &localNet)); + // store the first net found for each GPU in case of duplicates + if(c == 0) firstNets[g] = localNet; + // if the NET has already been returned for channel 0, that GPU is done + if (c > 0 && firstNets[g] == localNet) { + gpuIds[g] = -1; + gpuCount--; + continue; + } + // only add it to the list if it doesn't already exist + int found = 0; + while (found < (*netCount) && nets[found] != localNet) found++; + if (found == (*netCount)) nets[(*netCount)++] = localNet; + } + if (gpuCount == 0) break; + } + return ncclSuccess; +} + +// Add the preferred NICs ordered by channels first +static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + for (int g = 0; g < system->nodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + int localNetCount = 0, localNets[MAXCHANNELS]; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g; + for (int c = 0; c < MAXCHANNELS; c++) { + int64_t netId; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets + localNetCount)); + if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; + localNetCount++; + } + // Append NICs to list + for (int i = 0; i < localNetCount; i++) { + int n = localNets[i]; + int found = 0; + while (found < (*netCount) && nets[found] != n) found++; + if (found == (*netCount)) nets[(*netCount)++] = n; + } + } + return ncclSuccess; +} + // Build a sorted list of the NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu @@ -445,39 +504,25 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. 
- +NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; - int localNetCount; - int localNets[MAXCHANNELS]; - // First add the preferred NICs - for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; - localNetCount = 0; - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - for (int c = 0; cgpu.rank, c, &netId, NULL)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); - if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; - localNetCount++; - } - // Append NICs to list - for (int i=0; inHosts > 1 && ncclParamScatterEnable()) { + // For MNNVL systems, we sort the devices by GPU first, then by channel + NCCLCHECK(ncclTopoPrefNetsGpuFirst(system, gpu, nets, &netCount)); + } else { + // For other systems, we sort the devices by channel first, then by GPU + NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { - for (int g=0; gnodes[GPU].count; g++) { + for (int g = 0; g < system->nodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; - localNetCount = 0; + int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count && npattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { - int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + int gpu = net->net.localGpu; if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -643,13 +687,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } else { - if (graph->nChannels > 0) { + if (graph->nChannels > 0 && graph->sameChannels == 1) { // Try to replay the last channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); - } - if (graph->nChannels == 0 || graph->sameChannels == 0) { + } else { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; @@ -658,11 +701,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs + int localGpu = net->net.localGpu; + if (localGpu != -1) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpu)); + } int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType; NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType)); // if no GPUs are connected, skip this net if (pathType == PATH_DIS) continue; for (int g = 0; g < localGpuCount; ++g) { + if (localGpus[g] == localGpu) continue; // We already tried this one NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g])); } } @@ -749,8 +797,8 @@ struct kvDict kvDictLinkType[] = { { "NVB", PATH_NVB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", 
PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -798,8 +846,10 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); - if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; + NCCLCHECK(xmlGetAttr(xmlGraph, "latencyinter", &str)); + if (!str) INFO(NCCL_GRAPH, "latencyinter not found in graph, using 0.0"); + graph->latencyInter = str ? strtof(str, NULL) : 0.0; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); @@ -910,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; -float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1136,8 +1186,12 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); } for (int i=0; iintra[ngpus*c+i]); + int g; + ncclTopoRankToIndex(system, graph->intra[ngpus * c + i], &g, true); + int64_t topoId = system->nodes[GPU].nodes[g].id; + sprintf(line + offset, " %s/%lx-%lx", topoNodeTypeStr[GPU], NCCL_TOPO_ID_SYSTEM_ID(topoId), NCCL_TOPO_ID_LOCAL_ID(topoId)); offset = strlen(line); + if (graph->id == 3) break; // NVLS graphs only use the first GPU } if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9fe81bb..8fdf54e 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -21,7 +21,7 @@ const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "P2C", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -677,7 +677,14 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } - for (int systemId=0; systemIdnHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId; + + int systemId = 0; + while (systemId < system->nHosts && system->hostHashes[systemId] != localHostHash) systemId++; + system->systemId = systemId; + if(systemId == system->nHosts){ + 
WARN("localHostHash = 0x%lx not found in the list of system hostHashes",localHostHash); + return ncclInvalidArgument; + } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); @@ -1143,8 +1150,8 @@ struct kvDict nicPathKvList[] = { { "PORT", PATH_PORT }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -1421,7 +1428,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } // Only update our topo tracking structure if we aren't dumping (separate steps) - if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, getHostHash()), ret, fail); exit: if (!comm->MNNVL && localRanks) free(localRanks); diff --git a/src/graph/topo.h b/src/graph/topo.h index 07ef5e1..9b49c02 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,7 +18,7 @@ #define SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 -#define SM100_NVLINK_BW 40.0 +#define SM100_NVLINK_BW 40.1 #define PCI_BW 12.0 // PCI Gen3 x16 #define AMD_BW 16.0 #define BDW_QPI_BW 6.0 @@ -76,11 +76,11 @@ extern const char* topoLinkTypeStr[]; // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 5 -// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. -#define PATH_PXN 6 - // Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC -#define PATH_P2C 7 +#define PATH_P2C 6 + +// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. +#define PATH_PXN 7 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 8 @@ -143,6 +143,7 @@ struct ncclTopoNode { int gdrSupport; int collSupport; int maxChannels; + int localGpu; }net; struct { int arch; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 64dc5cf..8e99f18 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -455,9 +455,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? PATH_P2C : PATH_PXN))); + if (ncclParamLl128C2c() && minCompCap >= 90) { + // Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN. + pEnable &= (graphs[a]->typeInter <= PATH_PXN); + } else { + // Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links. + pEnable &= (graphs[a]->typeInter <= PATH_PXB); + if (!ncclParamLl128C2c() && minCompCap >= 90) + INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). 
diff --git a/src/init.cc b/src/init.cc
index 83764a8..2a57c46 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1507,7 +1507,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
   int minCTAsEnv;
   int maxCTAsEnv;
   int splitShareEnv;
-  int collnetEnableEnv;
+  const char* collnetEnableEnv;
   int ctaPolicyEnv;
   int shrinkShareEnv;
   int nvlsCTAsEnv;
@@ -1561,9 +1561,15 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
     comm->config.shrinkShare = shrinkShareEnv;
   }
 
-  collnetEnableEnv = ncclParamCollnetEnable();
-  if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) {
-    comm->config.collnetEnable = collnetEnableEnv;
+  // NCCL_COLLNET_ENABLE needs to be reloaded each time for comm init
+  // since users might change the env on the fly to enable/disable collnet
+  collnetEnableEnv = ncclGetEnv("NCCL_COLLNET_ENABLE");
+  if (collnetEnableEnv != NULL) {
+    int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0);
+    if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) {
+      comm->config.collnetEnable = collnetEnableInt;
+      INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt);
+    }
   }
 
   ctaPolicyEnv = ncclParamCtaPolicy();
diff --git a/src/misc/mlx5dvsymbols.cc b/src/misc/mlx5dvsymbols.cc
index 5bb4109..47cc4eb 100644
--- a/src/misc/mlx5dvsymbols.cc
+++ b/src/misc/mlx5dvsymbols.cc
@@ -52,6 +52,9 @@ ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
 #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
   cast = (void**)&funcptr; \
   *cast = dlvsym(handle, symbol, version); \
+  if (*cast == NULL) { \
+    INFO(NCCL_NET, "dlvsym failed on %s - %s version %s", symbol, dlerror(), version); \
+  } \
 } while (0)
 
 LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported);
diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc
index 0adb4b1..1766f41 100644
--- a/src/misc/strongstream.cc
+++ b/src/misc/strongstream.cc
@@ -21,7 +21,6 @@ struct ncclStrongStreamCapture {
   cudaGraph_t graph;
   unsigned long long graphId;
   cudaStream_t captureStream;
-  cudaGraphNode_t lastRecord;
   void* acquiredBy;
 };
 
@@ -216,7 +215,6 @@ ncclResult_t ncclStrongStreamAcquire(
       CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
     }
     cap->graphId = graph.graphId;
-    cap->lastRecord = nullptr;
     cap->acquiredBy = localThreadId();
     // Push to capturing list.
     cap->next = ss->captureHead;
@@ -296,16 +294,6 @@ ncclResult_t ncclStrongStreamRelease(
     cudaGraphNode_t recordNode;
     CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent));
 
-    // Make this record order after previous record on this stream.
-    if (cap->lastRecord != nullptr) {
-      #if CUDART_VERSION >= 13000
-        CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1));
-      #else
-        CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1));
-      #endif
-    }
-    cap->lastRecord = recordNode;
-
     // Get current nodes from work stream so we can add them as dependencies.
     cudaStreamCaptureStatus status;
     cudaGraphNode_t const* nodes;
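The strongstream.cc hunk that follows drops the explicit lastRecord chaining and instead tells the capturing stream that everything captured from now on must depend on the new event-record node. A minimal sketch of that CUDA call pattern is shown below (CUDA < 13 entry point only, error handling omitted); the helper name is hypothetical, and the real code also folds in the nodes gathered from the work stream.

/* Illustration only: make all future captured work depend on one graph node. */
#include <cuda_runtime.h>

static void pinFutureCaptureAfter(cudaStream_t captureStream, cudaGraph_t graph, cudaEvent_t ev) {
  cudaGraphNode_t recordNode;
  /* Add an event-record node to the graph being captured... */
  cudaGraphAddEventRecordNode(&recordNode, graph, /*deps=*/NULL, 0, ev);
  /* ...then make it the sole dependency of everything captured on this stream from now on. */
  cudaStreamUpdateCaptureDependencies(captureStream, &recordNode, 1, cudaStreamSetCaptureDependencies);
}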
@@ -338,6 +326,22 @@ ncclResult_t ncclStrongStreamRelease(
       }
     }
 
+    // Make every future operation captured on cap->captureStream depend on 'recordNode'.
+    #if CUDART_VERSION >= 13000
+      CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(
+        cap->captureStream,
+        &recordNode,          /* dependencies */
+        /*edges =*/ nullptr,  /* no edge annotations */
+        1,                    /* count */
+        cudaStreamSetCaptureDependencies));
+    #else
+      CUDACHECK(cudaStreamUpdateCaptureDependencies(
+        cap->captureStream,
+        &recordNode,
+        1,
+        cudaStreamSetCaptureDependencies));
+    #endif
+
     if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
       WARN("%s", launchRaceFatalMsg);
       return ncclInvalidUsage;
diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc
index a9c1d0d..64c97be 100644
--- a/src/plugin/plugin_open.cc
+++ b/src/plugin/plugin_open.cc
@@ -61,20 +61,20 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
   char eNoEntNameList[PATH_MAX] = { 0 };
 
   if (libName && strlen(libName)) {
-    // match names that start with 'lib' and end with '.so'
-    if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
-      snprintf(libName_, MAX_STR_LEN, "%s", libName);
-      libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
-      if (libHandles[type]) {
-        INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
-        return libHandles[type];
-      }
-      if (openErr == ENOENT) {
-        appendNameToList(eNoEntNameList, &len, libName_);
-      } else {
-        INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
-      }
+    snprintf(libName_, MAX_STR_LEN, "%s", libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
     } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+
+    // libName can't be a relative or absolute path (start with '.' or contain any '/'). It can't be a library name either (start with 'lib' or end with '.so')
+    if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) {
       snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
       libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
       if (libHandles[type]) {
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 19a505e..40897d9 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -494,7 +494,9 @@ static int ibvSpeeds[] = {
   14000, /* FDR */
   25000, /* EDR */
   50000, /* HDR */
-  100000 /* NDR */ };
+  100000, /* NDR */
+  200000 /* XDR */
+};
 
 static int firstBitSet(int val, int max) {
   int i = 0;
@@ -654,7 +656,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
       ibProvider = IB_PROVIDER_MLX5;
       snprintf(dataDirectDevicePath, PATH_MAX, "/sys");
       if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) {
-        INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
+        INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
         if(ncclParamIbDataDirect()) dataDirectSupported = 1;
       }
     }
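As a usage note on the plugin_open.cc change above: a plugin name set through an environment variable such as NCCL_NET_PLUGIN is now tried verbatim first (so full paths and full library names keep working), and only a bare name with no '/' that is not already a lib*.so name is then expanded with the per-plugin prefix. The sketch below restates that resolution order; candidateNames, looksLikeLibName, and the fixed buffer size are illustrative, not the literal NCCL helpers.

/* Illustration only: the two candidate file names tried for a plugin. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool looksLikeLibName(const char* name) {
  size_t n = strlen(name);
  return n >= strlen("libX.so") &&
         strncmp(name, "lib", 3) == 0 &&
         strcmp(name + n - 3, ".so") == 0;
}

static void candidateNames(const char* prefix, const char* name, char first[256], char second[256]) {
  /* First attempt: the name exactly as given (a path, or an explicit lib*.so name). */
  snprintf(first, 256, "%s", name);
  /* Second attempt: only for bare names, e.g. prefix "libnccl-net" + "foo" -> "libnccl-net-foo.so". */
  second[0] = '\0';
  if (strchr(name, '/') == NULL && !looksLikeLibName(name))
    snprintf(second, 256, "%s-%s.so", prefix, name);
}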