diff --git a/ext-net/example/Makefile b/ext-net/example/Makefile index e0a6aa6..9cc623e 100644 --- a/ext-net/example/Makefile +++ b/ext-net/example/Makefile @@ -3,15 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-net.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-net-example.so + +${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-net-example.so diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index f5cc9f1..777ff5b 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -3,14 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME := ../../build -INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO := libnccl-profiler.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c event.c print_event.c - $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-profiler-example.so + +${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-profiler-example.so diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index e3f707a..b89cd46 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "event.h" #include "print_event.h" @@ -41,22 +41,10 @@ static struct proxyOp* detachPool; ncclDebugLogger_t logFn; #define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -static double freq = -1; -__hidden void calibrate() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t timeCycles = __rdtsc(); - double time = - tv.tv_sec*1e6 - tv.tv_usec; - uint64_t total = 0ULL; - for (int i = 0; i < 10000; i++) total += __rdtsc(); - gettimeofday(&tv, NULL); - timeCycles = __rdtsc() - timeCycles; - time += tv.tv_sec*1e6 + tv.tv_usec; - freq = timeCycles / time; -} - __hidden double gettime(void) { - return __rdtsc() / freq; + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return (t.tv_sec*1e6 + (t.tv_nsec*1e-3)); } static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; @@ -98,8 +86,6 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, // process address space. pid = getpid(); - // calibrate and start timer - calibrate(); startTime = gettime(); } pthread_mutex_unlock(&lock); diff --git a/ext-tuner/basic/Makefile b/ext-tuner/basic/Makefile new file mode 100644 index 0000000..50edd23 --- /dev/null +++ b/ext-tuner/basic/Makefile @@ -0,0 +1,23 @@ +# +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . 
+NCCLDIR := $(BUILDDIR) + +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +build: ${BUILDDIR}/libnccl-tuner-basic.so + +${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ + +clean: + rm -f ${BUILDDIR}/libnccl-tuner-basic.so diff --git a/ext-tuner/basic/nccl/common.h b/ext-tuner/basic/nccl/common.h new file mode 100644 index 0000000..9129252 --- /dev/null +++ b/ext-tuner/basic/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-tuner/basic/nccl/err.h b/ext-tuner/basic/nccl/err.h new file mode 100644 index 0000000..bb92e83 --- /dev/null +++ b/ext-tuner/basic/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-tuner/basic/nccl/tuner.h b/ext-tuner/basic/nccl/tuner.h new file mode 100644 index 0000000..77b543d --- /dev/null +++ b/ext-tuner/basic/nccl/tuner.h @@ -0,0 +1,97 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include +#include + +#include "common.h" +#include "err.h" + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#define NCCL_ALGO_PROTO_IGNORE -1.0 + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. 
Each communicator initializes its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - context: tuner context object + // - collType: collective type, e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/ext-tuner/basic/plugin.c b/ext-tuner/basic/plugin.c new file mode 100644 index 0000000..a17fd00 --- /dev/null +++ b/ext-tuner/basic/plugin.c @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "tuner.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } + +__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) { + // Update NCCL core generated cost table.
Updated table will be evaluated by NCCL to pick the best algo/proto combo + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + } + *nChannels = 1; + return ncclSuccess; +} + +__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } + +#define PLUGIN_NAME "Basic" + +const ncclTuner_v4_t ncclTunerPlugin_v4 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .getCollInfo = pluginGetCollInfo, + .destroy = pluginDestroy +}; diff --git a/ext-tuner/example/Makefile b/ext-tuner/example/Makefile index 9d9ace4..76c16b6 100644 --- a/ext-tuner/example/Makefile +++ b/ext-tuner/example/Makefile @@ -3,15 +3,53 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-tuner.so -default: $(PLUGIN_SO) +.DEFAULT_GOAL: build +PLUGIN_SO:=libnccl-tuner-example.so +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +default: ${BUILDDIR}/$(PLUGIN_SO) + +build: ${BUILDDIR}/$(PLUGIN_SO) + +${BUILDDIR}/$(PLUGIN_SO): plugin.c + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +# Test targets - delegate to test directory +test: + $(MAKE) -C test test TEST_CASE=$(TEST_CASE) + +test-verbose: + $(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE) + +# Build tests +test-build: + $(MAKE) -C test all + +# Optimize configurations from performance data +optimize-config: + @if [ -z "$(CSV_FILE)" ]; then \ + echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \ + echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \ + exit 1; \ + fi + python3 scripts/optimize_config.py $(CSV_FILE) \ + $(if $(OUTPUT),-o $(OUTPUT)) \ + $(if $(METRIC),-m $(METRIC)) \ + $(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \ + $(if $(DRY_RUN),--dry-run) \ + $(if $(NO_HEADER),--no-header) clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/$(PLUGIN_SO) + $(MAKE) -C test clean + +.PHONY: test test-verbose test-build optimize-config clean diff --git a/ext-tuner/example/README.md b/ext-tuner/example/README.md new file mode 100644 index 0000000..7f472ae --- /dev/null +++ b/ext-tuner/example/README.md @@ -0,0 +1,164 @@ +# NCCL Example Tuner Plugin + +This example plugin shows a practical example of a CSV file-based tuning approach, allowing selective overrides for tuning parameters based on all tuning inputs without recompiling. + +## Features + +- **File-based Configuration**: Read tuning parameters from a CSV configuration file +- **Size-based Tuning**: Specify different configurations based on message size ranges +- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks +- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default +- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE` +- **Fallback Behavior**: Gracefully handles missing config files and invalid entries + +## Building + +```bash +make +``` + +This will create `libnccl-tuner-example.so` that can be loaded by NCCL. 
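+ +For reference, a minimal build sketch (assumptions: the plugin is built from inside the NCCL source tree so that `../../makefiles/common.mk` resolves, and `BUILDDIR` is the variable defined in the Makefile above): + +```bash +# Build in place (default BUILDDIR is the current directory) +make + +# Or stage the shared object in a separate directory +make BUILDDIR=$PWD/build   # produces build/libnccl-tuner-example.so +```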
+ +## Configuration File Format + +The configuration file uses CSV (Comma-Separated Values) format with one configuration per line: + +``` +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +``` + +### Parameters + +- **collective_type**: The collective operation type + - `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce` + +- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies + - Use `0` for minimum and `4294967295` for maximum (covers all sizes) + +- **algorithm**: The NCCL algorithm to use + - `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat` + +- **protocol**: The NCCL protocol to use + - `ll`, `ll128`, `simple` + +- **channels**: Number of channels (SMs) to use + - Use a positive integer to specify exact channel count + - Use `-1` to keep NCCL's default channel selection + +- **nNodes**: Number of nodes to match + - Use a positive integer to match specific node count + - Use `-1` to match any number of nodes + +- **nRanks**: Number of ranks to match + - Use a positive integer to match specific rank count + - Use `-1` to match any number of ranks + +- **numPipeOps**: Number of pipeline operations to match (optional) + - Use a positive integer to match specific pipeline operation count + - Use `-1` to match any number of pipeline operations + - If omitted, configuration will match any numPipeOps value + +- **regBuff**: Whether user buffer can be registered (optional) + - Use `0` to match only non-registered buffers + - Use `1` to match only registered buffers + - Use `-1` to match either registered or non-registered buffers + - If omitted, configuration will match any regBuff value + +### Example Configuration + +```csv +# Single-node, small allreduce: use tree algorithm, registered buffers only +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Single-node broadcast: prefer tree, any pipeOps, registered buffers (backward compatible) +broadcast,0,32768,tree,simple,-1,1,-1 + +# Multi-node broadcast: optimized for non-registered buffers, single pipeline op +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 +``` + +Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc. + +### Backward Compatibility + +Configurations without the numPipeOps and/or regBuff parameters are fully supported: +- 8 fields: matches any numPipeOps and regBuff values +- 9 fields: matches any regBuff value +- 10 fields: full parameter specification + +This ensures existing configuration files continue to work without modification. + +## Usage + +### Method 1: Default Config File +Place your configuration in `nccl_tuner.conf` in the current working directory. + +### Method 2: Environment Variable +Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf +export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH +mpirun -np 4 your_nccl_application +``` + +## Editing Configuration Files + +### Generating Configuration Files from Raw Data + +A python script to generate valid CSV configs has been provided. 
[Using optimize_config.py](scripts/README.md). + +### Spreadsheet Tips: +- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff` +- Save as CSV format (not Excel format) for the plugin to read +- Use data validation to prevent typos in algorithm/protocol names + +## Logging + +The plugin uses NCCL's logging system. To see tuner-related messages: + +```bash +export NCCL_DEBUG=INFO +``` + +This will show when configurations are loaded and applied, including the topology information. + +For detailed debugging output during tuning decisions: + +```bash +export NCCL_DEBUG=TRACE +``` + +This will show verbose information about which configurations are being evaluated and matched. + +## Dimension Matching + +Configurations are only applied when the topology matches: + +- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks +- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks +- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node +- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology + +This allows you to create specialized configurations for different cluster setups while maintaining flexibility. + +## Default Behavior + +If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic. + +When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size. + +## Troubleshooting + +1. **Config file not found**: Check the file path and permissions +2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters +3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory +4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO` +5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards +6. 
**CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces diff --git a/ext-tuner/example/nccl_tuner.conf b/ext-tuner/example/nccl_tuner.conf new file mode 100644 index 0000000..13eb2f0 --- /dev/null +++ b/ext-tuner/example/nccl_tuner.conf @@ -0,0 +1,45 @@ +# NCCL Tuner Configuration File (CSV Format) +# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +# +# Collective types: broadcast, reduce, allgather, reducescatter, allreduce +# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat +# Protocols: ll, ll128, simple +# Channels: number of channels to use, or -1 to keep default +# nNodes: number of nodes to match, or -1 for any number of nodes +# nRanks: number of ranks to match, or -1 for any number of ranks +# numPipeOps: number of pipeline operations to match, or -1 for any number (optional) +# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional) +# +# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value +# +# Examples: + +# For single-node configurations with registered buffers +# Small allreduce operations on single node - use tree algorithm, registered buffers +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers +# Medium allreduce operations - use ring algorithm +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types +# Single node broadcast - prefer tree, any pipeOps, registered buffers only +broadcast,0,32768,tree,simple,-1,1,-1,-1,1 + +# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 + +# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type +allgather,0,4294967295,ring,simple,4,2,-1 + +# ReduceScatter operations +# Small messages on single node, single pipeline op, registered buffers +reducescatter,0,131072,tree,simple,2,1,-1,1,1 +# Large messages on any topology, multiple pipeline ops, non-registered buffers +reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0 + +# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type +reduce,0,4294967295,tree,simple,-1,-1,-1 diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 7925dcf..1b8031e 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -5,24 +5,443 @@ ************************************************************************/ #include "tuner.h" +#include +#include +#include #define __hidden __attribute__ ((visibility("hidden"))) +#define MAX_LINE_LENGTH 256 -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } +// CSV field indices for configuration parsing +// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +#define CONFIG_FIELD_COLLTYPE 0 +#define CONFIG_FIELD_MINBYTES 1 +#define CONFIG_FIELD_MAXBYTES 2 +#define CONFIG_FIELD_ALGORITHM 3 +#define CONFIG_FIELD_PROTOCOL 4 +#define CONFIG_FIELD_CHANNELS 5 +#define CONFIG_FIELD_NNODES 6 +#define CONFIG_FIELD_NRANKS 
7 +#define CONFIG_FIELD_PIPEOPS 8 // Optional field +#define CONFIG_FIELD_REGBUFF 9 // Optional field + +// Field count constants +#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks) +#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps +#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff +#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported + +typedef struct { + ncclFunc_t collType; + size_t minBytes; + size_t maxBytes; + int algorithm; + int protocol; + int nChannels; + int nNodes; + int nRanks; + int numPipeOps; + int regBuff; +} TuningConfig; + +typedef struct { + TuningConfig* configs; // Changed from static array to dynamic pointer + int numConfigs; + int maxConfigs; // Added to track allocated size + size_t nRanks; + size_t nNodes; + ncclDebugLogger_t logFunction; +} TunerContext; + +// Parse collective type from string +static ncclFunc_t parseCollType(const char* str) { + if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast; + if (strcmp(str, "reduce") == 0) return ncclFuncReduce; + if (strcmp(str, "allgather") == 0) return ncclFuncAllGather; + if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter; + if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce; + return ncclFuncAllReduce; // default +} + +// Convert collective type to string +static const char* collTypeToString(ncclFunc_t collType) { + switch (collType) { + case ncclFuncBroadcast: return "broadcast"; + case ncclFuncReduce: return "reduce"; + case ncclFuncAllGather: return "allgather"; + case ncclFuncReduceScatter: return "reducescatter"; + case ncclFuncAllReduce: return "allreduce"; + default: return "unknown"; + } +} + +// Parse algorithm from string +static int parseAlgorithm(const char* str) { + if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE; + if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING; + if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT; + if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN; + if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS; + if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE; + if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT; + return NCCL_ALGO_RING; // default +} + +// Convert algorithm to string +static const char* algorithmToString(int algorithm) { + switch (algorithm) { + case NCCL_ALGO_TREE: return "tree"; + case NCCL_ALGO_RING: return "ring"; + case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct"; + case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain"; + case NCCL_ALGO_NVLS: return "nvls"; + case NCCL_ALGO_NVLS_TREE: return "nvls_tree"; + case NCCL_ALGO_PAT: return "pat"; + default: return "unknown"; + } +} + +// Parse protocol from string +static int parseProtocol(const char* str) { + if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL; + if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128; + if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE; + return NCCL_PROTO_SIMPLE; // default +} + +// Convert protocol to string +static const char* protocolToString(int protocol) { + switch (protocol) { + case NCCL_PROTO_LL: return "ll"; + case NCCL_PROTO_LL128: return "ll128"; + case NCCL_PROTO_SIMPLE: return "simple"; + default: return "unknown"; + } +} + +// Helper function to count valid configuration lines in file +static int countConfigLines(const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + return 0; + } + + char line[MAX_LINE_LENGTH]; + int count = 0; + + while (fgets(line, 
sizeof(line), file)) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Check if line has content + if (strlen(line) > 0) { + count++; + } + } + + fclose(file); + return count; +} + +// Load configuration from file +static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config file %s not found, using defaults", filename); + } + return ncclSuccess; // Not finding config file is not an error + } + + // First pass: count valid configuration lines + int configCount = countConfigLines(filename); + if (configCount == 0) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No valid configurations found in %s", filename); + } + fclose(file); + return ncclSuccess; + } + + // Allocate memory for configurations based on actual count + ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig)); + if (!ctx->configs) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount); + } + fclose(file); + return ncclSystemError; + } + + ctx->maxConfigs = configCount; + ctx->numConfigs = 0; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount); + } + + // Reset file pointer to beginning + fseek(file, 0, SEEK_SET); + + char line[MAX_LINE_LENGTH]; + + while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + char* token; + char* tokens[CONFIG_FIELDS_MAX]; + int tokenCount = 0; + + // Make a copy of the line for tokenizing + char lineCopy[MAX_LINE_LENGTH]; + strncpy(lineCopy, line, sizeof(lineCopy)); + lineCopy[sizeof(lineCopy) - 1] = '\0'; + + // Tokenize by comma + token = strtok(lineCopy, ","); + while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) { + // Trim whitespace + while (*token == ' ' || *token == '\t') token++; + char* end = token + strlen(token) - 1; + while (end > token && (*end == ' ' || *end == '\t')) { + *end = '\0'; + end--; + } + tokens[tokenCount++] = token; + token = strtok(NULL, ","); + } + + // Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10) + if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) { + TuningConfig* config = &ctx->configs[ctx->numConfigs]; + config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]); + config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10); + config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10); + config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]); + config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]); + config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]); + config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]); + config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]); + + // numPipeOps is optional (9th field, index 8) + if 
(tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) { + config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]); + } else { + config->numPipeOps = -1; // -1 means match any numPipeOps + } + + // regBuff is optional (10th field, index 9) + if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) { + config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]); + } else { + config->regBuff = -1; // -1 means match any regBuff value + } + + ctx->numConfigs++; + + if (ctx->logFunction) { + if (config->numPipeOps == -1 && config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks); + } else if (config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps); + } else if (config->numPipeOps == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->regBuff); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + } + } + } + + fclose(file); + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename); + } + return ncclSuccess; +} + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); + if (!ctx) return ncclSystemError; + + ctx->configs = NULL; // Initialize to NULL + ctx->numConfigs = 0; + ctx->maxConfigs = 0; // Initialize to 0 + ctx->nRanks = nRanks; + ctx->nNodes = nNodes; + ctx->logFunction = logFunction; + + if (logFunction) { + logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks); + } + + // Try to load config file from environment variable or default location + const char* configFile = getenv("NCCL_TUNER_CONFIG_FILE"); + if (!configFile) { + configFile = "nccl_tuner.conf"; // default config file name + } + + ncclResult_t result = loadConfig(ctx, configFile); + if (result != ncclSuccess) { + if (ctx->configs) { + free(ctx->configs); // Clean up allocated memory on error + } + free(ctx); + return result; + } + + *context = ctx; + return ncclSuccess; +} __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t 
nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff, int* nChannels) { - // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { - table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; - } + TunerContext* ctx = (TunerContext*)context; + if (!ctx) return ncclInternalError; + + // Default channels *nChannels = 1; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d", + collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs); + } + + // Look for matching configuration + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d", + i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + + // Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff + if (config->collType == collType && + nBytes >= config->minBytes && + nBytes <= config->maxBytes && + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) && + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) && + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps) && + (config->regBuff == -1 || config->regBuff == regBuff)) { + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config matches. 
Applying algo=%s, proto=%s, channels=%d", + algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels); + } + + // Check bounds + if (config->algorithm < numAlgo && config->protocol < numProto) { + if (collCostTable[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)", + algorithmToString(config->algorithm), protocolToString(config->protocol), + &collCostTable[config->algorithm][config->protocol], collCostTable[config->algorithm][config->protocol]); + } + collCostTable[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration + + // Only override channels if not set to -1 (keep default) + if (config->nChannels != -1) { + *nChannels = config->nChannels; + } + + if (ctx->logFunction) { + if (config->nChannels == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nChannels, config->nNodes, config->nRanks); + } + } + return ncclSuccess; + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE", + algorithmToString(config->algorithm), protocolToString(config->protocol)); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)", + algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d", + config->collType == collType, + (nBytes >= config->minBytes && nBytes <= config->maxBytes), + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes), + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks), + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps), + (config->regBuff == -1 || config->regBuff == regBuff)); + } + } + } + + // If no specific config found, apply default behavior + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No matching config found"); + } + return ncclSuccess; } -__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } +__hidden ncclResult_t pluginDestroy(void* context) { + if (context) { + TunerContext* ctx = (TunerContext*)context; + if (ctx->configs) { + free(ctx->configs); // Free dynamically allocated configs array + } + free(context); + } + return 
ncclSuccess; +} #define PLUGIN_NAME "Example" diff --git a/ext-tuner/example/scripts/README.md b/ext-tuner/example/scripts/README.md new file mode 100644 index 0000000..d31de43 --- /dev/null +++ b/ext-tuner/example/scripts/README.md @@ -0,0 +1,106 @@ +# NCCL Tuner Configuration Scripts + +This directory contains scripts for optimizing NCCL tuner configurations based on performance data. + +## optimize_config.py + +A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations. + +### Usage + +```bash +python scripts/optimize_config.py [options] +``` + +### Options + +- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`) +- `-m, --metric METRIC`: Optimization metric, either `bandwidth_gbps` or `latency_us` (default: `latency_us`) +- `--no-header`: Don't add header comments to output file +- `--dry-run`: Print configurations without writing to file + +### CSV Input Format + +The input CSV file should have the following columns: + +```csv +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +``` + +**Required columns:** +- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.) +- `size_bytes`: Message size in bytes +- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.) +- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`) +- `channels`: Number of channels (or `-1` for default) +- `nodes`: Number of nodes (or `-1` for any) +- `ranks`: Number of ranks (or `-1` for any) +- `pipeOps`: Number of pipeline operations (or `-1` for any) +- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any) + +**Optional metrics (must have at least one present):** +- `bandwidth_gbps`: Bandwidth in GB/s (higher is better) +- `latency_us`: Latency in microseconds (lower is better) + +### Examples + +**Basic usage with the default metric (latency):** +```bash +python scripts/optimize_config.py sample_performance_data.csv +``` + +**Optimize for bandwidth and write to custom file:** +```bash +python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv +``` + +**Preview configurations without writing:** +```bash +python scripts/optimize_config.py --dry-run performance_data.csv +``` + +### How It Works + +1. **Data Loading**: Reads CSV performance data and validates format +2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters +3. **Size Ranges**: Automatically bins data into size ranges for optimization +4. **Optimization**: Finds the best performing configuration for each group/size combination +5. **Output**: Generates NCCL tuner config format and appends to specified file + +### Default Size Ranges + +The script uses these default size ranges (in bytes): +- Small: 0 - 1,024 +- Medium: 1,025 - 65,536 +- Large: 65,537 - 1,048,576 +- XLarge: 1,048,577 - 16,777,216 +- XXLarge: 16,777,217 - 4,294,967,295 + +### Sample Data + +See `sample_performance_data.csv` for an example of the expected input format. + +### Integration with NCCL + +The generated configuration file can be used directly with the NCCL tuner plugin: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf +export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner-example.so +mpirun -np 8 your_nccl_application +``` + +### Performance Data Collection + +To collect performance data for optimization, you can: + +1. **Use NCCL benchmarks** with different algorithm/protocol combinations +2. **Profile your applications** with various tuner settings +3.
**Run systematic sweeps** across parameter combinations +4. **Use NCCL debug output** to collect timing information + +The key is to have comprehensive data covering: +- Different message sizes (small to large) +- Various topologies (single node, multi-node) +- All relevant algorithm/protocol combinations +- Different channel counts and pipeline configurations diff --git a/ext-tuner/example/scripts/optimize_config.py b/ext-tuner/example/scripts/optimize_config.py new file mode 100644 index 0000000..c5c9b70 --- /dev/null +++ b/ext-tuner/example/scripts/optimize_config.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +""" +NCCL Tuner Configuration Optimizer + +Reads a CSV file containing performance data across different tuning parameters +and generates optimal NCCL tuner configurations based on the best performing +combinations. + +By default, creates growing size ranges that interpolate between the actual data sizes +for each unique dimension (node count, rank count combination). This ensures that +different cluster configurations get their own optimized size boundaries, as +performance characteristics often vary significantly between topologies. + +Each dimension gets its own set of ranges starting from 0 and extending to the maximum +size for that dimension, with boundaries at midpoints between consecutive data sizes. + +CSV Input Format: +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us + +Output Format (NCCL Tuner Config): +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + +Usage Examples: + # Auto-create dimension-specific interpolated ranges (default) + python3 optimize_config.py data.csv + + # Use custom size ranges (applied to all topologies) + python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576" + + # Use hardcoded default ranges (applied to all topologies) + python3 optimize_config.py data.csv --no-auto-ranges +""" + +import csv +import argparse +import sys +import os +from collections import defaultdict +from typing import Dict, List, Tuple, Any + +class PerformanceData: + def __init__(self, row: Dict[str, str]): + self.collective = row['collective'] + self.size_bytes = int(row['size_bytes']) + self.algorithm = row['algorithm'] + self.protocol = row['protocol'] + self.channels = int(row['channels']) if row['channels'] != '-1' else -1 + self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1 + self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1 + self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1 + self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1 + + # Performance metrics + self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0)) # Higher is better + self.latency_us = float(row.get('latency_us', 0)) # Lower is better + + def get_config_key(self) -> Tuple: + """Generate a key for grouping similar configurations""" + return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff) + + def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]: + """Find which size range this data point belongs to for its dimension""" + topology_key = (self.nodes, self.ranks) + + # Get size ranges for this dimension, or fall back to default + if topology_key in topology_size_ranges: + size_ranges = topology_size_ranges[topology_key] + elif (-1, -1) in topology_size_ranges: + size_ranges = topology_size_ranges[(-1, -1)] + else: + # Fallback to 
first available dimension ranges + size_ranges = next(iter(topology_size_ranges.values())) + + for min_size, max_size in size_ranges: + if min_size <= self.size_bytes <= max_size: + return (min_size, max_size) + # If no range found, create a single-point range + return (self.size_bytes, self.size_bytes) + +class ConfigOptimizer: + def __init__(self, optimization_metric: str = 'latency_us'): + self.optimization_metric = optimization_metric + # Default size ranges - will be overridden by auto-detection + self.size_ranges = [ + (0, 1024), + (1025, 64*1024), + (64*1024+1, 1024*1024), + (1024*1024+1, 16*1024*1024), + (16*1024*1024+1, 4*1024*1024*1024-1) + ] + self.auto_size_ranges = True + + def set_size_ranges(self, ranges: List[Tuple[int, int]]): + """Set custom size ranges for optimization""" + self.size_ranges = ranges + self.auto_size_ranges = False + + def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]: + """Create growing size ranges for each unique (nodes, ranks) dimension""" + if not data: + return {(-1, -1): self.size_ranges} + + # Group data by dimension (nodes, ranks) + topology_data = defaultdict(list) + for item in data: + topology_key = (item.nodes, item.ranks) + topology_data[topology_key].append(item) + + topology_ranges = {} + + for topology_key, items in topology_data.items(): + nodes, ranks = topology_key + + # Extract unique sizes for this dimension and sort them + unique_sizes = sorted(set(item.size_bytes for item in items)) + + if len(unique_sizes) <= 1: + # Only one size, create a single range from 0 to that size + size = unique_sizes[0] if unique_sizes else 0 + ranges = [(0, size)] + else: + # Create growing ranges that interpolate between data points + ranges = [] + + for i, size in enumerate(unique_sizes): + if i == 0: + # First range: 0 to midpoint between first and second size + if len(unique_sizes) > 1: + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + else: + max_size = size + min_size = 0 + elif i == len(unique_sizes) - 1: + # Last range: previous max + 1 to current size (and beyond) + min_size = ranges[-1][1] + 1 + max_size = size + else: + # Intermediate ranges: previous max + 1 to midpoint with next size + min_size = ranges[-1][1] + 1 + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + + ranges.append((min_size, max_size)) + + topology_ranges[topology_key] = ranges + + print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:") + for i, (min_size, max_size) in enumerate(ranges): + # Count data points that fall in this range for this dimension + count = sum(1 for item in items if min_size <= item.size_bytes <= max_size) + actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size)) + if actual_sizes: + size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3]) + if len(actual_sizes) > 3: + size_list += f", ... 
(+{len(actual_sizes)-3} more)" + print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})") + + return topology_ranges + + def load_data(self, csv_file: str) -> List[PerformanceData]: + """Load performance data from CSV file""" + data = [] + try: + with open(csv_file, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + try: + data.append(PerformanceData(row)) + except (ValueError, KeyError) as e: + print(f"Warning: Skipping invalid row: {row} - {e}") + except FileNotFoundError: + print(f"Error: File {csv_file} not found") + sys.exit(1) + except Exception as e: + print(f"Error reading {csv_file}: {e}") + sys.exit(1) + + print(f"Loaded {len(data)} performance data points") + + # Auto-determine size ranges if enabled + if self.auto_size_ranges and data: + self.topology_size_ranges = self.auto_determine_size_ranges(data) + else: + # Use default ranges for all topologies + self.topology_size_ranges = {(-1, -1): self.size_ranges} + + return data + + def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool: + """Determine if new_data is better than current_best""" + if self.optimization_metric == 'bandwidth_gbps': + return new_data.bandwidth_gbps > current_best.bandwidth_gbps + elif self.optimization_metric == 'latency_us': + return new_data.latency_us < current_best.latency_us + else: + # Default to latency + return new_data.latency_us < current_best.latency_us + + def optimize_configurations(self, data: List[PerformanceData]) -> List[str]: + """Find optimal configurations and return as NCCL config strings""" + # Group data by configuration key and size range + grouped_data = defaultdict(lambda: defaultdict(list)) + + for item in data: + config_key = item.get_config_key() + size_range = item.get_size_range_key(self.topology_size_ranges) + grouped_data[config_key][size_range].append(item) + + # Store optimal configurations before combining ranges + optimal_configs = [] + + for config_key, size_ranges_dict in grouped_data.items(): + collective, nodes, ranks, pipeOps, regBuff = config_key + + for (min_size, max_size), items in size_ranges_dict.items(): + if not items: + continue + + # Find the best performing configuration for this size range + best_item = items[0] + for item in items[1:]: + if self.is_better(item, best_item): + best_item = item + + # Store the optimal configuration with its range + optimal_configs.append({ + 'collective': collective, + 'min_size': min_size, + 'max_size': max_size, + 'algorithm': best_item.algorithm, + 'protocol': best_item.protocol, + 'channels': best_item.channels, + 'nodes': best_item.nodes, + 'ranks': best_item.ranks, + 'pipeOps': best_item.pipeOps, + 'regBuff': best_item.regBuff, + 'metric_value': getattr(best_item, self.optimization_metric) + }) + + # Combine sequential ranges with identical tunings + combined_configs = self.combine_sequential_ranges(optimal_configs) + + # Generate config strings + configs = [] + for config in combined_configs: + config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}" + configs.append(config_str) + + print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: " + f"{config['algorithm']}/{config['protocol']} channels={config['channels']} " + f"({self.optimization_metric}={config['metric_value']:.3f})") + + 
return configs + + def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]: + """Combine sequential ranges that have identical tuning parameters""" + if not configs: + return configs + + # Group by collective and topology (nodes, ranks) + topology_groups = defaultdict(list) + for config in configs: + topology_key = (config['collective'], config['nodes'], config['ranks'], + config['pipeOps'], config['regBuff']) + topology_groups[topology_key].append(config) + + combined_configs = [] + + for topology_key, topology_configs in topology_groups.items(): + # Sort by min_size to ensure proper ordering + topology_configs.sort(key=lambda x: x['min_size']) + + # Group by tuning parameters (algorithm, protocol, channels) + tuning_groups = defaultdict(list) + for config in topology_configs: + tuning_key = (config['algorithm'], config['protocol'], config['channels']) + tuning_groups[tuning_key].append(config) + + # For each tuning group, combine sequential ranges + for tuning_key, tuning_configs in tuning_groups.items(): + if not tuning_configs: + continue + + # Sort by min_size + tuning_configs.sort(key=lambda x: x['min_size']) + + # Combine sequential ranges + current_config = tuning_configs[0].copy() + + for next_config in tuning_configs[1:]: + # Check if ranges are adjacent or overlapping + if current_config['max_size'] + 1 >= next_config['min_size']: + # Extend the current range + current_config['max_size'] = max(current_config['max_size'], next_config['max_size']) + # Update metric value to the better one + if self.optimization_metric == 'bandwidth_gbps': + if next_config['metric_value'] > current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: # latency_us or default + if next_config['metric_value'] < current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: + # Gap between ranges, save current and start new one + combined_configs.append(current_config) + current_config = next_config.copy() + + # Add the last configuration + combined_configs.append(current_config) + + # Sort final configs by collective, nodes, ranks, then min_size + combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size'])) + + original_count = len(configs) + combined_count = len(combined_configs) + if combined_count < original_count: + print(f"Combined {original_count} ranges into {combined_count} ranges " + f"(reduced by {original_count - combined_count})") + + return combined_configs + + def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True): + """Append optimized configurations to NCCL tuner config file""" + try: + # Create directory if it doesn't exist + config_dir = os.path.dirname(config_file) + if config_dir and not os.path.exists(config_dir): + os.makedirs(config_dir) + print(f"Created directory: {config_dir}") + + # Check if file exists and has content + file_exists = os.path.exists(config_file) + add_separator = False + + if file_exists: + with open(config_file, 'r') as f: + content = f.read().strip() + add_separator = len(content) > 0 + print(f"Appending to existing file: {config_file}") + else: + print(f"Creating new file: {config_file}") + + with open(config_file, 'a') as f: + if add_separator: + f.write("\n\n") + + if add_header: + f.write(f"# Optimized configurations generated by optimize_config.py\n") + f.write(f"# Optimization metric: {self.optimization_metric}\n") + f.write(f"# Format: 
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n") + + for config in configs: + f.write(f"{config}\n") + + if file_exists: + print(f"Appended {len(configs)} optimized configurations to {config_file}") + else: + print(f"Created {config_file} with {len(configs)} optimized configurations") + + except PermissionError: + print(f"Error: Permission denied writing to {config_file}") + print("Try running with appropriate permissions or choose a different output location") + sys.exit(1) + except OSError as e: + print(f"Error: Cannot create/write to {config_file}: {e}") + print("Check that the path is valid and you have write permissions") + sys.exit(1) + except Exception as e: + print(f"Unexpected error writing to {config_file}: {e}") + sys.exit(1) + +def main(): + parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data") + parser.add_argument("csv_file", help="Input CSV file with performance data") + parser.add_argument("-o", "--output", default="nccl_tuner.conf", + help="Output NCCL tuner config file (default: nccl_tuner.conf)") + parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'], + default='latency_us', help="Optimization metric (default: latency_us)") + parser.add_argument("--no-header", action="store_true", + help="Don't add header comments to output file") + parser.add_argument("--dry-run", action="store_true", + help="Print configurations without writing to file") + parser.add_argument("--no-auto-ranges", action="store_true", + help="Disable automatic size range determination (use default ranges)") + parser.add_argument("--size-ranges", type=str, + help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'") + + args = parser.parse_args() + + optimizer = ConfigOptimizer(args.metric) + + # Handle size range configuration + if args.size_ranges: + # Parse custom size ranges + try: + ranges = [] + for range_str in args.size_ranges.split(','): + min_size, max_size = map(int, range_str.split('-')) + ranges.append((min_size, max_size)) + optimizer.set_size_ranges(ranges) + print(f"Using custom size ranges: {ranges}") + except ValueError: + print("Error: Invalid size ranges format. 
Use 'min1-max1,min2-max2,...'") + sys.exit(1) + elif args.no_auto_ranges: + # Disable auto-ranging + optimizer.auto_size_ranges = False + print("Using default hardcoded size ranges") + else: + # Auto-ranging is enabled by default - creates one bucket per unique size + optimizer.auto_size_ranges = True + print("Auto-ranging enabled: will create one bucket per unique size in data") + + # Load and optimize data + data = optimizer.load_data(args.csv_file) + if not data: + print("No valid data found in CSV file") + sys.exit(1) + + configs = optimizer.optimize_configurations(data) + + if args.dry_run: + print("\nGenerated configurations:") + for config in configs: + print(config) + else: + optimizer.append_to_config_file(configs, args.output, not args.no_header) + +if __name__ == "__main__": + main() diff --git a/ext-tuner/example/scripts/sample_performance_data.csv b/ext-tuner/example/scripts/sample_performance_data.csv new file mode 100644 index 0000000..7b96403 --- /dev/null +++ b/ext-tuner/example/scripts/sample_performance_data.csv @@ -0,0 +1,24 @@ +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5 +allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8 +allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2 +allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1 +allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3 +allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4 +allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2 +allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1 +allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7 +allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1 +broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2 +broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1 +broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5 +broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2 +broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1 +broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3 +allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4 +allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6 +allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1 +allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3 +allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2 +allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1 +allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3 diff --git a/ext-tuner/example/test/Makefile b/ext-tuner/example/test/Makefile new file mode 100644 index 0000000..d675cbe --- /dev/null +++ b/ext-tuner/example/test/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for NCCL Tuner Plugin Unit Tests +# + +CC := gcc +CFLAGS := -Wall -Wextra -g -std=c99 -fPIC +INC := -I. 
-I../nccl
+TARGET := test_plugin
+SOURCES := test_plugin.c
+
+# Default target
+all: $(TARGET)
+
+# Build the test executable
+$(TARGET): $(SOURCES)
+	$(CC) $(CFLAGS) $(INC) -o $(TARGET) $(SOURCES)
+
+# Run the tests
+test: $(TARGET)
+	./$(TARGET) $(TEST_CASE)
+
+# Run tests with verbose output
+test-verbose: $(TARGET)
+	NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)
+
+# Clean build artifacts
+clean:
+	rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf
+
+.PHONY: all test test-verbose clean
diff --git a/ext-tuner/example/test/README.md b/ext-tuner/example/test/README.md
new file mode 100644
index 0000000..8203c65
--- /dev/null
+++ b/ext-tuner/example/test/README.md
@@ -0,0 +1,205 @@
+# NCCL Tuner Plugin Unit Tests
+
+This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
+
+## Test Structure
+
+```
+test/
+├── test_plugin.c   # Main unit test file
+├── Makefile        # Build system for tests
+└── README.md       # This file
+```
+
+## Building and Running Tests
+
+### Quick Start
+
+```bash
+# Build and run all tests
+make test
+
+# Or step by step
+make            # Build test executable
+./test_plugin   # Run tests
+```
+
+### Advanced Testing
+
+```bash
+# Run with memory leak detection (requires valgrind)
+make test-memory
+
+# Run with verbose logging
+make test-verbose
+
+# Generate code coverage report (requires gcov)
+make coverage
+
+# Create sample test configuration files
+make test-configs
+```
+
+## Test Coverage
+
+The unit tests cover the following functionality:
+
+### 1. **Plugin Initialization (`test_plugin_init`)**
+- Tests successful plugin initialization
+- Verifies context allocation
+- Tests cleanup on destroy
+
+### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
+- Valid CSV format parsing
+- Comment and empty line handling
+- Invalid format graceful handling
+- Environment variable configuration
+
+### 3. **Collective Type Matching (`test_collective_matching`)**
+- Correct matching of allreduce, broadcast, etc.
+- Algorithm/protocol selection
+- Channel configuration
+
+### 4. **Size Range Matching (`test_size_matching`)**
+- Small, medium, large message size handling
+- Proper range boundary checking
+- Multiple size-based configurations
+
+### 5. **Topology Matching (`test_topology_matching`)**
+- Single-node vs multi-node configurations
+- Exact nNodes/nRanks matching
+- Wildcard matching (-1 values)
+
+### 6. **Default Channels (`test_default_channels`)**
+- Proper handling of -1 channel specification
+- Preservation of NCCL default behavior
+
+### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
+- Configurations based on regBuff parameter
+- Registered vs non-registered buffer handling
+- Backward compatibility with configs missing regBuff
+
+### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
+- Configurations based on numPipeOps parameter
+- Single vs multiple pipeline operation handling
+- Backward compatibility with configs missing numPipeOps
+
+### 9. **Fallback Behavior (`test_no_match_fallback`)**
+- Default behavior when no config matches
+- Ring/Simple algorithm fallback
+
+## Test Output
+
+Successful test run:
+```
+Running NCCL Tuner Plugin Unit Tests
+=====================================
+PASS: test_plugin_init
+PASS: test_config_parsing_valid
+PASS: test_config_parsing_invalid
+PASS: test_collective_matching
+PASS: test_size_matching
+PASS: test_topology_matching
+PASS: test_default_channels
+PASS: test_regbuff_matching
+PASS: test_pipeops_matching
+PASS: test_no_match_fallback
+PASS: test_large_config
+PASS: test_very_large_config_stress
+PASS: test_empty_config
+
+=====================================
+Test Results: 13/13 tests passed
+All tests PASSED!
+```
+
+Failed test example:
+```
+FAIL: test_collective_matching - Tree/Simple should have low cost
+Test Results: 12/13 tests passed
+Some tests FAILED!
+```
+
+## Mock NCCL Implementation
+
+The tests use the actual NCCL header files from the `../nccl/` directory:
+
+- `tuner.h` - Complete NCCL tuner interface and type definitions
+- `common.h` - Common NCCL types and logging functions
+- `err.h` - NCCL error codes
+
+This allows testing with the real NCCL interface definitions while still being able to run the tests without a full NCCL library installation.
+
+## Integration with CI/CD
+
+```bash
+# Install tests for CI/CD pipeline
+make install-test
+
+# Run as part of automated testing
+make test && echo "Tests passed" || echo "Tests failed"
+```
+
+## Memory Testing
+
+The tests can be run with valgrind for memory leak detection:
+
+```bash
+make test-memory
+```
+
+This will detect:
+- Memory leaks
+- Invalid memory access
+- Use of uninitialized memory
+
+## Code Coverage
+
+Generate code coverage reports to ensure comprehensive testing:
+
+```bash
+make coverage
+# Creates test_plugin.c.gcov with line-by-line coverage
+```
+
+## Adding New Tests
+
+To add a new test:
+
+1. Create a new test function in `test_plugin.c`:
+```c
+int test_new_feature(void) {
+  // Test setup
+  TEST_ASSERT(condition, "description");
+  // Test cleanup
+  TEST_PASS();
+}
+```
+
+2. Register the test in the `test_cases[]` table (before the `{NULL, NULL, NULL}` end marker) so it runs with `make test` and can be selected by name:
+```c
+{"new-feature", test_new_feature, "New feature description"},
+```
+
+3. Rebuild and run:
+```bash
+make test
+```
+
+## Debugging Tests
+
+For debugging failed tests:
+
+```bash
+# Compile with debug symbols
+make CFLAGS="-g -O0 -DDEBUG"
+
+# Run with gdb
+gdb ./test_plugin
+```
+
+## Cleaning Up
+
+```bash
+# Remove all build artifacts and temporary files
+make clean
+```
+
+This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
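+
+## Appendix: Example Tuner Configuration
+
+For quick reference, the tests exercise config lines in the CSV layout
+`collective,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`,
+where `-1` means "match anything" (or, for channels, "keep NCCL's default"). The snippet below is
+only an illustrative sketch assembled from values used in the tests above, not a tuned recommendation:
+
+```
+# allreduce up to 64KB on a single node: tree/simple with 2 channels
+allreduce,0,65536,tree,simple,2,1,-1,-1,-1
+# broadcast up to 32KB on any topology: ring/ll128, keep NCCL's default channel count
+broadcast,0,32768,ring,ll128,-1,-1,-1,-1,-1
+```
+
+Individual tests can also be selected by name via the test registry in `test_plugin.c`, for example:
+
+```bash
+# Run only the collective and size matching tests
+./test_plugin collective size
+```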
diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c new file mode 100644 index 0000000..28897c4 --- /dev/null +++ b/ext-tuner/example/test/test_plugin.c @@ -0,0 +1,856 @@ +/************************************************************************* + * Unit tests for NCCL Tuner Plugin + ************************************************************************/ + +#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions + +#include +#include +#include +#include +#include +#include +#include + + +// Include NCCL tuner header (which includes common.h and err.h) +#include "tuner.h" + +// Include plugin source for testing +#include "../plugin.c" + +// Test framework macros +#define TEST_ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + printf("FAIL: %s - %s\n", __func__, message); \ + return 0; \ + } \ + } while(0) + +#define TEST_PASS() \ + do { \ + printf("PASS: %s\n", __func__); \ + return 1; \ + } while(0) + +// Global test state +static int test_log_count = 0; + +// Mock logger function +void mock_logger(ncclDebugLogLevel level, unsigned long flags, + const char* file, int line, const char* fmt, ...) { + (void)flags; // Suppress unused parameter warning + test_log_count++; + + // Check if we should print based on NCCL_DEBUG level + const char* debug_level = getenv("NCCL_DEBUG"); + int should_print = 0; + + if (debug_level) { + if (strcmp(debug_level, "TRACE") == 0) { + should_print = 1; // Print everything + } else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) { + should_print = 1; // Print INFO and below + } else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) { + should_print = 1; // Print WARN and below + } + } + + if (!should_print) return; + + // Convert log level to string + const char* level_str; + switch(level) { + case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_VERSION: level_str = "VERSION"; break; + case NCCL_LOG_WARN: level_str = "WARN"; break; + case NCCL_LOG_INFO: level_str = "INFO"; break; + case NCCL_LOG_ABORT: level_str = "ABORT"; break; + case NCCL_LOG_TRACE: level_str = "TRACE"; break; + default: level_str = "UNKNOWN"; break; + } + + // Print log header + printf("[TUNER:%s:%s:%d] ", level_str, file, line); + + // Print formatted message + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} + +// Helper function to create test config file +void create_test_config(const char* filename, const char* content) { + FILE* f = fopen(filename, "w"); + if (f) { + fprintf(f, "%s", content); + fclose(f); + } +} + +// Test 1: Plugin initialization +int test_plugin_init() { + void* context = NULL; + + // Test successful initialization + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Clean up + pluginDestroy(context); + TEST_PASS(); +} + +// Test 2: Configuration file parsing - valid CSV +int test_config_parsing_valid() { + const char* test_config = + "# Test configuration\n" + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n" + "# Comment line\n" + "\n" // Empty line + "reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n"; + + create_test_config("test_valid.conf", test_config); + + // Set environment variable to use our test config + setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1); + + void* context = NULL; + ncclResult_t result = 
pluginInit(16, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed"); + + // Clean up + pluginDestroy(context); + unlink("test_valid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 3: Configuration file parsing - invalid CSV +int test_config_parsing_invalid() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n" + "invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n" + "broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers + + create_test_config("test_invalid.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 1, mock_logger, &context); + // Should still succeed but with no valid configs loaded + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config"); + + // Clean up + pluginDestroy(context); + unlink("test_invalid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 4: Collective type matching +int test_collective_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n"; + + create_test_config("test_match.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + // Create mock cost table + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + + // Test allreduce matching (should match first config) + ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 8, "Should set 8 channels"); + + // Test broadcast matching (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Reset costs + } + } + + result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 4, "Should set 4 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_match.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 5: Size range matching +int test_size_matching() { + const char* test_config = + "allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n" + 
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n" + "allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n"; + + create_test_config("test_size.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + int nChannels = 1; + + pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels"); + + // Test medium message (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels"); + + // Test large message (should match third config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_size.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 6: Topology matching +int test_topology_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only + "allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology + + create_test_config("test_topo.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1); + + // Test with single node setup + void* context1 = NULL; + pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + 
cost_table[i][j] = 1.0; + } + } + + int nChannels; + pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config"); + TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels"); + + pluginDestroy(context1); + + // Test with 4 nodes, 32 ranks setup + void* context2 = NULL; + pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes + + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config"); + TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels"); + + // Clean up + unlink("test_topo.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 7: Default channels behavior (-1) +int test_default_channels() { + const char* test_config = + "allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels + + create_test_config("test_default.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels = 99; // Set to known value + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol"); + TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1"); + + // Clean up + pluginDestroy(context); + unlink("test_default.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 8: regBuff matching +int test_regbuff_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only + "allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible) + + create_test_config("test_regbuff.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test registered buffer (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 1, &nChannels); // regBuff = 1 (registered) + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels"); + + // Test non-registered 
buffer (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); // regBuff = 0 (non-registered) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels"); + + // Test backward compatibility - config without regBuff should match any regBuff value + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + // First try with regBuff=2 (unusual value, should match third config) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 2, &nChannels); // regBuff = 2 (only third config should match) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_regbuff.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 9: numPipeOps matching +int test_pipeops_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op + "allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible) + + create_test_config("test_pipeops.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test single pipeline op (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels"); + + // Test multiple pipeline ops (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels"); + + // Test different number of pipeline ops (should match third config - backward compatible) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 
0.0, "Any pipeOps: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_pipeops.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 10: No matching configuration (fallback behavior) +int test_no_match_fallback() { + const char* test_config = + "broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config + + create_test_config("test_fallback.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + // Try allreduce (should not match, use fallback) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified"); + TEST_ASSERT(nChannels == 1, "Should use default channels"); + + // Clean up + pluginDestroy(context); + unlink("test_fallback.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 11: Large configuration files (testing dynamic allocation) +int test_large_config() { + const char* large_config_file = "test_large.conf"; + + // Create a large configuration file with many entries + // This tests the dynamic allocation functionality + FILE* f = fopen(large_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create large config file"); + + // Write header comment + fprintf(f, "# Large configuration file for testing dynamic allocation\n"); + fprintf(f, "# This file contains many configurations to test memory allocation\n"); + + // Generate a large number of configurations (much more than the old MAX_CONFIGS=100) + const int num_configs = 500; // 5x the old static limit + const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"}; + const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"}; + const char* protocols[] = {"simple", "ll", "ll128"}; + + for (int i = 0; i < num_configs; i++) { + // Vary the configurations to create realistic test data + const char* coll = collectives[i % 5]; + const char* algo = algorithms[i % 4]; + const char* proto = protocols[i % 3]; + + size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB + size_t max_bytes = min_bytes + 65536; // 64KB range + int channels = (i % 8) + 1; // 1-8 channels + int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes + int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks + int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps + int regBuff = (i % 3) == 0 ? 
-1 : (i % 2); // Mix of -1, 0, 1 + + fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n", + coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff); + } + + fclose(f); + + // Set environment to use our large config file + setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1); + + // Initialize plugin with large config + void* context = NULL; + ncclResult_t result = pluginInit(16, 4, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Verify that configurations were loaded + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file"); + TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size"); + TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated"); + + // Test that we can access configurations throughout the array + // (This would have failed with the old static MAX_CONFIGS=100 limit) + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + // Basic sanity checks on the loaded configurations + TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce, + "Collective type should be valid"); + TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes"); + TEST_ASSERT(config->nChannels > 0, "nChannels should be positive"); + } + + // Test specific configuration access at various indices + // Index 0 (first config) + TuningConfig* first_config = &ctx->configs[0]; + TEST_ASSERT(first_config != NULL, "First config should be accessible"); + + // Index in middle + TuningConfig* mid_config = &ctx->configs[num_configs / 2]; + TEST_ASSERT(mid_config != NULL, "Middle config should be accessible"); + + // Index near end (this would have crashed with static array of 100) + TuningConfig* late_config = &ctx->configs[num_configs - 1]; + TEST_ASSERT(late_config != NULL, "Last config should be accessible"); + + // Test memory allocation size - verify we didn't over-allocate + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory allocated for %d configurations (%zu bytes total)", + ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig)); + + // Test that the plugin can still find matching configurations from the large set + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + // Try to find a matching configuration - should work with large config set + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set"); + + // Clean up + pluginDestroy(context); + unlink(large_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 12: Very large configuration stress test +int test_very_large_config_stress() { + const char* stress_config_file = "test_stress.conf"; + + // Create an even larger configuration file to stress test the 
implementation + FILE* f = fopen(stress_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create stress test config file"); + + fprintf(f, "# Stress test configuration with very large number of entries\n"); + + // Generate an extremely large number of configurations + const int stress_configs = 2000; // 20x the old static limit + + for (int i = 0; i < stress_configs; i++) { + // Create varied but valid configurations + fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n", + i * 512, (i * 512) + 1024); + } + + fclose(f); + + setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1); + + // Test initialization with stress config + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations"); + TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated"); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Stress test - loaded %d configurations successfully", stress_configs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory usage: %zu bytes for configuration array", + stress_configs * sizeof(TuningConfig)); + + // Verify we can access configurations throughout the entire range + for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config + TuningConfig* config = &ctx->configs[i]; + TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type"); + TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes"); + } + + // Clean up + pluginDestroy(context); + unlink(stress_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 13: Edge case - empty config file +int test_empty_config() { + const char* empty_config_file = "test_empty.conf"; + + // Create empty config file (only comments) + create_test_config(empty_config_file, + "# Empty configuration file\n" + "# No actual configurations\n" + "\n" + "\n"); + + setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations"); + TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations"); + TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config"); + + // Test that plugin still works with no configurations (fallback behavior) + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config"); + + // Clean up + pluginDestroy(context); + unlink(empty_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test runner function pointer type +typedef int (*TestFunction)(void); + +// Test registry +typedef struct { + const char* name; + TestFunction func; + 
const char* description; +} TestCase; + +// All available tests +TestCase test_cases[] = { + {"init", test_plugin_init, "Plugin initialization"}, + {"config-valid", test_config_parsing_valid, "Valid configuration parsing"}, + {"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"}, + {"collective", test_collective_matching, "Collective type matching"}, + {"size", test_size_matching, "Size range matching"}, + {"topology", test_topology_matching, "Topology matching"}, + {"channels", test_default_channels, "Default channels behavior"}, + {"regbuff", test_regbuff_matching, "Registered buffer matching"}, + {"pipeops", test_pipeops_matching, "Pipeline operations matching"}, + {"fallback", test_no_match_fallback, "Fallback behavior"}, + {"large-config", test_large_config, "Large configuration files (dynamic allocation)"}, + {"stress-config", test_very_large_config_stress, "Very large configuration stress test"}, + {"empty-config", test_empty_config, "Empty configuration file handling"}, + {NULL, NULL, NULL} // End marker +}; + +// Show help/usage information +void show_help(const char* program_name) { + printf("Usage: %s [test_name ...]\n\n", program_name); + printf("Available tests:\n"); + for (int i = 0; test_cases[i].name != NULL; i++) { + printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description); + } + printf("\nExamples:\n"); + printf(" %s # Run all tests\n", program_name); + printf(" %s init # Run only initialization test\n", program_name); + printf(" %s init collective # Run initialization and collective tests\n", program_name); + printf(" %s --help # Show this help\n", program_name); +} + +// Find test by name +TestFunction find_test(const char* name) { + for (int i = 0; test_cases[i].name != NULL; i++) { + if (strcmp(test_cases[i].name, name) == 0) { + return test_cases[i].func; + } + } + return NULL; +} + +// Main test runner +int main(int argc, char* argv[]) { + int passed = 0, total = 0; + + // Check for help + if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) { + show_help(argv[0]); + return 0; + } + + printf("Running NCCL Tuner Plugin Unit Tests\n"); + printf("=====================================\n"); + + if (argc == 1) { + // No arguments - run all tests + for (int i = 0; test_cases[i].name != NULL; i++) { + total++; + passed += test_cases[i].func(); + } + } else { + // Run specific tests + for (int arg = 1; arg < argc; arg++) { + TestFunction test_func = find_test(argv[arg]); + if (test_func) { + total++; + passed += test_func(); + } else { + printf("ERROR: Unknown test '%s'\n", argv[arg]); + printf("Use --help to see available tests\n"); + return 1; + } + } + } + + printf("\n=====================================\n"); + printf("Test Results: %d/%d tests passed\n", passed, total); + + if (passed == total) { + printf("All tests PASSED!\n"); + return 0; + } else { + printf("Some tests FAILED!\n"); + return 1; + } +} diff --git a/makefiles/common.mk b/makefiles/common.mk index 8a35a8f..6ba9bbf 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -40,10 +40,12 @@ ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 +CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 -CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ - -gencode=arch=compute_120,code=sm_120 +CUDA12_8_GENCODE = 
-gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 +CUDA13_GENCODE = -gencode=arch=compute_110,code=sm_110 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 @@ -53,10 +55,10 @@ CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) # Prior to SM75 is deprecated from CUDA13.0 onwards - NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA10_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) diff --git a/makefiles/version.mk b/makefiles/version.mk index f41e7a7..013e972 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/device/Makefile b/src/device/Makefile index df58489..67ab176 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -36,9 +36,8 @@ define COMPILE $(call COMPILE$(or $3,$(suffix $2)),$1,$2) endef -ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1) - NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \ - -gencode=arch=compute_120a,code=sm_120a +ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12090))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100f,code=sm_100f else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1) NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a else diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index 0d054bb..d36dfe5 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -1009,7 +1009,7 @@ struct Apply_LoadMultimem { DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4) #endif - #if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 + #if NCCL_CUDA_ARCH_SPECIFIC == 1000 || NCCL_CUDA_ARCH_SPECIFIC == 1010 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index f630ff0..8fcb9a4 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -108,7 +108,7 @@ def required_cuda(k): if k.algo in ldmc_algos: cudart = 12070 arch = None - specific_sms = [100, 120] + specific_sms = ["100a", "101a", "100f", "101f", "120a", "121a"] return 
(cudart, arch, specific_sms) ################################################################################ @@ -145,7 +145,7 @@ def kernel_conds(k): if not specific_sms: arch_cond = "__CUDA_ARCH__ >= %d"%arch else: - arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms]) + arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_%sSPECIFIC==%d"%("FAMILY_" if sm[-1] == "f" else "", 10*int(sm.replace('a', '').replace('f', ''))) for sm in specific_sms]) return cudart_cond, arch_cond def instantiate(k): diff --git a/src/graph/paths.cc b/src/graph/paths.cc index bc5cc75..4b44abd 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -175,6 +175,13 @@ ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu return ncclSuccess; } +static int mergePathType(int type0, int type1){ + int max = std::max(type0,type1); + int min = std::min(type0,type1); + if(max == PATH_PHB && min == PATH_C2C) return PATH_P2C; + else return max; +} + static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; @@ -187,7 +194,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); + srcNode->paths[t2][i2].type = mergePathType(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; @@ -674,9 +681,9 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm int c; NCCLCHECK(ncclGetLocalCpu(system, g, &c)); if (c == -1) continue; - if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) { - gpuNode->paths[NET][n].type = PATH_P2C; - netNode->paths[GPU][g].type = PATH_P2C; + if (mergePathType(gpuNode->paths[CPU][c].type, netNode->paths[CPU][c].type) == PATH_P2C) { + gpuNode->paths[NET][n].type = std::min(PATH_P2C, gpuNode->paths[NET][n].type); + netNode->paths[GPU][g].type = std::min(PATH_P2C, netNode->paths[GPU][g].type); } } } @@ -695,16 +702,15 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (/* (1) is either connected to the NIC with PXB*/ - (peerNode->paths[NET][n].type <= PATH_PXB || - /* or with P2C and PxN over C2C is enabled */ - (ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) && + int pxnType = ncclParamPxnC2c() ? PATH_P2C : PATH_PXB; + if (/* (1) is connected to the NIC with PxN type*/ + peerNode->paths[NET][n].type <= pxnType && /* and (2) is connected to us through NVLink */ peerNode->paths[GPU][g].type <= PATH_NVL && /* and (3) is on the same node as us */ NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB)) + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType)) // We can use that GPU as relay to communicate with that NIC. 
// Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) @@ -725,6 +731,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } } + + // Pre-compute NET local gpus to accelerate search + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &net->net.localGpu)); + } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9d8ad3f..67e6009 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -437,6 +437,65 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop return ncclSuccess; } +// Add the preferred NICs ordered by GPU first +static ncclResult_t ncclTopoPrefNetsGpuFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + const int nGpus = (gpu == -1) ? system->nodes[GPU].count : 1; + int gpuCount = nGpus; + int gpuIds[NCCL_TOPO_MAX_NODES] = {gpu}; + int firstNets[NCCL_TOPO_MAX_NODES]; + if (gpu == -1) + for (int g = 0; g < nGpus; g++) gpuIds[g] = g; + + for (int c = 0; c < MAXCHANNELS; c++) { + for (int g = 0; g < nGpus; g++) { + if (gpuIds[g] == -1) continue; + int localNet; + int64_t netId; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + gpuIds[g]; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &localNet)); + // store the first net found for each GPU in case of duplicates + if(c == 0) firstNets[g] = localNet; + // if the NET has already been returned for channel 0, that GPU is done + if (c > 0 && firstNets[g] == localNet) { + gpuIds[g] = -1; + gpuCount--; + continue; + } + // only add it to the list if it doesn't already exist + int found = 0; + while (found < (*netCount) && nets[found] != localNet) found++; + if (found == (*netCount)) nets[(*netCount)++] = localNet; + } + if (gpuCount == 0) break; + } + return ncclSuccess; +} + +// Add the preferred NICs ordered by channels first +static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + for (int g = 0; g < system->nodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + int localNetCount = 0, localNets[MAXCHANNELS]; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g; + for (int c = 0; c < MAXCHANNELS; c++) { + int64_t netId; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets + localNetCount)); + if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; + localNetCount++; + } + // Append NICs to list + for (int i = 0; i < localNetCount; i++) { + int n = localNets[i]; + int found = 0; + while (found < (*netCount) && nets[found] != n) found++; + if (found == (*netCount)) nets[(*netCount)++] = n; + } + } + return ncclSuccess; +} + // Build a sorted list of the NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu @@ -445,39 +504,25 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. 
- +NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; - int localNetCount; - int localNets[MAXCHANNELS]; - // First add the preferred NICs - for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; - localNetCount = 0; - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - for (int c = 0; cgpu.rank, c, &netId, NULL)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); - if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; - localNetCount++; - } - // Append NICs to list - for (int i=0; inHosts > 1 && ncclParamScatterEnable()) { + // For MNNVL systems, we sort the devices by GPU first, then by channel + NCCLCHECK(ncclTopoPrefNetsGpuFirst(system, gpu, nets, &netCount)); + } else { + // For other systems, we sort the devices by channel first, then by GPU + NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { - for (int g=0; gnodes[GPU].count; g++) { + for (int g = 0; g < system->nodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; - localNetCount = 0; + int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count && npattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { - int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + int gpu = net->net.localGpu; if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -643,13 +687,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } else { - if (graph->nChannels > 0) { + if (graph->nChannels > 0 && graph->sameChannels == 1) { // Try to replay the last channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); - } - if (graph->nChannels == 0 || graph->sameChannels == 0) { + } else { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; @@ -658,11 +701,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs + int localGpu = net->net.localGpu; + if (localGpu != -1) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpu)); + } int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType; NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType)); // if no GPUs are connected, skip this net if (pathType == PATH_DIS) continue; for (int g = 0; g < localGpuCount; ++g) { + if (localGpus[g] == localGpu) continue; // We already tried this one NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g])); } } @@ -749,8 +797,8 @@ struct kvDict kvDictLinkType[] = { { "NVB", PATH_NVB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", 
PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -798,8 +846,10 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); - if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; + NCCLCHECK(xmlGetAttr(xmlGraph, "latencyinter", &str)); + if (!str) INFO(NCCL_GRAPH, "latencyinter not found in graph, using 0.0"); + graph->latencyInter = str ? strtof(str, NULL) : 0.0; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); @@ -910,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; -float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1136,8 +1186,12 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); } for (int i=0; iintra[ngpus*c+i]); + int g; + ncclTopoRankToIndex(system, graph->intra[ngpus * c + i], &g, true); + int64_t topoId = system->nodes[GPU].nodes[g].id; + sprintf(line + offset, " %s/%lx-%lx", topoNodeTypeStr[GPU], NCCL_TOPO_ID_SYSTEM_ID(topoId), NCCL_TOPO_ID_LOCAL_ID(topoId)); offset = strlen(line); + if (graph->id == 3) break; // NVLS graphs only use the first GPU } if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9fe81bb..8fdf54e 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -21,7 +21,7 @@ const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "P2C", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -677,7 +677,14 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } - for (int systemId=0; systemIdnHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId; + + int systemId = 0; + while (systemId < system->nHosts && system->hostHashes[systemId] != localHostHash) systemId++; + system->systemId = systemId; + if(systemId == system->nHosts){ + 
WARN("localHostHash = 0x%lx not found in the list of system hostHashes",localHostHash); + return ncclInvalidArgument; + } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); @@ -1143,8 +1150,8 @@ struct kvDict nicPathKvList[] = { { "PORT", PATH_PORT }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -1421,7 +1428,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } // Only update our topo tracking structure if we aren't dumping (separate steps) - if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, getHostHash()), ret, fail); exit: if (!comm->MNNVL && localRanks) free(localRanks); diff --git a/src/graph/topo.h b/src/graph/topo.h index 07ef5e1..9b49c02 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,7 +18,7 @@ #define SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 -#define SM100_NVLINK_BW 40.0 +#define SM100_NVLINK_BW 40.1 #define PCI_BW 12.0 // PCI Gen3 x16 #define AMD_BW 16.0 #define BDW_QPI_BW 6.0 @@ -76,11 +76,11 @@ extern const char* topoLinkTypeStr[]; // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 5 -// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. -#define PATH_PXN 6 - // Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC -#define PATH_P2C 7 +#define PATH_P2C 6 + +// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. +#define PATH_PXN 7 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 8 @@ -143,6 +143,7 @@ struct ncclTopoNode { int gdrSupport; int collSupport; int maxChannels; + int localGpu; }net; struct { int arch; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 64dc5cf..8e99f18 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -455,9 +455,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? PATH_P2C : PATH_PXN))); + if (ncclParamLl128C2c() && minCompCap >= 90) { + // Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN. + pEnable &= (graphs[a]->typeInter <= PATH_PXN); + } else { + // Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links. + pEnable &= (graphs[a]->typeInter <= PATH_PXB); + if (!ncclParamLl128C2c() && minCompCap >= 90) + INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). 
diff --git a/src/init.cc b/src/init.cc
index 83764a8..2a57c46 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1507,7 +1507,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
   int minCTAsEnv;
   int maxCTAsEnv;
   int splitShareEnv;
-  int collnetEnableEnv;
+  const char* collnetEnableEnv;
   int ctaPolicyEnv;
   int shrinkShareEnv;
   int nvlsCTAsEnv;
@@ -1561,9 +1561,15 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
     comm->config.shrinkShare = shrinkShareEnv;
   }
 
-  collnetEnableEnv = ncclParamCollnetEnable();
-  if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) {
-    comm->config.collnetEnable = collnetEnableEnv;
+  // NCCL_COLLNET_ENABLE needs to be reloaded each time for comm init
+  // since users might change the env on the fly to enable/disable collnet
+  collnetEnableEnv = ncclGetEnv("NCCL_COLLNET_ENABLE");
+  if (collnetEnableEnv != NULL) {
+    int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0);
+    if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) {
+      comm->config.collnetEnable = collnetEnableInt;
+      INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt);
+    }
   }
 
   ctaPolicyEnv = ncclParamCtaPolicy();
diff --git a/src/misc/mlx5dvsymbols.cc b/src/misc/mlx5dvsymbols.cc
index 5bb4109..47cc4eb 100644
--- a/src/misc/mlx5dvsymbols.cc
+++ b/src/misc/mlx5dvsymbols.cc
@@ -52,6 +52,9 @@ ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
 #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
   cast = (void**)&funcptr; \
   *cast = dlvsym(handle, symbol, version); \
+  if (*cast == NULL) { \
+    INFO(NCCL_NET, "dlvsym failed on %s - %s version %s", symbol, dlerror(), version); \
+  } \
 } while (0)
 
 LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported);
diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc
index 0adb4b1..1766f41 100644
--- a/src/misc/strongstream.cc
+++ b/src/misc/strongstream.cc
@@ -21,7 +21,6 @@ struct ncclStrongStreamCapture {
   cudaGraph_t graph;
   unsigned long long graphId;
   cudaStream_t captureStream;
-  cudaGraphNode_t lastRecord;
   void* acquiredBy;
 };
 
@@ -216,7 +215,6 @@ ncclResult_t ncclStrongStreamAcquire(
       CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
     }
     cap->graphId = graph.graphId;
-    cap->lastRecord = nullptr;
     cap->acquiredBy = localThreadId();
     // Push to capturing list.
     cap->next = ss->captureHead;
@@ -296,16 +294,6 @@ ncclResult_t ncclStrongStreamRelease(
     cudaGraphNode_t recordNode;
     CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent));
 
-    // Make this record order after previous record on this stream.
-    if (cap->lastRecord != nullptr) {
-      #if CUDART_VERSION >= 13000
-        CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1));
-      #else
-        CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1));
-      #endif
-    }
-    cap->lastRecord = recordNode;
-
     // Get current nodes from work stream so we can add them as dependencies.
     cudaStreamCaptureStatus status;
     cudaGraphNode_t const* nodes;
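The strongstream.cc hunk that follows drops the explicit lastRecord chaining and instead tells the capturing stream that everything captured from now on must depend on the new event-record node. A minimal sketch of that CUDA call pattern is shown below (CUDA < 13 entry point only, error handling omitted); the helper name is hypothetical, and the real code also folds in the nodes gathered from the work stream.

/* Illustration only: make all future captured work depend on one graph node. */
#include <cuda_runtime.h>

static void pinFutureCaptureAfter(cudaStream_t captureStream, cudaGraph_t graph, cudaEvent_t ev) {
  cudaGraphNode_t recordNode;
  /* Add an event-record node to the graph being captured... */
  cudaGraphAddEventRecordNode(&recordNode, graph, /*deps=*/NULL, 0, ev);
  /* ...then make it the sole dependency of everything captured on this stream from now on. */
  cudaStreamUpdateCaptureDependencies(captureStream, &recordNode, 1, cudaStreamSetCaptureDependencies);
}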
@@ -338,6 +326,22 @@ ncclResult_t ncclStrongStreamRelease(
       }
     }
 
+    // Make every future operation captured on cap->captureStream depend on 'recordNode'.
+    #if CUDART_VERSION >= 13000
+      CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(
+        cap->captureStream,
+        &recordNode,          /* dependencies */
+        /*edges =*/ nullptr,  /* no edge annotations */
+        1,                    /* count */
+        cudaStreamSetCaptureDependencies));
+    #else
+      CUDACHECK(cudaStreamUpdateCaptureDependencies(
+        cap->captureStream,
+        &recordNode,
+        1,
+        cudaStreamSetCaptureDependencies));
+    #endif
+
     if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
       WARN("%s", launchRaceFatalMsg);
       return ncclInvalidUsage;
diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc
index a9c1d0d..64c97be 100644
--- a/src/plugin/plugin_open.cc
+++ b/src/plugin/plugin_open.cc
@@ -61,20 +61,20 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
   char eNoEntNameList[PATH_MAX] = { 0 };
 
   if (libName && strlen(libName)) {
-    // match names that start with 'lib' and end with '.so'
-    if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
-      snprintf(libName_, MAX_STR_LEN, "%s", libName);
-      libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
-      if (libHandles[type]) {
-        INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
-        return libHandles[type];
-      }
-      if (openErr == ENOENT) {
-        appendNameToList(eNoEntNameList, &len, libName_);
-      } else {
-        INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
-      }
+    snprintf(libName_, MAX_STR_LEN, "%s", libName);
+    libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
+    if (libHandles[type]) {
+      INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
+      return libHandles[type];
+    }
+    if (openErr == ENOENT) {
+      appendNameToList(eNoEntNameList, &len, libName_);
     } else {
+      INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
+    }
+
+    // libName can't be a relative or absolute path (start with '.' or contain any '/'). It can't be a library name either (start with 'lib' or end with '.so')
+    if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) {
       snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
       libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
       if (libHandles[type]) {
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 19a505e..40897d9 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -494,7 +494,9 @@ static int ibvSpeeds[] = {
   14000, /* FDR */
   25000, /* EDR */
   50000, /* HDR */
-  100000 /* NDR */ };
+  100000, /* NDR */
+  200000 /* XDR */
+};
 
 static int firstBitSet(int val, int max) {
   int i = 0;
@@ -654,7 +656,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
       ibProvider = IB_PROVIDER_MLX5;
       snprintf(dataDirectDevicePath, PATH_MAX, "/sys");
       if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) {
-        INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
+        INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
         if(ncclParamIbDataDirect()) dataDirectSupported = 1;
       }
     }
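As a usage note on the plugin_open.cc change above: a plugin name set through an environment variable such as NCCL_NET_PLUGIN is now tried verbatim first (so full paths and full library names keep working), and only a bare name with no '/' that is not already a lib*.so name is then expanded with the per-plugin prefix. The sketch below restates that resolution order; candidateNames, looksLikeLibName, and the fixed buffer size are illustrative, not the literal NCCL helpers.

/* Illustration only: the two candidate file names tried for a plugin. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool looksLikeLibName(const char* name) {
  size_t n = strlen(name);
  return n >= strlen("libX.so") &&
         strncmp(name, "lib", 3) == 0 &&
         strcmp(name + n - 3, ".so") == 0;
}

static void candidateNames(const char* prefix, const char* name, char first[256], char second[256]) {
  /* First attempt: the name exactly as given (a path, or an explicit lib*.so name). */
  snprintf(first, 256, "%s", name);
  /* Second attempt: only for bare names, e.g. prefix "libnccl-net" + "foo" -> "libnccl-net-foo.so". */
  second[0] = '\0';
  if (strchr(name, '/') == NULL && !looksLikeLibName(name))
    snprintf(second, 256, "%s-%s.so", prefix, name);
}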