Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
3ea7eedf3b |
@ -3,15 +3,20 @@
|
|||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
NCCL_HOME:=../../build/
|
.DEFAULT_GOAL: build
|
||||||
CUDA_HOME:=/usr/local/cuda
|
include ../../makefiles/common.mk
|
||||||
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
|
SRCDIR ?= $(abspath ../..)
|
||||||
PLUGIN_SO:=libnccl-net.so
|
BUILDDIR ?= .
|
||||||
|
NCCLDIR := $(BUILDDIR)
|
||||||
|
|
||||||
default: $(PLUGIN_SO)
|
SRC_FILES := $(wildcard *.c)
|
||||||
|
|
||||||
$(PLUGIN_SO): plugin.c
|
build: ${BUILDDIR}/libnccl-net-example.so
|
||||||
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
|
||||||
|
${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES}
|
||||||
|
@printf "Compiling %-35s > %s\n" $< $@
|
||||||
|
@mkdir -p ${BUILDDIR}
|
||||||
|
$(CC) -Inccl -fPIC -shared -o $@ $^
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f $(PLUGIN_SO)
|
rm -f ${BUILDDIR}/libnccl-net-example.so
|
||||||
|
@ -3,14 +3,20 @@
|
|||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
NCCL_HOME := ../../build
|
.DEFAULT_GOAL: build
|
||||||
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
|
include ../../makefiles/common.mk
|
||||||
PLUGIN_SO := libnccl-profiler.so
|
SRCDIR ?= $(abspath ../..)
|
||||||
|
BUILDDIR ?= .
|
||||||
|
NCCLDIR := $(BUILDDIR)
|
||||||
|
|
||||||
default: $(PLUGIN_SO)
|
SRC_FILES := $(wildcard *.c)
|
||||||
|
|
||||||
$(PLUGIN_SO): plugin.c event.c print_event.c
|
build: ${BUILDDIR}/libnccl-profiler-example.so
|
||||||
$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
|
||||||
|
${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES}
|
||||||
|
@printf "Compiling %-35s > %s\n" $< $@
|
||||||
|
@mkdir -p ${BUILDDIR}
|
||||||
|
$(CC) -Inccl -fPIC -shared -o $@ $^
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f $(PLUGIN_SO)
|
rm -f ${BUILDDIR}/libnccl-profiler-example.so
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <x86intrin.h>
|
#include <time.h>
|
||||||
#include "event.h"
|
#include "event.h"
|
||||||
#include "print_event.h"
|
#include "print_event.h"
|
||||||
|
|
||||||
@ -41,22 +41,10 @@ static struct proxyOp* detachPool;
|
|||||||
ncclDebugLogger_t logFn;
|
ncclDebugLogger_t logFn;
|
||||||
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||||
|
|
||||||
static double freq = -1;
|
|
||||||
__hidden void calibrate() {
|
|
||||||
struct timeval tv;
|
|
||||||
gettimeofday(&tv, NULL);
|
|
||||||
uint64_t timeCycles = __rdtsc();
|
|
||||||
double time = - tv.tv_sec*1e6 - tv.tv_usec;
|
|
||||||
uint64_t total = 0ULL;
|
|
||||||
for (int i = 0; i < 10000; i++) total += __rdtsc();
|
|
||||||
gettimeofday(&tv, NULL);
|
|
||||||
timeCycles = __rdtsc() - timeCycles;
|
|
||||||
time += tv.tv_sec*1e6 + tv.tv_usec;
|
|
||||||
freq = timeCycles / time;
|
|
||||||
}
|
|
||||||
|
|
||||||
__hidden double gettime(void) {
|
__hidden double gettime(void) {
|
||||||
return __rdtsc() / freq;
|
struct timespec t;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &t);
|
||||||
|
return (t.tv_sec*1e6 + (t.tv_nsec*1e-3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
@ -98,8 +86,6 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
|
|||||||
// process address space.
|
// process address space.
|
||||||
pid = getpid();
|
pid = getpid();
|
||||||
|
|
||||||
// calibrate and start timer
|
|
||||||
calibrate();
|
|
||||||
startTime = gettime();
|
startTime = gettime();
|
||||||
}
|
}
|
||||||
pthread_mutex_unlock(&lock);
|
pthread_mutex_unlock(&lock);
|
||||||
|
23
ext-tuner/basic/Makefile
Normal file
23
ext-tuner/basic/Makefile
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# See LICENSE.txt for license information
|
||||||
|
#
|
||||||
|
.DEFAULT_GOAL: build
|
||||||
|
include ../../makefiles/common.mk
|
||||||
|
SRCDIR ?= $(abspath ../..)
|
||||||
|
BUILDDIR ?= .
|
||||||
|
NCCLDIR := $(BUILDDIR)
|
||||||
|
|
||||||
|
SRC_FILES := $(wildcard *.c)
|
||||||
|
DST_DIR := $(BUILDDIR)/test/unit/plugins
|
||||||
|
|
||||||
|
build: ${BUILDDIR}/libnccl-tuner-basic.so
|
||||||
|
|
||||||
|
${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES}
|
||||||
|
@printf "Compiling %-35s > %s\n" $< $@
|
||||||
|
@mkdir -p ${BUILDDIR}
|
||||||
|
$(CC) -Inccl -fPIC -shared -o $@ $^
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f ${BUILDDIR}/libnccl-tuner-basic.so
|
15
ext-tuner/basic/nccl/common.h
Normal file
15
ext-tuner/basic/nccl/common.h
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef COMMON_H_
|
||||||
|
#define COMMON_H_
|
||||||
|
|
||||||
|
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||||
|
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||||
|
|
||||||
|
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||||
|
|
||||||
|
#endif
|
17
ext-tuner/basic/nccl/err.h
Normal file
17
ext-tuner/basic/nccl/err.h
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NCCL_ERR_H_
|
||||||
|
#define NCCL_ERR_H_
|
||||||
|
|
||||||
|
/* Error type for plugins */
|
||||||
|
typedef enum { ncclSuccess = 0,
|
||||||
|
ncclUnhandledCudaError = 1,
|
||||||
|
ncclSystemError = 2,
|
||||||
|
ncclInternalError = 3,
|
||||||
|
ncclInvalidArgument = 4,
|
||||||
|
ncclInvalidUsage = 5,
|
||||||
|
ncclRemoteError = 6 } ncclResult_t;
|
||||||
|
|
||||||
|
#endif
|
97
ext-tuner/basic/nccl/tuner.h
Normal file
97
ext-tuner/basic/nccl/tuner.h
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#ifndef NCCL_TUNER_H_
|
||||||
|
#define NCCL_TUNER_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "err.h"
|
||||||
|
|
||||||
|
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||||
|
typedef enum {
|
||||||
|
ncclFuncBroadcast = 0,
|
||||||
|
ncclFuncReduce = 1,
|
||||||
|
ncclFuncAllGather = 2,
|
||||||
|
ncclFuncReduceScatter = 3,
|
||||||
|
ncclFuncAllReduce = 4,
|
||||||
|
ncclFuncSendRecv = 5,
|
||||||
|
ncclFuncSend = 6,
|
||||||
|
ncclFuncRecv = 7,
|
||||||
|
ncclNumFuncs = 8
|
||||||
|
} ncclFunc_t;
|
||||||
|
|
||||||
|
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
|
||||||
|
#define NCCL_ALGO_UNDEF -1
|
||||||
|
#define NCCL_ALGO_TREE 0
|
||||||
|
#define NCCL_ALGO_RING 1
|
||||||
|
#define NCCL_ALGO_COLLNET_DIRECT 2
|
||||||
|
#define NCCL_ALGO_COLLNET_CHAIN 3
|
||||||
|
#define NCCL_ALGO_NVLS 4
|
||||||
|
#define NCCL_ALGO_NVLS_TREE 5
|
||||||
|
#define NCCL_ALGO_PAT 6
|
||||||
|
|
||||||
|
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||||
|
#define NCCL_PROTO_UNDEF -1
|
||||||
|
#define NCCL_PROTO_LL 0
|
||||||
|
#define NCCL_PROTO_LL128 1
|
||||||
|
#define NCCL_PROTO_SIMPLE 2
|
||||||
|
|
||||||
|
#define NCCL_ALGO_PROTO_IGNORE -1.0
|
||||||
|
|
||||||
|
// API to be implemented by external tuner
|
||||||
|
typedef struct {
|
||||||
|
// Name of the tuner
|
||||||
|
const char* name;
|
||||||
|
|
||||||
|
// Initializes tuner states.
|
||||||
|
// Inputs:
|
||||||
|
// - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
|
||||||
|
// - nNodes: number of nodes in current communicator.
|
||||||
|
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||||
|
// Outputs:
|
||||||
|
// - context: tuner context object
|
||||||
|
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||||
|
|
||||||
|
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||||
|
// Inputs:
|
||||||
|
// - context: tuner context object
|
||||||
|
// - collType: collective type , e.g., allreduce, allgather…
|
||||||
|
// - nBytes: collective size in bytes
|
||||||
|
// - numPipeOps: number of operations in the group
|
||||||
|
// - numAlgo: number of algorithms in collCostTable
|
||||||
|
// - numProto: number of protocols in collCostTable
|
||||||
|
// - regBuff: can register user buffer
|
||||||
|
//
|
||||||
|
// Outputs:
|
||||||
|
// - nChannels: number of channels (hence SMs) to be used.
|
||||||
|
//
|
||||||
|
// InOut:
|
||||||
|
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||||
|
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||||
|
//
|
||||||
|
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||||
|
// default tuning for the given collective.
|
||||||
|
// Also, the plugin is allowed to not set any output, or set only the
|
||||||
|
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||||
|
// Unset fields will be set automatically by NCCL.
|
||||||
|
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||||
|
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||||
|
int regBuff, int* nChannels);
|
||||||
|
|
||||||
|
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||||
|
// context: tuner context object
|
||||||
|
ncclResult_t (*destroy)(void* context);
|
||||||
|
} ncclTuner_v4_t;
|
||||||
|
|
||||||
|
typedef ncclTuner_v4_t ncclTuner_t;
|
||||||
|
|
||||||
|
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
|
||||||
|
|
||||||
|
#endif
|
34
ext-tuner/basic/plugin.c
Normal file
34
ext-tuner/basic/plugin.c
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* See LICENSE.txt for license information
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#include "tuner.h"
|
||||||
|
|
||||||
|
#define __hidden __attribute__ ((visibility("hidden")))
|
||||||
|
|
||||||
|
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
|
||||||
|
|
||||||
|
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
||||||
|
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||||
|
int regBuff, int* nChannels) {
|
||||||
|
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
|
||||||
|
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
|
||||||
|
if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
|
||||||
|
table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
|
||||||
|
}
|
||||||
|
*nChannels = 1;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
|
||||||
|
|
||||||
|
#define PLUGIN_NAME "Basic"
|
||||||
|
|
||||||
|
const ncclTuner_v4_t ncclTunerPlugin_v4 = {
|
||||||
|
.name = PLUGIN_NAME,
|
||||||
|
.init = pluginInit,
|
||||||
|
.getCollInfo = pluginGetCollInfo,
|
||||||
|
.destroy = pluginDestroy
|
||||||
|
};
|
@ -3,15 +3,53 @@
|
|||||||
#
|
#
|
||||||
# See LICENSE.txt for license information
|
# See LICENSE.txt for license information
|
||||||
#
|
#
|
||||||
NCCL_HOME:=../../build/
|
|
||||||
CUDA_HOME:=/usr/local/cuda
|
|
||||||
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
|
|
||||||
PLUGIN_SO:=libnccl-tuner.so
|
|
||||||
|
|
||||||
default: $(PLUGIN_SO)
|
.DEFAULT_GOAL: build
|
||||||
|
PLUGIN_SO:=libnccl-tuner-example.so
|
||||||
|
include ../../makefiles/common.mk
|
||||||
|
SRCDIR ?= $(abspath ../..)
|
||||||
|
BUILDDIR ?= .
|
||||||
|
NCCLDIR := $(BUILDDIR)
|
||||||
|
|
||||||
$(PLUGIN_SO): plugin.c
|
SRC_FILES := $(wildcard *.c)
|
||||||
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
DST_DIR := $(BUILDDIR)/test/unit/plugins
|
||||||
|
|
||||||
|
default: ${BUILDDIR}/$(PLUGIN_SO)
|
||||||
|
|
||||||
|
build: ${BUILDDIR}/$(PLUGIN_SO)
|
||||||
|
|
||||||
|
${BUILDDIR}/$(PLUGIN_SO): plugin.c
|
||||||
|
@printf "Compiling %-35s > %s\n" $< $@
|
||||||
|
@mkdir -p ${BUILDDIR}
|
||||||
|
$(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||||
|
|
||||||
|
# Test targets - delegate to test directory
|
||||||
|
test:
|
||||||
|
$(MAKE) -C test test TEST_CASE=$(TEST_CASE)
|
||||||
|
|
||||||
|
test-verbose:
|
||||||
|
$(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE)
|
||||||
|
|
||||||
|
# Build tests
|
||||||
|
test-build:
|
||||||
|
$(MAKE) -C test all
|
||||||
|
|
||||||
|
# Optimize configurations from performance data
|
||||||
|
optimize-config:
|
||||||
|
@if [ -z "$(CSV_FILE)" ]; then \
|
||||||
|
echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \
|
||||||
|
echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
python3 scripts/optimize_config.py $(CSV_FILE) \
|
||||||
|
$(if $(OUTPUT),-o $(OUTPUT)) \
|
||||||
|
$(if $(METRIC),-m $(METRIC)) \
|
||||||
|
$(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \
|
||||||
|
$(if $(DRY_RUN),--dry-run) \
|
||||||
|
$(if $(NO_HEADER),--no-header)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f $(PLUGIN_SO)
|
rm -f ${BUILDDIR}/$(PLUGIN_SO)
|
||||||
|
$(MAKE) -C test clean
|
||||||
|
|
||||||
|
.PHONY: test test-verbose test-build optimize-config clean
|
||||||
|
164
ext-tuner/example/README.md
Normal file
164
ext-tuner/example/README.md
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
# NCCL Example Tuner Plugin
|
||||||
|
|
||||||
|
This example plugin shows a practical example of a CSV file-based tuning approach, allowing selective overrides for tuning parameters based on all tuning inputs without recompiling.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **File-based Configuration**: Read tuning parameters from a CSV configuration file
|
||||||
|
- **Size-based Tuning**: Specify different configurations based on message size ranges
|
||||||
|
- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks
|
||||||
|
- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default
|
||||||
|
- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE`
|
||||||
|
- **Fallback Behavior**: Gracefully handles missing config files and invalid entries
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make
|
||||||
|
```
|
||||||
|
|
||||||
|
This will create `libnccl-tuner-example.so` that can be loaded by NCCL.
|
||||||
|
|
||||||
|
## Configuration File Format
|
||||||
|
|
||||||
|
The configuration file uses CSV (Comma-Separated Values) format with one configuration per line:
|
||||||
|
|
||||||
|
```
|
||||||
|
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parameters
|
||||||
|
|
||||||
|
- **collective_type**: The collective operation type
|
||||||
|
- `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce`
|
||||||
|
|
||||||
|
- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies
|
||||||
|
- Use `0` for minimum and `4294967295` for maximum (covers all sizes)
|
||||||
|
|
||||||
|
- **algorithm**: The NCCL algorithm to use
|
||||||
|
- `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat`
|
||||||
|
|
||||||
|
- **protocol**: The NCCL protocol to use
|
||||||
|
- `ll`, `ll128`, `simple`
|
||||||
|
|
||||||
|
- **channels**: Number of channels (SMs) to use
|
||||||
|
- Use a positive integer to specify exact channel count
|
||||||
|
- Use `-1` to keep NCCL's default channel selection
|
||||||
|
|
||||||
|
- **nNodes**: Number of nodes to match
|
||||||
|
- Use a positive integer to match specific node count
|
||||||
|
- Use `-1` to match any number of nodes
|
||||||
|
|
||||||
|
- **nRanks**: Number of ranks to match
|
||||||
|
- Use a positive integer to match specific rank count
|
||||||
|
- Use `-1` to match any number of ranks
|
||||||
|
|
||||||
|
- **numPipeOps**: Number of pipeline operations to match (optional)
|
||||||
|
- Use a positive integer to match specific pipeline operation count
|
||||||
|
- Use `-1` to match any number of pipeline operations
|
||||||
|
- If omitted, configuration will match any numPipeOps value
|
||||||
|
|
||||||
|
- **regBuff**: Whether user buffer can be registered (optional)
|
||||||
|
- Use `0` to match only non-registered buffers
|
||||||
|
- Use `1` to match only registered buffers
|
||||||
|
- Use `-1` to match either registered or non-registered buffers
|
||||||
|
- If omitted, configuration will match any regBuff value
|
||||||
|
|
||||||
|
### Example Configuration
|
||||||
|
|
||||||
|
```csv
|
||||||
|
# Single-node, small allreduce: use tree algorithm, registered buffers only
|
||||||
|
allreduce,0,65536,tree,simple,2,1,-1,-1,1
|
||||||
|
|
||||||
|
# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers
|
||||||
|
allreduce,65537,1048576,ring,simple,4,4,32,1,0
|
||||||
|
|
||||||
|
# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type
|
||||||
|
allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
|
||||||
|
|
||||||
|
# Single-node broadcast: prefer tree, any pipeOps, registered buffers (backward compatible)
|
||||||
|
broadcast,0,32768,tree,simple,-1,1,-1
|
||||||
|
|
||||||
|
# Multi-node broadcast: optimized for non-registered buffers, single pipeline op
|
||||||
|
broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
|
||||||
|
```
|
||||||
|
|
||||||
|
Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc.
|
||||||
|
|
||||||
|
### Backward Compatibility
|
||||||
|
|
||||||
|
Configurations without the numPipeOps and/or regBuff parameters are fully supported:
|
||||||
|
- 8 fields: matches any numPipeOps and regBuff values
|
||||||
|
- 9 fields: matches any regBuff value
|
||||||
|
- 10 fields: full parameter specification
|
||||||
|
|
||||||
|
This ensures existing configuration files continue to work without modification.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Method 1: Default Config File
|
||||||
|
Place your configuration in `nccl_tuner.conf` in the current working directory.
|
||||||
|
|
||||||
|
### Method 2: Environment Variable
|
||||||
|
Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf
|
||||||
|
export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH
|
||||||
|
mpirun -np 4 your_nccl_application
|
||||||
|
```
|
||||||
|
|
||||||
|
## Editing Configuration Files
|
||||||
|
|
||||||
|
### Generating Configuration Files from Raw Data
|
||||||
|
|
||||||
|
A python script to generate valid CSV configs has been provided. [Using optimize_config.py](scripts/README.md).
|
||||||
|
|
||||||
|
### Spreadsheet Tips:
|
||||||
|
- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff`
|
||||||
|
- Save as CSV format (not Excel format) for the plugin to read
|
||||||
|
- Use data validation to prevent typos in algorithm/protocol names
|
||||||
|
|
||||||
|
## Logging
|
||||||
|
|
||||||
|
The plugin uses NCCL's logging system. To see tuner-related messages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export NCCL_DEBUG=INFO
|
||||||
|
```
|
||||||
|
|
||||||
|
This will show when configurations are loaded and applied, including the topology information.
|
||||||
|
|
||||||
|
For detailed debugging output during tuning decisions:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export NCCL_DEBUG=TRACE
|
||||||
|
```
|
||||||
|
|
||||||
|
This will show verbose information about which configurations are being evaluated and matched.
|
||||||
|
|
||||||
|
## Dimension Matching
|
||||||
|
|
||||||
|
Configurations are only applied when the topology matches:
|
||||||
|
|
||||||
|
- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks
|
||||||
|
- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks
|
||||||
|
- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node
|
||||||
|
- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology
|
||||||
|
|
||||||
|
This allows you to create specialized configurations for different cluster setups while maintaining flexibility.
|
||||||
|
|
||||||
|
## Default Behavior
|
||||||
|
|
||||||
|
If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic.
|
||||||
|
|
||||||
|
When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
1. **Config file not found**: Check the file path and permissions
|
||||||
|
2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters
|
||||||
|
3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory
|
||||||
|
4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO`
|
||||||
|
5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards
|
||||||
|
6. **CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces
|
45
ext-tuner/example/nccl_tuner.conf
Normal file
45
ext-tuner/example/nccl_tuner.conf
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# NCCL Tuner Configuration File (CSV Format)
|
||||||
|
# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||||
|
#
|
||||||
|
# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
|
||||||
|
# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
|
||||||
|
# Protocols: ll, ll128, simple
|
||||||
|
# Channels: number of channels to use, or -1 to keep default
|
||||||
|
# nNodes: number of nodes to match, or -1 for any number of nodes
|
||||||
|
# nRanks: number of ranks to match, or -1 for any number of ranks
|
||||||
|
# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
|
||||||
|
# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
|
||||||
|
#
|
||||||
|
# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
|
||||||
|
#
|
||||||
|
# Examples:
|
||||||
|
|
||||||
|
# For single-node configurations with registered buffers
|
||||||
|
# Small allreduce operations on single node - use tree algorithm, registered buffers
|
||||||
|
allreduce,0,65536,tree,simple,2,1,-1,-1,1
|
||||||
|
|
||||||
|
# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers
|
||||||
|
# Medium allreduce operations - use ring algorithm
|
||||||
|
allreduce,65537,1048576,ring,simple,4,4,32,1,0
|
||||||
|
|
||||||
|
# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type
|
||||||
|
allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1
|
||||||
|
|
||||||
|
# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types
|
||||||
|
# Single node broadcast - prefer tree, any pipeOps, registered buffers only
|
||||||
|
broadcast,0,32768,tree,simple,-1,1,-1,-1,1
|
||||||
|
|
||||||
|
# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring
|
||||||
|
broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0
|
||||||
|
|
||||||
|
# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type
|
||||||
|
allgather,0,4294967295,ring,simple,4,2,-1
|
||||||
|
|
||||||
|
# ReduceScatter operations
|
||||||
|
# Small messages on single node, single pipeline op, registered buffers
|
||||||
|
reducescatter,0,131072,tree,simple,2,1,-1,1,1
|
||||||
|
# Large messages on any topology, multiple pipeline ops, non-registered buffers
|
||||||
|
reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0
|
||||||
|
|
||||||
|
# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type
|
||||||
|
reduce,0,4294967295,tree,simple,-1,-1,-1
|
@ -5,24 +5,443 @@
|
|||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
#include "tuner.h"
|
#include "tuner.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
#define __hidden __attribute__ ((visibility("hidden")))
|
#define __hidden __attribute__ ((visibility("hidden")))
|
||||||
|
#define MAX_LINE_LENGTH 256
|
||||||
|
|
||||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
|
// CSV field indices for configuration parsing
|
||||||
|
// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||||
|
#define CONFIG_FIELD_COLLTYPE 0
|
||||||
|
#define CONFIG_FIELD_MINBYTES 1
|
||||||
|
#define CONFIG_FIELD_MAXBYTES 2
|
||||||
|
#define CONFIG_FIELD_ALGORITHM 3
|
||||||
|
#define CONFIG_FIELD_PROTOCOL 4
|
||||||
|
#define CONFIG_FIELD_CHANNELS 5
|
||||||
|
#define CONFIG_FIELD_NNODES 6
|
||||||
|
#define CONFIG_FIELD_NRANKS 7
|
||||||
|
#define CONFIG_FIELD_PIPEOPS 8 // Optional field
|
||||||
|
#define CONFIG_FIELD_REGBUFF 9 // Optional field
|
||||||
|
|
||||||
|
// Field count constants
|
||||||
|
#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks)
|
||||||
|
#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps
|
||||||
|
#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff
|
||||||
|
#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
ncclFunc_t collType;
|
||||||
|
size_t minBytes;
|
||||||
|
size_t maxBytes;
|
||||||
|
int algorithm;
|
||||||
|
int protocol;
|
||||||
|
int nChannels;
|
||||||
|
int nNodes;
|
||||||
|
int nRanks;
|
||||||
|
int numPipeOps;
|
||||||
|
int regBuff;
|
||||||
|
} TuningConfig;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
TuningConfig* configs; // Changed from static array to dynamic pointer
|
||||||
|
int numConfigs;
|
||||||
|
int maxConfigs; // Added to track allocated size
|
||||||
|
size_t nRanks;
|
||||||
|
size_t nNodes;
|
||||||
|
ncclDebugLogger_t logFunction;
|
||||||
|
} TunerContext;
|
||||||
|
|
||||||
|
// Parse collective type from string
|
||||||
|
static ncclFunc_t parseCollType(const char* str) {
|
||||||
|
if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast;
|
||||||
|
if (strcmp(str, "reduce") == 0) return ncclFuncReduce;
|
||||||
|
if (strcmp(str, "allgather") == 0) return ncclFuncAllGather;
|
||||||
|
if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter;
|
||||||
|
if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce;
|
||||||
|
return ncclFuncAllReduce; // default
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert collective type to string
|
||||||
|
static const char* collTypeToString(ncclFunc_t collType) {
|
||||||
|
switch (collType) {
|
||||||
|
case ncclFuncBroadcast: return "broadcast";
|
||||||
|
case ncclFuncReduce: return "reduce";
|
||||||
|
case ncclFuncAllGather: return "allgather";
|
||||||
|
case ncclFuncReduceScatter: return "reducescatter";
|
||||||
|
case ncclFuncAllReduce: return "allreduce";
|
||||||
|
default: return "unknown";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse algorithm from string
|
||||||
|
static int parseAlgorithm(const char* str) {
|
||||||
|
if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE;
|
||||||
|
if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING;
|
||||||
|
if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT;
|
||||||
|
if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN;
|
||||||
|
if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS;
|
||||||
|
if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE;
|
||||||
|
if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT;
|
||||||
|
return NCCL_ALGO_RING; // default
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert algorithm to string
|
||||||
|
static const char* algorithmToString(int algorithm) {
|
||||||
|
switch (algorithm) {
|
||||||
|
case NCCL_ALGO_TREE: return "tree";
|
||||||
|
case NCCL_ALGO_RING: return "ring";
|
||||||
|
case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct";
|
||||||
|
case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain";
|
||||||
|
case NCCL_ALGO_NVLS: return "nvls";
|
||||||
|
case NCCL_ALGO_NVLS_TREE: return "nvls_tree";
|
||||||
|
case NCCL_ALGO_PAT: return "pat";
|
||||||
|
default: return "unknown";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse protocol from string
|
||||||
|
static int parseProtocol(const char* str) {
|
||||||
|
if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL;
|
||||||
|
if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128;
|
||||||
|
if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE;
|
||||||
|
return NCCL_PROTO_SIMPLE; // default
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert protocol to string
|
||||||
|
static const char* protocolToString(int protocol) {
|
||||||
|
switch (protocol) {
|
||||||
|
case NCCL_PROTO_LL: return "ll";
|
||||||
|
case NCCL_PROTO_LL128: return "ll128";
|
||||||
|
case NCCL_PROTO_SIMPLE: return "simple";
|
||||||
|
default: return "unknown";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to count valid configuration lines in file
|
||||||
|
static int countConfigLines(const char* filename) {
|
||||||
|
FILE* file = fopen(filename, "r");
|
||||||
|
if (!file) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
char line[MAX_LINE_LENGTH];
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
while (fgets(line, sizeof(line), file)) {
|
||||||
|
// Skip comments and empty lines
|
||||||
|
if (line[0] == '#' || line[0] == '\n') continue;
|
||||||
|
|
||||||
|
// Remove trailing newline
|
||||||
|
line[strcspn(line, "\n")] = 0;
|
||||||
|
|
||||||
|
// Check if line has content
|
||||||
|
if (strlen(line) > 0) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(file);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load configuration from file
|
||||||
|
static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
|
||||||
|
FILE* file = fopen(filename, "r");
|
||||||
|
if (!file) {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Config file %s not found, using defaults", filename);
|
||||||
|
}
|
||||||
|
return ncclSuccess; // Not finding config file is not an error
|
||||||
|
}
|
||||||
|
|
||||||
|
// First pass: count valid configuration lines
|
||||||
|
int configCount = countConfigLines(filename);
|
||||||
|
if (configCount == 0) {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: No valid configurations found in %s", filename);
|
||||||
|
}
|
||||||
|
fclose(file);
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate memory for configurations based on actual count
|
||||||
|
ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig));
|
||||||
|
if (!ctx->configs) {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount);
|
||||||
|
}
|
||||||
|
fclose(file);
|
||||||
|
return ncclSystemError;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->maxConfigs = configCount;
|
||||||
|
ctx->numConfigs = 0;
|
||||||
|
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset file pointer to beginning
|
||||||
|
fseek(file, 0, SEEK_SET);
|
||||||
|
|
||||||
|
char line[MAX_LINE_LENGTH];
|
||||||
|
|
||||||
|
while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) {
|
||||||
|
// Skip comments and empty lines
|
||||||
|
if (line[0] == '#' || line[0] == '\n') continue;
|
||||||
|
|
||||||
|
// Remove trailing newline
|
||||||
|
line[strcspn(line, "\n")] = 0;
|
||||||
|
|
||||||
|
// Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||||
|
char* token;
|
||||||
|
char* tokens[CONFIG_FIELDS_MAX];
|
||||||
|
int tokenCount = 0;
|
||||||
|
|
||||||
|
// Make a copy of the line for tokenizing
|
||||||
|
char lineCopy[MAX_LINE_LENGTH];
|
||||||
|
strncpy(lineCopy, line, sizeof(lineCopy));
|
||||||
|
lineCopy[sizeof(lineCopy) - 1] = '\0';
|
||||||
|
|
||||||
|
// Tokenize by comma
|
||||||
|
token = strtok(lineCopy, ",");
|
||||||
|
while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) {
|
||||||
|
// Trim whitespace
|
||||||
|
while (*token == ' ' || *token == '\t') token++;
|
||||||
|
char* end = token + strlen(token) - 1;
|
||||||
|
while (end > token && (*end == ' ' || *end == '\t')) {
|
||||||
|
*end = '\0';
|
||||||
|
end--;
|
||||||
|
}
|
||||||
|
tokens[tokenCount++] = token;
|
||||||
|
token = strtok(NULL, ",");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10)
|
||||||
|
if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) {
|
||||||
|
TuningConfig* config = &ctx->configs[ctx->numConfigs];
|
||||||
|
config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]);
|
||||||
|
config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10);
|
||||||
|
config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10);
|
||||||
|
config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]);
|
||||||
|
config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]);
|
||||||
|
config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]);
|
||||||
|
config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]);
|
||||||
|
config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]);
|
||||||
|
|
||||||
|
// numPipeOps is optional (9th field, index 8)
|
||||||
|
if (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) {
|
||||||
|
config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]);
|
||||||
|
} else {
|
||||||
|
config->numPipeOps = -1; // -1 means match any numPipeOps
|
||||||
|
}
|
||||||
|
|
||||||
|
// regBuff is optional (10th field, index 9)
|
||||||
|
if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) {
|
||||||
|
config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]);
|
||||||
|
} else {
|
||||||
|
config->regBuff = -1; // -1 means match any regBuff value
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->numConfigs++;
|
||||||
|
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
if (config->numPipeOps == -1 && config->regBuff == -1) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any",
|
||||||
|
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||||
|
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||||
|
config->nChannels, config->nNodes, config->nRanks);
|
||||||
|
} else if (config->regBuff == -1) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any",
|
||||||
|
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||||
|
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||||
|
config->nChannels, config->nNodes, config->nRanks, config->numPipeOps);
|
||||||
|
} else if (config->numPipeOps == -1) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d",
|
||||||
|
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||||
|
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||||
|
config->nChannels, config->nNodes, config->nRanks, config->regBuff);
|
||||||
|
} else {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d",
|
||||||
|
tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes,
|
||||||
|
tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL],
|
||||||
|
config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(file);
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename);
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
|
||||||
|
TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
|
||||||
|
if (!ctx) return ncclSystemError;
|
||||||
|
|
||||||
|
ctx->configs = NULL; // Initialize to NULL
|
||||||
|
ctx->numConfigs = 0;
|
||||||
|
ctx->maxConfigs = 0; // Initialize to 0
|
||||||
|
ctx->nRanks = nRanks;
|
||||||
|
ctx->nNodes = nNodes;
|
||||||
|
ctx->logFunction = logFunction;
|
||||||
|
|
||||||
|
if (logFunction) {
|
||||||
|
logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to load config file from environment variable or default location
|
||||||
|
const char* configFile = getenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
if (!configFile) {
|
||||||
|
configFile = "nccl_tuner.conf"; // default config file name
|
||||||
|
}
|
||||||
|
|
||||||
|
ncclResult_t result = loadConfig(ctx, configFile);
|
||||||
|
if (result != ncclSuccess) {
|
||||||
|
if (ctx->configs) {
|
||||||
|
free(ctx->configs); // Clean up allocated memory on error
|
||||||
|
}
|
||||||
|
free(ctx);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
*context = ctx;
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
||||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||||
int regBuff, int* nChannels) {
|
int regBuff, int* nChannels) {
|
||||||
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
|
TunerContext* ctx = (TunerContext*)context;
|
||||||
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
|
if (!ctx) return ncclInternalError;
|
||||||
if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
|
|
||||||
table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
|
// Default channels
|
||||||
}
|
|
||||||
*nChannels = 1;
|
*nChannels = 1;
|
||||||
|
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d",
|
||||||
|
collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for matching configuration
|
||||||
|
for (int i = 0; i < ctx->numConfigs; i++) {
|
||||||
|
TuningConfig* config = &ctx->configs[i];
|
||||||
|
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d",
|
||||||
|
i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol),
|
||||||
|
config->nNodes, config->nRanks, config->numPipeOps, config->regBuff);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff
|
||||||
|
if (config->collType == collType &&
|
||||||
|
nBytes >= config->minBytes &&
|
||||||
|
nBytes <= config->maxBytes &&
|
||||||
|
(config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) &&
|
||||||
|
(config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) &&
|
||||||
|
(config->numPipeOps == -1 || config->numPipeOps == numPipeOps) &&
|
||||||
|
(config->regBuff == -1 || config->regBuff == regBuff)) {
|
||||||
|
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Config matches. Applying algo=%s, proto=%s, channels=%d",
|
||||||
|
algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check bounds
|
||||||
|
if (config->algorithm < numAlgo && config->protocol < numProto) {
|
||||||
|
if (collCostTable[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)",
|
||||||
|
algorithmToString(config->algorithm), protocolToString(config->protocol),
|
||||||
|
&collCostTable[config->algorithm][config->protocol], collCostTable[config->algorithm][config->protocol]);
|
||||||
|
}
|
||||||
|
collCostTable[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration
|
||||||
|
|
||||||
|
// Only override channels if not set to -1 (keep default)
|
||||||
|
if (config->nChannels != -1) {
|
||||||
|
*nChannels = config->nChannels;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
if (config->nChannels == -1) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)",
|
||||||
|
collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
|
||||||
|
config->nNodes, config->nRanks);
|
||||||
|
} else {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)",
|
||||||
|
collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol),
|
||||||
|
config->nChannels, config->nNodes, config->nRanks);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
} else {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE",
|
||||||
|
algorithmToString(config->algorithm), protocolToString(config->protocol));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)",
|
||||||
|
algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d",
|
||||||
|
config->collType == collType,
|
||||||
|
(nBytes >= config->minBytes && nBytes <= config->maxBytes),
|
||||||
|
(config->nNodes == -1 || config->nNodes == (int)ctx->nNodes),
|
||||||
|
(config->nRanks == -1 || config->nRanks == (int)ctx->nRanks),
|
||||||
|
(config->numPipeOps == -1 || config->numPipeOps == numPipeOps),
|
||||||
|
(config->regBuff == -1 || config->regBuff == regBuff));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no specific config found, apply default behavior
|
||||||
|
if (ctx->logFunction) {
|
||||||
|
ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||||
|
"TUNER/ExamplePlugin: No matching config found");
|
||||||
|
}
|
||||||
|
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
|
__hidden ncclResult_t pluginDestroy(void* context) {
|
||||||
|
if (context) {
|
||||||
|
TunerContext* ctx = (TunerContext*)context;
|
||||||
|
if (ctx->configs) {
|
||||||
|
free(ctx->configs); // Free dynamically allocated configs array
|
||||||
|
}
|
||||||
|
free(context);
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
#define PLUGIN_NAME "Example"
|
#define PLUGIN_NAME "Example"
|
||||||
|
|
||||||
|
106
ext-tuner/example/scripts/README.md
Normal file
106
ext-tuner/example/scripts/README.md
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
# NCCL Tuner Configuration Scripts
|
||||||
|
|
||||||
|
This directory contains scripts for optimizing NCCL tuner configurations based on performance data.
|
||||||
|
|
||||||
|
## optimize_config.py
|
||||||
|
|
||||||
|
A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/optimize_config.py [options] <input_csv_file>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`)
|
||||||
|
- `-m, --metric METRIC`: Optimization metric (`cost_metric`, `bandwidth_gbps`, `latency_us`)
|
||||||
|
- `--no-header`: Don't add header comments to output file
|
||||||
|
- `--dry-run`: Print configurations without writing to file
|
||||||
|
|
||||||
|
### CSV Input Format
|
||||||
|
|
||||||
|
The input CSV file should have the following columns:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
|
||||||
|
```
|
||||||
|
|
||||||
|
**Required columns:**
|
||||||
|
- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.)
|
||||||
|
- `size_bytes`: Message size in bytes
|
||||||
|
- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.)
|
||||||
|
- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`)
|
||||||
|
- `channels`: Number of channels (or `-1` for default)
|
||||||
|
- `nodes`: Number of nodes (or `-1` for any)
|
||||||
|
- `ranks`: Number of ranks (or `-1` for any)
|
||||||
|
- `pipeOps`: Number of pipeline operations (or `-1` for any)
|
||||||
|
- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any)
|
||||||
|
|
||||||
|
**Optional metrics (must have at least one present):**
|
||||||
|
- `bandwidth_gbps`: Bandwidth in GB/s (higher is better)
|
||||||
|
- `latency_us`: Latency in microseconds (lower is better)
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
**Basic usage with cost optimization:**
|
||||||
|
```bash
|
||||||
|
python scripts/optimize_config.py sample_performance_data.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
**Optimize for bandwidth and write to custom file:**
|
||||||
|
```bash
|
||||||
|
python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
**Preview configurations without writing:**
|
||||||
|
```bash
|
||||||
|
python scripts/optimize_config.py --dry-run performance_data.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
1. **Data Loading**: Reads CSV performance data and validates format
|
||||||
|
2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters
|
||||||
|
3. **Size Ranges**: Automatically bins data into size ranges for optimization
|
||||||
|
4. **Optimization**: Finds the best performing configuration for each group/size combination
|
||||||
|
5. **Output**: Generates NCCL tuner config format and appends to specified file
|
||||||
|
|
||||||
|
### Default Size Ranges
|
||||||
|
|
||||||
|
The script uses these default size ranges (in bytes):
|
||||||
|
- Small: 0 - 1,024
|
||||||
|
- Medium: 1,025 - 65,536
|
||||||
|
- Large: 65,537 - 1,048,576
|
||||||
|
- XLarge: 1,048,577 - 16,777,216
|
||||||
|
- XXLarge: 16,777,217 - 4,294,967,295
|
||||||
|
|
||||||
|
### Sample Data
|
||||||
|
|
||||||
|
See `sample_performance_data.csv` for an example of the expected input format.
|
||||||
|
|
||||||
|
### Integration with NCCL
|
||||||
|
|
||||||
|
The generated configuration file can be used directly with the NCCL tuner plugin:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf
|
||||||
|
export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner.so
|
||||||
|
mpirun -np 8 your_nccl_application
|
||||||
|
```
|
||||||
|
|
||||||
|
### Performance Data Collection
|
||||||
|
|
||||||
|
To collect performance data for optimization, you can:
|
||||||
|
|
||||||
|
1. **Use NCCL benchmarks** with different algorithm/protocol combinations
|
||||||
|
2. **Profile your applications** with various tuner settings
|
||||||
|
3. **Run systematic sweeps** across parameter combinations
|
||||||
|
4. **Use NCCL debug output** to collect timing information
|
||||||
|
|
||||||
|
The key is to have comprehensive data covering:
|
||||||
|
- Different message sizes (small to large)
|
||||||
|
- Various topologies (single node, multi-node)
|
||||||
|
- All relevant algorithm/protocol combinations
|
||||||
|
- Different channel counts and pipeline configurations
|
430
ext-tuner/example/scripts/optimize_config.py
Normal file
430
ext-tuner/example/scripts/optimize_config.py
Normal file
@ -0,0 +1,430 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
NCCL Tuner Configuration Optimizer
|
||||||
|
|
||||||
|
Reads a CSV file containing performance data across different tuning parameters
|
||||||
|
and generates optimal NCCL tuner configurations based on the best performing
|
||||||
|
combinations.
|
||||||
|
|
||||||
|
By default, creates growing size ranges that interpolate between the actual data sizes
|
||||||
|
for each unique dimension (node count, rank count combination). This ensures that
|
||||||
|
different cluster configurations get their own optimized size boundaries, as
|
||||||
|
performance characteristics often vary significantly between topologies.
|
||||||
|
|
||||||
|
Each dimension gets its own set of ranges starting from 0 and extending to the maximum
|
||||||
|
size for that dimension, with boundaries at midpoints between consecutive data sizes.
|
||||||
|
|
||||||
|
CSV Input Format:
|
||||||
|
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us
|
||||||
|
|
||||||
|
Output Format (NCCL Tuner Config):
|
||||||
|
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||||
|
|
||||||
|
Usage Examples:
|
||||||
|
# Auto-create dimension-specific interpolated ranges (default)
|
||||||
|
python3 optimize_config.py data.csv
|
||||||
|
|
||||||
|
# Use custom size ranges (applied to all topologies)
|
||||||
|
python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576"
|
||||||
|
|
||||||
|
# Use hardcoded default ranges (applied to all topologies)
|
||||||
|
python3 optimize_config.py data.csv --no-auto-ranges
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Dict, List, Tuple, Any
|
||||||
|
|
||||||
|
class PerformanceData:
|
||||||
|
def __init__(self, row: Dict[str, str]):
|
||||||
|
self.collective = row['collective']
|
||||||
|
self.size_bytes = int(row['size_bytes'])
|
||||||
|
self.algorithm = row['algorithm']
|
||||||
|
self.protocol = row['protocol']
|
||||||
|
self.channels = int(row['channels']) if row['channels'] != '-1' else -1
|
||||||
|
self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1
|
||||||
|
self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1
|
||||||
|
self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1
|
||||||
|
self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1
|
||||||
|
|
||||||
|
# Performance metrics
|
||||||
|
self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0)) # Higher is better
|
||||||
|
self.latency_us = float(row.get('latency_us', 0)) # Lower is better
|
||||||
|
|
||||||
|
def get_config_key(self) -> Tuple:
|
||||||
|
"""Generate a key for grouping similar configurations"""
|
||||||
|
return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff)
|
||||||
|
|
||||||
|
def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]:
|
||||||
|
"""Find which size range this data point belongs to for its dimension"""
|
||||||
|
topology_key = (self.nodes, self.ranks)
|
||||||
|
|
||||||
|
# Get size ranges for this dimension, or fall back to default
|
||||||
|
if topology_key in topology_size_ranges:
|
||||||
|
size_ranges = topology_size_ranges[topology_key]
|
||||||
|
elif (-1, -1) in topology_size_ranges:
|
||||||
|
size_ranges = topology_size_ranges[(-1, -1)]
|
||||||
|
else:
|
||||||
|
# Fallback to first available dimension ranges
|
||||||
|
size_ranges = next(iter(topology_size_ranges.values()))
|
||||||
|
|
||||||
|
for min_size, max_size in size_ranges:
|
||||||
|
if min_size <= self.size_bytes <= max_size:
|
||||||
|
return (min_size, max_size)
|
||||||
|
# If no range found, create a single-point range
|
||||||
|
return (self.size_bytes, self.size_bytes)
|
||||||
|
|
||||||
|
class ConfigOptimizer:
|
||||||
|
def __init__(self, optimization_metric: str = 'latency_us'):
|
||||||
|
self.optimization_metric = optimization_metric
|
||||||
|
# Default size ranges - will be overridden by auto-detection
|
||||||
|
self.size_ranges = [
|
||||||
|
(0, 1024),
|
||||||
|
(1025, 64*1024),
|
||||||
|
(64*1024+1, 1024*1024),
|
||||||
|
(1024*1024+1, 16*1024*1024),
|
||||||
|
(16*1024*1024+1, 4*1024*1024*1024-1)
|
||||||
|
]
|
||||||
|
self.auto_size_ranges = True
|
||||||
|
|
||||||
|
def set_size_ranges(self, ranges: List[Tuple[int, int]]):
|
||||||
|
"""Set custom size ranges for optimization"""
|
||||||
|
self.size_ranges = ranges
|
||||||
|
self.auto_size_ranges = False
|
||||||
|
|
||||||
|
def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
|
||||||
|
"""Create growing size ranges for each unique (nodes, ranks) dimension"""
|
||||||
|
if not data:
|
||||||
|
return {(-1, -1): self.size_ranges}
|
||||||
|
|
||||||
|
# Group data by dimension (nodes, ranks)
|
||||||
|
topology_data = defaultdict(list)
|
||||||
|
for item in data:
|
||||||
|
topology_key = (item.nodes, item.ranks)
|
||||||
|
topology_data[topology_key].append(item)
|
||||||
|
|
||||||
|
topology_ranges = {}
|
||||||
|
|
||||||
|
for topology_key, items in topology_data.items():
|
||||||
|
nodes, ranks = topology_key
|
||||||
|
|
||||||
|
# Extract unique sizes for this dimension and sort them
|
||||||
|
unique_sizes = sorted(set(item.size_bytes for item in items))
|
||||||
|
|
||||||
|
if len(unique_sizes) <= 1:
|
||||||
|
# Only one size, create a single range from 0 to that size
|
||||||
|
size = unique_sizes[0] if unique_sizes else 0
|
||||||
|
ranges = [(0, size)]
|
||||||
|
else:
|
||||||
|
# Create growing ranges that interpolate between data points
|
||||||
|
ranges = []
|
||||||
|
|
||||||
|
for i, size in enumerate(unique_sizes):
|
||||||
|
if i == 0:
|
||||||
|
# First range: 0 to midpoint between first and second size
|
||||||
|
if len(unique_sizes) > 1:
|
||||||
|
next_size = unique_sizes[i + 1]
|
||||||
|
max_size = (size + next_size) // 2
|
||||||
|
else:
|
||||||
|
max_size = size
|
||||||
|
min_size = 0
|
||||||
|
elif i == len(unique_sizes) - 1:
|
||||||
|
# Last range: previous max + 1 to current size (and beyond)
|
||||||
|
min_size = ranges[-1][1] + 1
|
||||||
|
max_size = size
|
||||||
|
else:
|
||||||
|
# Intermediate ranges: previous max + 1 to midpoint with next size
|
||||||
|
min_size = ranges[-1][1] + 1
|
||||||
|
next_size = unique_sizes[i + 1]
|
||||||
|
max_size = (size + next_size) // 2
|
||||||
|
|
||||||
|
ranges.append((min_size, max_size))
|
||||||
|
|
||||||
|
topology_ranges[topology_key] = ranges
|
||||||
|
|
||||||
|
print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:")
|
||||||
|
for i, (min_size, max_size) in enumerate(ranges):
|
||||||
|
# Count data points that fall in this range for this dimension
|
||||||
|
count = sum(1 for item in items if min_size <= item.size_bytes <= max_size)
|
||||||
|
actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size))
|
||||||
|
if actual_sizes:
|
||||||
|
size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3])
|
||||||
|
if len(actual_sizes) > 3:
|
||||||
|
size_list += f", ... (+{len(actual_sizes)-3} more)"
|
||||||
|
print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})")
|
||||||
|
|
||||||
|
return topology_ranges
|
||||||
|
|
||||||
|
def load_data(self, csv_file: str) -> List[PerformanceData]:
|
||||||
|
"""Load performance data from CSV file"""
|
||||||
|
data = []
|
||||||
|
try:
|
||||||
|
with open(csv_file, 'r') as f:
|
||||||
|
reader = csv.DictReader(f)
|
||||||
|
for row in reader:
|
||||||
|
try:
|
||||||
|
data.append(PerformanceData(row))
|
||||||
|
except (ValueError, KeyError) as e:
|
||||||
|
print(f"Warning: Skipping invalid row: {row} - {e}")
|
||||||
|
except FileNotFoundError:
|
||||||
|
print(f"Error: File {csv_file} not found")
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error reading {csv_file}: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
print(f"Loaded {len(data)} performance data points")
|
||||||
|
|
||||||
|
# Auto-determine size ranges if enabled
|
||||||
|
if self.auto_size_ranges and data:
|
||||||
|
self.topology_size_ranges = self.auto_determine_size_ranges(data)
|
||||||
|
else:
|
||||||
|
# Use default ranges for all topologies
|
||||||
|
self.topology_size_ranges = {(-1, -1): self.size_ranges}
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool:
|
||||||
|
"""Determine if new_data is better than current_best"""
|
||||||
|
if self.optimization_metric == 'bandwidth_gbps':
|
||||||
|
return new_data.bandwidth_gbps > current_best.bandwidth_gbps
|
||||||
|
elif self.optimization_metric == 'latency_us':
|
||||||
|
return new_data.latency_us < current_best.latency_us
|
||||||
|
else:
|
||||||
|
# Default to latency
|
||||||
|
return new_data.latency_us < current_best.latency_us
|
||||||
|
|
||||||
|
def optimize_configurations(self, data: List[PerformanceData]) -> List[str]:
|
||||||
|
"""Find optimal configurations and return as NCCL config strings"""
|
||||||
|
# Group data by configuration key and size range
|
||||||
|
grouped_data = defaultdict(lambda: defaultdict(list))
|
||||||
|
|
||||||
|
for item in data:
|
||||||
|
config_key = item.get_config_key()
|
||||||
|
size_range = item.get_size_range_key(self.topology_size_ranges)
|
||||||
|
grouped_data[config_key][size_range].append(item)
|
||||||
|
|
||||||
|
# Store optimal configurations before combining ranges
|
||||||
|
optimal_configs = []
|
||||||
|
|
||||||
|
for config_key, size_ranges_dict in grouped_data.items():
|
||||||
|
collective, nodes, ranks, pipeOps, regBuff = config_key
|
||||||
|
|
||||||
|
for (min_size, max_size), items in size_ranges_dict.items():
|
||||||
|
if not items:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find the best performing configuration for this size range
|
||||||
|
best_item = items[0]
|
||||||
|
for item in items[1:]:
|
||||||
|
if self.is_better(item, best_item):
|
||||||
|
best_item = item
|
||||||
|
|
||||||
|
# Store the optimal configuration with its range
|
||||||
|
optimal_configs.append({
|
||||||
|
'collective': collective,
|
||||||
|
'min_size': min_size,
|
||||||
|
'max_size': max_size,
|
||||||
|
'algorithm': best_item.algorithm,
|
||||||
|
'protocol': best_item.protocol,
|
||||||
|
'channels': best_item.channels,
|
||||||
|
'nodes': best_item.nodes,
|
||||||
|
'ranks': best_item.ranks,
|
||||||
|
'pipeOps': best_item.pipeOps,
|
||||||
|
'regBuff': best_item.regBuff,
|
||||||
|
'metric_value': getattr(best_item, self.optimization_metric)
|
||||||
|
})
|
||||||
|
|
||||||
|
# Combine sequential ranges with identical tunings
|
||||||
|
combined_configs = self.combine_sequential_ranges(optimal_configs)
|
||||||
|
|
||||||
|
# Generate config strings
|
||||||
|
configs = []
|
||||||
|
for config in combined_configs:
|
||||||
|
config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}"
|
||||||
|
configs.append(config_str)
|
||||||
|
|
||||||
|
print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: "
|
||||||
|
f"{config['algorithm']}/{config['protocol']} channels={config['channels']} "
|
||||||
|
f"({self.optimization_metric}={config['metric_value']:.3f})")
|
||||||
|
|
||||||
|
return configs
|
||||||
|
|
||||||
|
def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
|
||||||
|
"""Combine sequential ranges that have identical tuning parameters"""
|
||||||
|
if not configs:
|
||||||
|
return configs
|
||||||
|
|
||||||
|
# Group by collective and topology (nodes, ranks)
|
||||||
|
topology_groups = defaultdict(list)
|
||||||
|
for config in configs:
|
||||||
|
topology_key = (config['collective'], config['nodes'], config['ranks'],
|
||||||
|
config['pipeOps'], config['regBuff'])
|
||||||
|
topology_groups[topology_key].append(config)
|
||||||
|
|
||||||
|
combined_configs = []
|
||||||
|
|
||||||
|
for topology_key, topology_configs in topology_groups.items():
|
||||||
|
# Sort by min_size to ensure proper ordering
|
||||||
|
topology_configs.sort(key=lambda x: x['min_size'])
|
||||||
|
|
||||||
|
# Group by tuning parameters (algorithm, protocol, channels)
|
||||||
|
tuning_groups = defaultdict(list)
|
||||||
|
for config in topology_configs:
|
||||||
|
tuning_key = (config['algorithm'], config['protocol'], config['channels'])
|
||||||
|
tuning_groups[tuning_key].append(config)
|
||||||
|
|
||||||
|
# For each tuning group, combine sequential ranges
|
||||||
|
for tuning_key, tuning_configs in tuning_groups.items():
|
||||||
|
if not tuning_configs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Sort by min_size
|
||||||
|
tuning_configs.sort(key=lambda x: x['min_size'])
|
||||||
|
|
||||||
|
# Combine sequential ranges
|
||||||
|
current_config = tuning_configs[0].copy()
|
||||||
|
|
||||||
|
for next_config in tuning_configs[1:]:
|
||||||
|
# Check if ranges are adjacent or overlapping
|
||||||
|
if current_config['max_size'] + 1 >= next_config['min_size']:
|
||||||
|
# Extend the current range
|
||||||
|
current_config['max_size'] = max(current_config['max_size'], next_config['max_size'])
|
||||||
|
# Update metric value to the better one
|
||||||
|
if self.optimization_metric == 'bandwidth_gbps':
|
||||||
|
if next_config['metric_value'] > current_config['metric_value']:
|
||||||
|
current_config['metric_value'] = next_config['metric_value']
|
||||||
|
else: # latency_us or default
|
||||||
|
if next_config['metric_value'] < current_config['metric_value']:
|
||||||
|
current_config['metric_value'] = next_config['metric_value']
|
||||||
|
else:
|
||||||
|
# Gap between ranges, save current and start new one
|
||||||
|
combined_configs.append(current_config)
|
||||||
|
current_config = next_config.copy()
|
||||||
|
|
||||||
|
# Add the last configuration
|
||||||
|
combined_configs.append(current_config)
|
||||||
|
|
||||||
|
# Sort final configs by collective, nodes, ranks, then min_size
|
||||||
|
combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size']))
|
||||||
|
|
||||||
|
original_count = len(configs)
|
||||||
|
combined_count = len(combined_configs)
|
||||||
|
if combined_count < original_count:
|
||||||
|
print(f"Combined {original_count} ranges into {combined_count} ranges "
|
||||||
|
f"(reduced by {original_count - combined_count})")
|
||||||
|
|
||||||
|
return combined_configs
|
||||||
|
|
||||||
|
def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True):
|
||||||
|
"""Append optimized configurations to NCCL tuner config file"""
|
||||||
|
try:
|
||||||
|
# Create directory if it doesn't exist
|
||||||
|
config_dir = os.path.dirname(config_file)
|
||||||
|
if config_dir and not os.path.exists(config_dir):
|
||||||
|
os.makedirs(config_dir)
|
||||||
|
print(f"Created directory: {config_dir}")
|
||||||
|
|
||||||
|
# Check if file exists and has content
|
||||||
|
file_exists = os.path.exists(config_file)
|
||||||
|
add_separator = False
|
||||||
|
|
||||||
|
if file_exists:
|
||||||
|
with open(config_file, 'r') as f:
|
||||||
|
content = f.read().strip()
|
||||||
|
add_separator = len(content) > 0
|
||||||
|
print(f"Appending to existing file: {config_file}")
|
||||||
|
else:
|
||||||
|
print(f"Creating new file: {config_file}")
|
||||||
|
|
||||||
|
with open(config_file, 'a') as f:
|
||||||
|
if add_separator:
|
||||||
|
f.write("\n\n")
|
||||||
|
|
||||||
|
if add_header:
|
||||||
|
f.write(f"# Optimized configurations generated by optimize_config.py\n")
|
||||||
|
f.write(f"# Optimization metric: {self.optimization_metric}\n")
|
||||||
|
f.write(f"# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n")
|
||||||
|
|
||||||
|
for config in configs:
|
||||||
|
f.write(f"{config}\n")
|
||||||
|
|
||||||
|
if file_exists:
|
||||||
|
print(f"Appended {len(configs)} optimized configurations to {config_file}")
|
||||||
|
else:
|
||||||
|
print(f"Created {config_file} with {len(configs)} optimized configurations")
|
||||||
|
|
||||||
|
except PermissionError:
|
||||||
|
print(f"Error: Permission denied writing to {config_file}")
|
||||||
|
print("Try running with appropriate permissions or choose a different output location")
|
||||||
|
sys.exit(1)
|
||||||
|
except OSError as e:
|
||||||
|
print(f"Error: Cannot create/write to {config_file}: {e}")
|
||||||
|
print("Check that the path is valid and you have write permissions")
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Unexpected error writing to {config_file}: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data")
|
||||||
|
parser.add_argument("csv_file", help="Input CSV file with performance data")
|
||||||
|
parser.add_argument("-o", "--output", default="nccl_tuner.conf",
|
||||||
|
help="Output NCCL tuner config file (default: nccl_tuner.conf)")
|
||||||
|
parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'],
|
||||||
|
default='latency_us', help="Optimization metric (default: latency_us)")
|
||||||
|
parser.add_argument("--no-header", action="store_true",
|
||||||
|
help="Don't add header comments to output file")
|
||||||
|
parser.add_argument("--dry-run", action="store_true",
|
||||||
|
help="Print configurations without writing to file")
|
||||||
|
parser.add_argument("--no-auto-ranges", action="store_true",
|
||||||
|
help="Disable automatic size range determination (use default ranges)")
|
||||||
|
parser.add_argument("--size-ranges", type=str,
|
||||||
|
help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
optimizer = ConfigOptimizer(args.metric)
|
||||||
|
|
||||||
|
# Handle size range configuration
|
||||||
|
if args.size_ranges:
|
||||||
|
# Parse custom size ranges
|
||||||
|
try:
|
||||||
|
ranges = []
|
||||||
|
for range_str in args.size_ranges.split(','):
|
||||||
|
min_size, max_size = map(int, range_str.split('-'))
|
||||||
|
ranges.append((min_size, max_size))
|
||||||
|
optimizer.set_size_ranges(ranges)
|
||||||
|
print(f"Using custom size ranges: {ranges}")
|
||||||
|
except ValueError:
|
||||||
|
print("Error: Invalid size ranges format. Use 'min1-max1,min2-max2,...'")
|
||||||
|
sys.exit(1)
|
||||||
|
elif args.no_auto_ranges:
|
||||||
|
# Disable auto-ranging
|
||||||
|
optimizer.auto_size_ranges = False
|
||||||
|
print("Using default hardcoded size ranges")
|
||||||
|
else:
|
||||||
|
# Auto-ranging is enabled by default - creates one bucket per unique size
|
||||||
|
optimizer.auto_size_ranges = True
|
||||||
|
print("Auto-ranging enabled: will create one bucket per unique size in data")
|
||||||
|
|
||||||
|
# Load and optimize data
|
||||||
|
data = optimizer.load_data(args.csv_file)
|
||||||
|
if not data:
|
||||||
|
print("No valid data found in CSV file")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
configs = optimizer.optimize_configurations(data)
|
||||||
|
|
||||||
|
if args.dry_run:
|
||||||
|
print("\nGenerated configurations:")
|
||||||
|
for config in configs:
|
||||||
|
print(config)
|
||||||
|
else:
|
||||||
|
optimizer.append_to_config_file(configs, args.output, not args.no_header)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
24
ext-tuner/example/scripts/sample_performance_data.csv
Normal file
24
ext-tuner/example/scripts/sample_performance_data.csv
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us
|
||||||
|
allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5
|
||||||
|
allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8
|
||||||
|
allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2
|
||||||
|
allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1
|
||||||
|
allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3
|
||||||
|
allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4
|
||||||
|
allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2
|
||||||
|
allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1
|
||||||
|
allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7
|
||||||
|
allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1
|
||||||
|
broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2
|
||||||
|
broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1
|
||||||
|
broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5
|
||||||
|
broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2
|
||||||
|
broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1
|
||||||
|
broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3
|
||||||
|
allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4
|
||||||
|
allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6
|
||||||
|
allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1
|
||||||
|
allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3
|
||||||
|
allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2
|
||||||
|
allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1
|
||||||
|
allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3
|
|
30
ext-tuner/example/test/Makefile
Normal file
30
ext-tuner/example/test/Makefile
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#
|
||||||
|
# Makefile for NCCL Tuner Plugin Unit Tests
|
||||||
|
#
|
||||||
|
|
||||||
|
CC := gcc
|
||||||
|
CFLAGS := -Wall -Wextra -g -std=c99 -fPIC
|
||||||
|
INC := -I. -I../nccl
|
||||||
|
TARGET := test_plugin
|
||||||
|
SOURCES := test_plugin.c
|
||||||
|
|
||||||
|
# Default target
|
||||||
|
all: $(TARGET)
|
||||||
|
|
||||||
|
# Build the test executable
|
||||||
|
$(TARGET): $(SOURCES)
|
||||||
|
$(CC) $(CFLAGS) $(INC) -o $(TARGET) $(SOURCES)
|
||||||
|
|
||||||
|
# Run the tests
|
||||||
|
test: $(TARGET)
|
||||||
|
./$(TARGET) $(TEST_CASE)
|
||||||
|
|
||||||
|
# Run tests with verbose output
|
||||||
|
test-verbose: $(TARGET)
|
||||||
|
NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE)
|
||||||
|
|
||||||
|
# Clean build artifacts
|
||||||
|
clean:
|
||||||
|
rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf
|
||||||
|
|
||||||
|
.PHONY: all test test-verbose clean
|
205
ext-tuner/example/test/README.md
Normal file
205
ext-tuner/example/test/README.md
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
# NCCL Tuner Plugin Unit Tests
|
||||||
|
|
||||||
|
This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates.
|
||||||
|
|
||||||
|
## Test Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
test/
|
||||||
|
├── test_plugin.c # Main unit test file
|
||||||
|
├── Makefile # Build system for tests
|
||||||
|
└── README.md # This file
|
||||||
|
```
|
||||||
|
|
||||||
|
## Building and Running Tests
|
||||||
|
|
||||||
|
### Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build and run all tests
|
||||||
|
make test
|
||||||
|
|
||||||
|
# Or step by step
|
||||||
|
make # Build test executable
|
||||||
|
./test_plugin # Run tests
|
||||||
|
```
|
||||||
|
|
||||||
|
### Advanced Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run with memory leak detection (requires valgrind)
|
||||||
|
make test-memory
|
||||||
|
|
||||||
|
# Run with verbose logging
|
||||||
|
make test-verbose
|
||||||
|
|
||||||
|
# Generate code coverage report (requires gcov)
|
||||||
|
make coverage
|
||||||
|
|
||||||
|
# Create sample test configuration files
|
||||||
|
make test-configs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Test Coverage
|
||||||
|
|
||||||
|
The unit tests cover the following functionality:
|
||||||
|
|
||||||
|
### 1. **Plugin Initialization (`test_plugin_init`)**
|
||||||
|
- Tests successful plugin initialization
|
||||||
|
- Verifies context allocation
|
||||||
|
- Tests cleanup on destroy
|
||||||
|
|
||||||
|
### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)**
|
||||||
|
- Valid CSV format parsing
|
||||||
|
- Comment and empty line handling
|
||||||
|
- Invalid format graceful handling
|
||||||
|
- Environment variable configuration
|
||||||
|
|
||||||
|
### 3. **Collective Type Matching (`test_collective_matching`)**
|
||||||
|
- Correct matching of allreduce, broadcast, etc.
|
||||||
|
- Algorithm/protocol selection
|
||||||
|
- Channel configuration
|
||||||
|
|
||||||
|
### 4. **Size Range Matching (`test_size_matching`)**
|
||||||
|
- Small, medium, large message size handling
|
||||||
|
- Proper range boundary checking
|
||||||
|
- Multiple size-based configurations
|
||||||
|
|
||||||
|
### 5. **Topology Matching (`test_topology_matching`)**
|
||||||
|
- Single-node vs multi-node configurations
|
||||||
|
- Exact nNodes/nRanks matching
|
||||||
|
- Wildcard matching (-1 values)
|
||||||
|
|
||||||
|
### 6. **Default Channels (`test_default_channels`)**
|
||||||
|
- Proper handling of -1 channel specification
|
||||||
|
- Preservation of NCCL default behavior
|
||||||
|
|
||||||
|
### 7. **Registered Buffer Matching (`test_regbuff_matching`)**
|
||||||
|
- Configurations based on regBuff parameter
|
||||||
|
- Registered vs non-registered buffer handling
|
||||||
|
- Backward compatibility with configs missing regBuff
|
||||||
|
|
||||||
|
### 8. **Pipeline Operations Matching (`test_pipeops_matching`)**
|
||||||
|
- Configurations based on numPipeOps parameter
|
||||||
|
- Single vs multiple pipeline operation handling
|
||||||
|
- Backward compatibility with configs missing numPipeOps
|
||||||
|
|
||||||
|
### 9. **Fallback Behavior (`test_no_match_fallback`)**
|
||||||
|
- Default behavior when no config matches
|
||||||
|
- Ring/Simple algorithm fallback
|
||||||
|
|
||||||
|
## Test Output
|
||||||
|
|
||||||
|
Successful test run:
|
||||||
|
```
|
||||||
|
Running NCCL Tuner Plugin Unit Tests
|
||||||
|
=====================================
|
||||||
|
PASS: test_plugin_init
|
||||||
|
PASS: test_config_parsing_valid
|
||||||
|
PASS: test_config_parsing_invalid
|
||||||
|
PASS: test_collective_matching
|
||||||
|
PASS: test_size_matching
|
||||||
|
PASS: test_topology_matching
|
||||||
|
PASS: test_default_channels
|
||||||
|
PASS: test_regbuff_matching
|
||||||
|
PASS: test_pipeops_matching
|
||||||
|
PASS: test_no_match_fallback
|
||||||
|
|
||||||
|
=====================================
|
||||||
|
Test Results: 9/9 tests passed
|
||||||
|
All tests PASSED!
|
||||||
|
```
|
||||||
|
|
||||||
|
Failed test example:
|
||||||
|
```
|
||||||
|
FAIL: test_collective_matching - Tree/Simple should have low cost
|
||||||
|
Test Results: 8/9 tests passed
|
||||||
|
Some tests FAILED!
|
||||||
|
```
|
||||||
|
|
||||||
|
## Mock NCCL Implementation
|
||||||
|
|
||||||
|
The tests use the actual NCCL header files from the `../nccl/` directory:
|
||||||
|
|
||||||
|
- `tuner.h` - Complete NCCL tuner interface and type definitions
|
||||||
|
- `common.h` - Common NCCL types and logging functions
|
||||||
|
- `err.h` - NCCL error codes
|
||||||
|
|
||||||
|
This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation.
|
||||||
|
|
||||||
|
## Integration with CI/CD
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install tests for CI/CD pipeline
|
||||||
|
make install-test
|
||||||
|
|
||||||
|
# Run as part of automated testing
|
||||||
|
make test && echo "Tests passed" || echo "Tests failed"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Memory Testing
|
||||||
|
|
||||||
|
The tests can be run with valgrind for memory leak detection:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test-memory
|
||||||
|
```
|
||||||
|
|
||||||
|
This will detect:
|
||||||
|
- Memory leaks
|
||||||
|
- Invalid memory access
|
||||||
|
- Use of uninitialized memory
|
||||||
|
|
||||||
|
## Code Coverage
|
||||||
|
|
||||||
|
Generate code coverage reports to ensure comprehensive testing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make coverage
|
||||||
|
# Creates test_plugin.c.gcov with line-by-line coverage
|
||||||
|
```
|
||||||
|
|
||||||
|
## Adding New Tests
|
||||||
|
|
||||||
|
To add a new test:
|
||||||
|
|
||||||
|
1. Create a new test function in `test_plugin.c`:
|
||||||
|
```c
|
||||||
|
int test_new_feature() {
|
||||||
|
// Test setup
|
||||||
|
TEST_ASSERT(condition, "description");
|
||||||
|
// Test cleanup
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Add the test to the main function:
|
||||||
|
```c
|
||||||
|
total++; passed += test_new_feature();
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Rebuild and run:
|
||||||
|
```bash
|
||||||
|
make test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Debugging Tests
|
||||||
|
|
||||||
|
For debugging failed tests:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Compile with debug symbols
|
||||||
|
make CFLAGS="-g -O0 -DDEBUG"
|
||||||
|
|
||||||
|
# Run with gdb
|
||||||
|
gdb ./test_plugin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cleaning Up
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Remove all build artifacts and temporary files
|
||||||
|
make clean
|
||||||
|
```
|
||||||
|
|
||||||
|
This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases.
|
856
ext-tuner/example/test/test_plugin.c
Normal file
856
ext-tuner/example/test/test_plugin.c
Normal file
@ -0,0 +1,856 @@
|
|||||||
|
/*************************************************************************
|
||||||
|
* Unit tests for NCCL Tuner Plugin
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
|
||||||
|
|
||||||
|
// Include NCCL tuner header (which includes common.h and err.h)
|
||||||
|
#include "tuner.h"
|
||||||
|
|
||||||
|
// Include plugin source for testing
|
||||||
|
#include "../plugin.c"
|
||||||
|
|
||||||
|
// Test framework macros
|
||||||
|
#define TEST_ASSERT(condition, message) \
|
||||||
|
do { \
|
||||||
|
if (!(condition)) { \
|
||||||
|
printf("FAIL: %s - %s\n", __func__, message); \
|
||||||
|
return 0; \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define TEST_PASS() \
|
||||||
|
do { \
|
||||||
|
printf("PASS: %s\n", __func__); \
|
||||||
|
return 1; \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
// Global test state
|
||||||
|
static int test_log_count = 0;
|
||||||
|
|
||||||
|
// Mock logger function
|
||||||
|
void mock_logger(ncclDebugLogLevel level, unsigned long flags,
|
||||||
|
const char* file, int line, const char* fmt, ...) {
|
||||||
|
(void)flags; // Suppress unused parameter warning
|
||||||
|
test_log_count++;
|
||||||
|
|
||||||
|
// Check if we should print based on NCCL_DEBUG level
|
||||||
|
const char* debug_level = getenv("NCCL_DEBUG");
|
||||||
|
int should_print = 0;
|
||||||
|
|
||||||
|
if (debug_level) {
|
||||||
|
if (strcmp(debug_level, "TRACE") == 0) {
|
||||||
|
should_print = 1; // Print everything
|
||||||
|
} else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) {
|
||||||
|
should_print = 1; // Print INFO and below
|
||||||
|
} else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) {
|
||||||
|
should_print = 1; // Print WARN and below
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!should_print) return;
|
||||||
|
|
||||||
|
// Convert log level to string
|
||||||
|
const char* level_str;
|
||||||
|
switch(level) {
|
||||||
|
case NCCL_LOG_NONE: level_str = "NONE"; break;
|
||||||
|
case NCCL_LOG_VERSION: level_str = "VERSION"; break;
|
||||||
|
case NCCL_LOG_WARN: level_str = "WARN"; break;
|
||||||
|
case NCCL_LOG_INFO: level_str = "INFO"; break;
|
||||||
|
case NCCL_LOG_ABORT: level_str = "ABORT"; break;
|
||||||
|
case NCCL_LOG_TRACE: level_str = "TRACE"; break;
|
||||||
|
default: level_str = "UNKNOWN"; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print log header
|
||||||
|
printf("[TUNER:%s:%s:%d] ", level_str, file, line);
|
||||||
|
|
||||||
|
// Print formatted message
|
||||||
|
va_list args;
|
||||||
|
va_start(args, fmt);
|
||||||
|
vprintf(fmt, args);
|
||||||
|
va_end(args);
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to create test config file
|
||||||
|
void create_test_config(const char* filename, const char* content) {
|
||||||
|
FILE* f = fopen(filename, "w");
|
||||||
|
if (f) {
|
||||||
|
fprintf(f, "%s", content);
|
||||||
|
fclose(f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 1: Plugin initialization
|
||||||
|
int test_plugin_init() {
|
||||||
|
void* context = NULL;
|
||||||
|
|
||||||
|
// Test successful initialization
|
||||||
|
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
|
||||||
|
TEST_ASSERT(context != NULL, "Context should be allocated");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 2: Configuration file parsing - valid CSV
|
||||||
|
int test_config_parsing_valid() {
|
||||||
|
const char* test_config =
|
||||||
|
"# Test configuration\n"
|
||||||
|
"allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n"
|
||||||
|
"broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n"
|
||||||
|
"# Comment line\n"
|
||||||
|
"\n" // Empty line
|
||||||
|
"reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n";
|
||||||
|
|
||||||
|
create_test_config("test_valid.conf", test_config);
|
||||||
|
|
||||||
|
// Set environment variable to use our test config
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_valid.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 3: Configuration file parsing - invalid CSV
|
||||||
|
int test_config_parsing_invalid() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n"
|
||||||
|
"invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n"
|
||||||
|
"broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers
|
||||||
|
|
||||||
|
create_test_config("test_invalid.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
|
||||||
|
// Should still succeed but with no valid configs loaded
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_invalid.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 4: Collective type matching
|
||||||
|
int test_collective_matching() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n"
|
||||||
|
"broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n";
|
||||||
|
|
||||||
|
create_test_config("test_match.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context);
|
||||||
|
|
||||||
|
// Create mock cost table
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0; // Default high cost
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
|
||||||
|
// Test allreduce matching (should match first config)
|
||||||
|
ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||||
|
&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 8, "Should set 8 channels");
|
||||||
|
|
||||||
|
// Test broadcast matching (should match second config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0; // Reset costs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed");
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
|
||||||
|
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 4, "Should set 4 channels");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_match.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 5: Size range matching
|
||||||
|
int test_size_matching() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n"
|
||||||
|
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n"
|
||||||
|
"allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n";
|
||||||
|
|
||||||
|
create_test_config("test_size.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context);
|
||||||
|
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int nChannels = 1;
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||||
|
&cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels");
|
||||||
|
|
||||||
|
// Test medium message (should match second config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||||
|
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels");
|
||||||
|
|
||||||
|
// Test large message (should match third config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)",
|
||||||
|
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_size.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 6: Topology matching
|
||||||
|
int test_topology_matching() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only
|
||||||
|
"allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly
|
||||||
|
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology
|
||||||
|
|
||||||
|
create_test_config("test_topo.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1);
|
||||||
|
|
||||||
|
// Test with single node setup
|
||||||
|
void* context1 = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node
|
||||||
|
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
|
||||||
|
TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
|
||||||
|
|
||||||
|
pluginDestroy(context1);
|
||||||
|
|
||||||
|
// Test with 4 nodes, 32 ranks setup
|
||||||
|
void* context2 = NULL;
|
||||||
|
pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes
|
||||||
|
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config");
|
||||||
|
TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
unlink("test_topo.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 7: Default channels behavior (-1)
|
||||||
|
int test_default_channels() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels
|
||||||
|
|
||||||
|
create_test_config("test_default.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context);
|
||||||
|
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels = 99; // Set to known value
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol");
|
||||||
|
TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_default.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 8: regBuff matching
|
||||||
|
int test_regbuff_matching() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only
|
||||||
|
"allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only
|
||||||
|
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible)
|
||||||
|
|
||||||
|
create_test_config("test_regbuff.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context);
|
||||||
|
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
|
||||||
|
// Test registered buffer (should match first config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
1, &nChannels); // regBuff = 1 (registered)
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels");
|
||||||
|
|
||||||
|
// Test non-registered buffer (should match second config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels); // regBuff = 0 (non-registered)
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels");
|
||||||
|
|
||||||
|
// Test backward compatibility - config without regBuff should match any regBuff value
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// First try with regBuff=2 (unusual value, should match third config)
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
2, &nChannels); // regBuff = 2 (only third config should match)
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_regbuff.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 9: numPipeOps matching
|
||||||
|
int test_pipeops_matching() {
|
||||||
|
const char* test_config =
|
||||||
|
"allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op
|
||||||
|
"allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops
|
||||||
|
"allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible)
|
||||||
|
|
||||||
|
create_test_config("test_pipeops.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context);
|
||||||
|
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
|
||||||
|
// Test single pipeline op (should match first config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels");
|
||||||
|
|
||||||
|
// Test multiple pipeline ops (should match second config)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels");
|
||||||
|
|
||||||
|
// Test different number of pipeline ops (should match third config - backward compatible)
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any pipeOps: Ring/LL128 should have low cost");
|
||||||
|
TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_pipeops.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 10: No matching configuration (fallback behavior)
|
||||||
|
int test_no_match_fallback() {
|
||||||
|
const char* test_config =
|
||||||
|
"broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config
|
||||||
|
|
||||||
|
create_test_config("test_fallback.conf", test_config);
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
pluginInit(8, 1, mock_logger, &context);
|
||||||
|
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
// Try allreduce (should not match, use fallback)
|
||||||
|
pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)",
|
||||||
|
&cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
|
||||||
|
TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified");
|
||||||
|
TEST_ASSERT(nChannels == 1, "Should use default channels");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink("test_fallback.conf");
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 11: Large configuration files (testing dynamic allocation)
|
||||||
|
int test_large_config() {
|
||||||
|
const char* large_config_file = "test_large.conf";
|
||||||
|
|
||||||
|
// Create a large configuration file with many entries
|
||||||
|
// This tests the dynamic allocation functionality
|
||||||
|
FILE* f = fopen(large_config_file, "w");
|
||||||
|
TEST_ASSERT(f != NULL, "Should be able to create large config file");
|
||||||
|
|
||||||
|
// Write header comment
|
||||||
|
fprintf(f, "# Large configuration file for testing dynamic allocation\n");
|
||||||
|
fprintf(f, "# This file contains many configurations to test memory allocation\n");
|
||||||
|
|
||||||
|
// Generate a large number of configurations (much more than the old MAX_CONFIGS=100)
|
||||||
|
const int num_configs = 500; // 5x the old static limit
|
||||||
|
const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"};
|
||||||
|
const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"};
|
||||||
|
const char* protocols[] = {"simple", "ll", "ll128"};
|
||||||
|
|
||||||
|
for (int i = 0; i < num_configs; i++) {
|
||||||
|
// Vary the configurations to create realistic test data
|
||||||
|
const char* coll = collectives[i % 5];
|
||||||
|
const char* algo = algorithms[i % 4];
|
||||||
|
const char* proto = protocols[i % 3];
|
||||||
|
|
||||||
|
size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB
|
||||||
|
size_t max_bytes = min_bytes + 65536; // 64KB range
|
||||||
|
int channels = (i % 8) + 1; // 1-8 channels
|
||||||
|
int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes
|
||||||
|
int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks
|
||||||
|
int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps
|
||||||
|
int regBuff = (i % 3) == 0 ? -1 : (i % 2); // Mix of -1, 0, 1
|
||||||
|
|
||||||
|
fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n",
|
||||||
|
coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(f);
|
||||||
|
|
||||||
|
// Set environment to use our large config file
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1);
|
||||||
|
|
||||||
|
// Initialize plugin with large config
|
||||||
|
void* context = NULL;
|
||||||
|
ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
|
||||||
|
TEST_ASSERT(context != NULL, "Context should be allocated");
|
||||||
|
|
||||||
|
// Verify that configurations were loaded
|
||||||
|
TunerContext* ctx = (TunerContext*)context;
|
||||||
|
TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file");
|
||||||
|
TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size");
|
||||||
|
TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated");
|
||||||
|
|
||||||
|
// Test that we can access configurations throughout the array
|
||||||
|
// (This would have failed with the old static MAX_CONFIGS=100 limit)
|
||||||
|
for (int i = 0; i < ctx->numConfigs; i++) {
|
||||||
|
TuningConfig* config = &ctx->configs[i];
|
||||||
|
// Basic sanity checks on the loaded configurations
|
||||||
|
TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce,
|
||||||
|
"Collective type should be valid");
|
||||||
|
TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes");
|
||||||
|
TEST_ASSERT(config->nChannels > 0, "nChannels should be positive");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test specific configuration access at various indices
|
||||||
|
// Index 0 (first config)
|
||||||
|
TuningConfig* first_config = &ctx->configs[0];
|
||||||
|
TEST_ASSERT(first_config != NULL, "First config should be accessible");
|
||||||
|
|
||||||
|
// Index in middle
|
||||||
|
TuningConfig* mid_config = &ctx->configs[num_configs / 2];
|
||||||
|
TEST_ASSERT(mid_config != NULL, "Middle config should be accessible");
|
||||||
|
|
||||||
|
// Index near end (this would have crashed with static array of 100)
|
||||||
|
TuningConfig* late_config = &ctx->configs[num_configs - 1];
|
||||||
|
TEST_ASSERT(late_config != NULL, "Last config should be accessible");
|
||||||
|
|
||||||
|
// Test memory allocation size - verify we didn't over-allocate
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs);
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"Memory allocated for %d configurations (%zu bytes total)",
|
||||||
|
ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig));
|
||||||
|
|
||||||
|
// Test that the plugin can still find matching configurations from the large set
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0; // Default high cost
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
// Try to find a matching configuration - should work with large config set
|
||||||
|
result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink(large_config_file);
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 12: Very large configuration stress test
|
||||||
|
int test_very_large_config_stress() {
|
||||||
|
const char* stress_config_file = "test_stress.conf";
|
||||||
|
|
||||||
|
// Create an even larger configuration file to stress test the implementation
|
||||||
|
FILE* f = fopen(stress_config_file, "w");
|
||||||
|
TEST_ASSERT(f != NULL, "Should be able to create stress test config file");
|
||||||
|
|
||||||
|
fprintf(f, "# Stress test configuration with very large number of entries\n");
|
||||||
|
|
||||||
|
// Generate an extremely large number of configurations
|
||||||
|
const int stress_configs = 2000; // 20x the old static limit
|
||||||
|
|
||||||
|
for (int i = 0; i < stress_configs; i++) {
|
||||||
|
// Create varied but valid configurations
|
||||||
|
fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n",
|
||||||
|
i * 512, (i * 512) + 1024);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(f);
|
||||||
|
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1);
|
||||||
|
|
||||||
|
// Test initialization with stress config
|
||||||
|
void* context = NULL;
|
||||||
|
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
|
||||||
|
|
||||||
|
TunerContext* ctx = (TunerContext*)context;
|
||||||
|
TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations");
|
||||||
|
TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated");
|
||||||
|
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"Stress test - loaded %d configurations successfully", stress_configs);
|
||||||
|
mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__,
|
||||||
|
"Memory usage: %zu bytes for configuration array",
|
||||||
|
stress_configs * sizeof(TuningConfig));
|
||||||
|
|
||||||
|
// Verify we can access configurations throughout the entire range
|
||||||
|
for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config
|
||||||
|
TuningConfig* config = &ctx->configs[i];
|
||||||
|
TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type");
|
||||||
|
TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink(stress_config_file);
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 13: Edge case - empty config file
|
||||||
|
int test_empty_config() {
|
||||||
|
const char* empty_config_file = "test_empty.conf";
|
||||||
|
|
||||||
|
// Create empty config file (only comments)
|
||||||
|
create_test_config(empty_config_file,
|
||||||
|
"# Empty configuration file\n"
|
||||||
|
"# No actual configurations\n"
|
||||||
|
"\n"
|
||||||
|
"\n");
|
||||||
|
|
||||||
|
setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
|
||||||
|
|
||||||
|
void* context = NULL;
|
||||||
|
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
|
||||||
|
|
||||||
|
TunerContext* ctx = (TunerContext*)context;
|
||||||
|
TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations");
|
||||||
|
TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations");
|
||||||
|
TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config");
|
||||||
|
|
||||||
|
// Test that plugin still works with no configurations (fallback behavior)
|
||||||
|
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||||
|
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||||
|
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||||
|
cost_table_ptr[i] = cost_table[i];
|
||||||
|
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||||
|
cost_table[i][j] = 1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nChannels;
|
||||||
|
result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1,
|
||||||
|
cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||||
|
0, &nChannels);
|
||||||
|
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
pluginDestroy(context);
|
||||||
|
unlink(empty_config_file);
|
||||||
|
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||||
|
|
||||||
|
TEST_PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test runner function pointer type
|
||||||
|
typedef int (*TestFunction)(void);
|
||||||
|
|
||||||
|
// Test registry
|
||||||
|
typedef struct {
|
||||||
|
const char* name;
|
||||||
|
TestFunction func;
|
||||||
|
const char* description;
|
||||||
|
} TestCase;
|
||||||
|
|
||||||
|
// All available tests
|
||||||
|
TestCase test_cases[] = {
|
||||||
|
{"init", test_plugin_init, "Plugin initialization"},
|
||||||
|
{"config-valid", test_config_parsing_valid, "Valid configuration parsing"},
|
||||||
|
{"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"},
|
||||||
|
{"collective", test_collective_matching, "Collective type matching"},
|
||||||
|
{"size", test_size_matching, "Size range matching"},
|
||||||
|
{"topology", test_topology_matching, "Topology matching"},
|
||||||
|
{"channels", test_default_channels, "Default channels behavior"},
|
||||||
|
{"regbuff", test_regbuff_matching, "Registered buffer matching"},
|
||||||
|
{"pipeops", test_pipeops_matching, "Pipeline operations matching"},
|
||||||
|
{"fallback", test_no_match_fallback, "Fallback behavior"},
|
||||||
|
{"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
|
||||||
|
{"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
|
||||||
|
{"empty-config", test_empty_config, "Empty configuration file handling"},
|
||||||
|
{NULL, NULL, NULL} // End marker
|
||||||
|
};
|
||||||
|
|
||||||
|
// Show help/usage information
|
||||||
|
void show_help(const char* program_name) {
|
||||||
|
printf("Usage: %s [test_name ...]\n\n", program_name);
|
||||||
|
printf("Available tests:\n");
|
||||||
|
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||||
|
printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description);
|
||||||
|
}
|
||||||
|
printf("\nExamples:\n");
|
||||||
|
printf(" %s # Run all tests\n", program_name);
|
||||||
|
printf(" %s init # Run only initialization test\n", program_name);
|
||||||
|
printf(" %s init collective # Run initialization and collective tests\n", program_name);
|
||||||
|
printf(" %s --help # Show this help\n", program_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find test by name
|
||||||
|
TestFunction find_test(const char* name) {
|
||||||
|
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||||
|
if (strcmp(test_cases[i].name, name) == 0) {
|
||||||
|
return test_cases[i].func;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Main test runner
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
int passed = 0, total = 0;
|
||||||
|
|
||||||
|
// Check for help
|
||||||
|
if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) {
|
||||||
|
show_help(argv[0]);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Running NCCL Tuner Plugin Unit Tests\n");
|
||||||
|
printf("=====================================\n");
|
||||||
|
|
||||||
|
if (argc == 1) {
|
||||||
|
// No arguments - run all tests
|
||||||
|
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||||
|
total++;
|
||||||
|
passed += test_cases[i].func();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Run specific tests
|
||||||
|
for (int arg = 1; arg < argc; arg++) {
|
||||||
|
TestFunction test_func = find_test(argv[arg]);
|
||||||
|
if (test_func) {
|
||||||
|
total++;
|
||||||
|
passed += test_func();
|
||||||
|
} else {
|
||||||
|
printf("ERROR: Unknown test '%s'\n", argv[arg]);
|
||||||
|
printf("Use --help to see available tests\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n=====================================\n");
|
||||||
|
printf("Test Results: %d/%d tests passed\n", passed, total);
|
||||||
|
|
||||||
|
if (passed == total) {
|
||||||
|
printf("All tests PASSED!\n");
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
printf("Some tests FAILED!\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
@ -40,10 +40,12 @@ ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
|
|||||||
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
|
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
|
||||||
endif
|
endif
|
||||||
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
||||||
|
CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
|
||||||
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
||||||
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
|
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
|
||||||
CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \
|
CUDA12_8_GENCODE = -gencode=arch=compute_100,code=sm_100 \
|
||||||
-gencode=arch=compute_120,code=sm_120
|
-gencode=arch=compute_120,code=sm_120
|
||||||
|
CUDA13_GENCODE = -gencode=arch=compute_110,code=sm_110
|
||||||
|
|
||||||
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
||||||
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
||||||
@ -53,10 +55,10 @@ CUDA13_PTX = -gencode=arch=compute_120,code=compute_120
|
|||||||
|
|
||||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
|
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
|
||||||
# Prior to SM75 is deprecated from CUDA13.0 onwards
|
# Prior to SM75 is deprecated from CUDA13.0 onwards
|
||||||
NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
|
NVCC_GENCODE ?= $(CUDA10_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
|
||||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
|
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
|
||||||
# Include Blackwell support if we're using CUDA12.8 or above
|
# Include Blackwell support if we're using CUDA12.8 or above
|
||||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
|
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_PTX)
|
||||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
||||||
# Include Hopper support if we're using CUDA11.8 or above
|
# Include Hopper support if we're using CUDA11.8 or above
|
||||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
|
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
##### version
|
##### version
|
||||||
NCCL_MAJOR := 2
|
NCCL_MAJOR := 2
|
||||||
NCCL_MINOR := 27
|
NCCL_MINOR := 27
|
||||||
NCCL_PATCH := 3
|
NCCL_PATCH := 5
|
||||||
NCCL_SUFFIX :=
|
NCCL_SUFFIX :=
|
||||||
PKG_REVISION := 1
|
PKG_REVISION := 1
|
||||||
|
@ -36,9 +36,8 @@ define COMPILE
|
|||||||
$(call COMPILE$(or $3,$(suffix $2)),$1,$2)
|
$(call COMPILE$(or $3,$(suffix $2)),$1,$2)
|
||||||
endef
|
endef
|
||||||
|
|
||||||
ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1)
|
ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12090))"),1)
|
||||||
NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \
|
NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100f,code=sm_100f
|
||||||
-gencode=arch=compute_120a,code=sm_120a
|
|
||||||
else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1)
|
else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1)
|
||||||
NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a
|
NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a
|
||||||
else
|
else
|
||||||
|
@ -1009,7 +1009,7 @@ struct Apply_LoadMultimem {
|
|||||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4)
|
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210
|
#if NCCL_CUDA_ARCH_SPECIFIC == 1000 || NCCL_CUDA_ARCH_SPECIFIC == 1010 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210
|
||||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4)
|
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4)
|
||||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4)
|
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4)
|
||||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4)
|
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4)
|
||||||
|
@ -108,7 +108,7 @@ def required_cuda(k):
|
|||||||
if k.algo in ldmc_algos:
|
if k.algo in ldmc_algos:
|
||||||
cudart = 12070
|
cudart = 12070
|
||||||
arch = None
|
arch = None
|
||||||
specific_sms = [100, 120]
|
specific_sms = ["100a", "101a", "100f", "101f", "120a", "121a"]
|
||||||
return (cudart, arch, specific_sms)
|
return (cudart, arch, specific_sms)
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
@ -145,7 +145,7 @@ def kernel_conds(k):
|
|||||||
if not specific_sms:
|
if not specific_sms:
|
||||||
arch_cond = "__CUDA_ARCH__ >= %d"%arch
|
arch_cond = "__CUDA_ARCH__ >= %d"%arch
|
||||||
else:
|
else:
|
||||||
arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms])
|
arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_%sSPECIFIC==%d"%("FAMILY_" if sm[-1] == "f" else "", 10*int(sm.replace('a', '').replace('f', ''))) for sm in specific_sms])
|
||||||
return cudart_cond, arch_cond
|
return cudart_cond, arch_cond
|
||||||
|
|
||||||
def instantiate(k):
|
def instantiate(k):
|
||||||
|
@ -175,6 +175,13 @@ ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int mergePathType(int type0, int type1){
|
||||||
|
int max = std::max(type0,type1);
|
||||||
|
int min = std::min(type0,type1);
|
||||||
|
if(max == PATH_PHB && min == PATH_C2C) return PATH_P2C;
|
||||||
|
else return max;
|
||||||
|
}
|
||||||
|
|
||||||
static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
|
static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
|
||||||
struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix;
|
struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix;
|
||||||
struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
|
struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
|
||||||
@ -187,7 +194,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix,
|
|||||||
|
|
||||||
// Update path characteristics
|
// Update path characteristics
|
||||||
srcNode->paths[t2][i2].count = l;
|
srcNode->paths[t2][i2].count = l;
|
||||||
srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
|
srcNode->paths[t2][i2].type = mergePathType(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
|
||||||
if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
|
if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
|
||||||
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
|
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
@ -674,9 +681,9 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
|||||||
int c;
|
int c;
|
||||||
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
||||||
if (c == -1) continue;
|
if (c == -1) continue;
|
||||||
if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) {
|
if (mergePathType(gpuNode->paths[CPU][c].type, netNode->paths[CPU][c].type) == PATH_P2C) {
|
||||||
gpuNode->paths[NET][n].type = PATH_P2C;
|
gpuNode->paths[NET][n].type = std::min(PATH_P2C, gpuNode->paths[NET][n].type);
|
||||||
netNode->paths[GPU][g].type = PATH_P2C;
|
netNode->paths[GPU][g].type = std::min(PATH_P2C, netNode->paths[GPU][g].type);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -695,16 +702,15 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
|||||||
// PXN = PCI + NVLink.
|
// PXN = PCI + NVLink.
|
||||||
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
|
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
|
||||||
// Only use PXN for NIC n if remote GPU p ...
|
// Only use PXN for NIC n if remote GPU p ...
|
||||||
if (/* (1) is either connected to the NIC with PXB*/
|
int pxnType = ncclParamPxnC2c() ? PATH_P2C : PATH_PXB;
|
||||||
(peerNode->paths[NET][n].type <= PATH_PXB ||
|
if (/* (1) is connected to the NIC with PxN type*/
|
||||||
/* or with P2C and PxN over C2C is enabled */
|
peerNode->paths[NET][n].type <= pxnType &&
|
||||||
(ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) &&
|
|
||||||
/* and (2) is connected to us through NVLink */
|
/* and (2) is connected to us through NVLink */
|
||||||
peerNode->paths[GPU][g].type <= PATH_NVL &&
|
peerNode->paths[GPU][g].type <= PATH_NVL &&
|
||||||
/* and (3) is on the same node as us */
|
/* and (3) is on the same node as us */
|
||||||
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) &&
|
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) &&
|
||||||
/* and (4) has either higher bw to that NIC or avoid going through the CPU*/
|
/* and (4) has either higher bw to that NIC or avoid going through the CPU*/
|
||||||
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB))
|
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType))
|
||||||
// We can use that GPU as relay to communicate with that NIC.
|
// We can use that GPU as relay to communicate with that NIC.
|
||||||
// Only enabling it in the GPU->NIC direction for now to favor
|
// Only enabling it in the GPU->NIC direction for now to favor
|
||||||
// receiving locally and sending remotely (consistent with net.cc)
|
// receiving locally and sending remotely (consistent with net.cc)
|
||||||
@ -725,6 +731,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pre-compute NET local gpus to accelerate search
|
||||||
|
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||||
|
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||||
|
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &net->net.localGpu));
|
||||||
|
}
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -437,25 +437,46 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build a sorted list of the NETs to try.
|
// Add the preferred NICs ordered by GPU first
|
||||||
//
|
static ncclResult_t ncclTopoPrefNetsGpuFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) {
|
||||||
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
|
const int nGpus = (gpu == -1) ? system->nodes[GPU].count : 1;
|
||||||
// index when trying to get back to the NIC.
|
int gpuCount = nGpus;
|
||||||
//
|
int gpuIds[NCCL_TOPO_MAX_NODES] = {gpu};
|
||||||
// The list is built the following way:
|
int firstNets[NCCL_TOPO_MAX_NODES];
|
||||||
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
|
if (gpu == -1)
|
||||||
// 2. add other NETs satisfying typeInter but not already in the list.
|
for (int g = 0; g < nGpus; g++) gpuIds[g] = g;
|
||||||
|
|
||||||
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
|
for (int c = 0; c < MAXCHANNELS; c++) {
|
||||||
ncclResult_t ret = ncclSuccess;
|
for (int g = 0; g < nGpus; g++) {
|
||||||
int netCount = 0;
|
if (gpuIds[g] == -1) continue;
|
||||||
int localNetCount;
|
int localNet;
|
||||||
int localNets[MAXCHANNELS];
|
int64_t netId;
|
||||||
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes + gpuIds[g];
|
||||||
|
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
|
||||||
|
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &localNet));
|
||||||
|
// store the first net found for each GPU in case of duplicates
|
||||||
|
if(c == 0) firstNets[g] = localNet;
|
||||||
|
// if the NET has already been returned for channel 0, that GPU is done
|
||||||
|
if (c > 0 && firstNets[g] == localNet) {
|
||||||
|
gpuIds[g] = -1;
|
||||||
|
gpuCount--;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// only add it to the list if it doesn't already exist
|
||||||
|
int found = 0;
|
||||||
|
while (found < (*netCount) && nets[found] != localNet) found++;
|
||||||
|
if (found == (*netCount)) nets[(*netCount)++] = localNet;
|
||||||
|
}
|
||||||
|
if (gpuCount == 0) break;
|
||||||
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
// First add the preferred NICs
|
// Add the preferred NICs ordered by channels first
|
||||||
|
static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) {
|
||||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||||
if (gpu != -1 && gpu != g) continue;
|
if (gpu != -1 && gpu != g) continue;
|
||||||
localNetCount = 0;
|
int localNetCount = 0, localNets[MAXCHANNELS];
|
||||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g;
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g;
|
||||||
for (int c = 0; c < MAXCHANNELS; c++) {
|
for (int c = 0; c < MAXCHANNELS; c++) {
|
||||||
int64_t netId;
|
int64_t netId;
|
||||||
@ -468,16 +489,40 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
|||||||
for (int i = 0; i < localNetCount; i++) {
|
for (int i = 0; i < localNetCount; i++) {
|
||||||
int n = localNets[i];
|
int n = localNets[i];
|
||||||
int found = 0;
|
int found = 0;
|
||||||
while (found<netCount && nets[found] != n) found++;
|
while (found < (*netCount) && nets[found] != n) found++;
|
||||||
if (found == netCount) nets[netCount++] = n;
|
if (found == (*netCount)) nets[(*netCount)++] = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return ncclSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build a sorted list of the NETs to try.
|
||||||
|
//
|
||||||
|
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
|
||||||
|
// index when trying to get back to the NIC.
|
||||||
|
//
|
||||||
|
// The list is built the following way:
|
||||||
|
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
|
||||||
|
// 2. add other NETs satisfying typeInter but not already in the list.
|
||||||
|
NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1);
|
||||||
|
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
|
||||||
|
ncclResult_t ret = ncclSuccess;
|
||||||
|
int netCount = 0;
|
||||||
|
|
||||||
|
// First add the preferred NETs.
|
||||||
|
if (system->nHosts > 1 && ncclParamScatterEnable()) {
|
||||||
|
// For MNNVL systems, we sort the devices by GPU first, then by channel
|
||||||
|
NCCLCHECK(ncclTopoPrefNetsGpuFirst(system, gpu, nets, &netCount));
|
||||||
|
} else {
|
||||||
|
// For other systems, we sort the devices by channel first, then by GPU
|
||||||
|
NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount));
|
||||||
|
}
|
||||||
|
|
||||||
// Then add others satisfying typeInter
|
// Then add others satisfying typeInter
|
||||||
for (int t=0; t <= typeInter; t++) {
|
for (int t=0; t <= typeInter; t++) {
|
||||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||||
if (gpu != -1 && gpu != g) continue;
|
if (gpu != -1 && gpu != g) continue;
|
||||||
localNetCount = 0;
|
int localNetCount = 0, localNets[MAXCHANNELS];
|
||||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||||
struct ncclTopoLinkList* paths = gpu->paths[NET];
|
struct ncclTopoLinkList* paths = gpu->paths[NET];
|
||||||
for (int n=0; n<system->nodes[NET].count && n<MAXCHANNELS; n++) {
|
for (int n=0; n<system->nodes[NET].count && n<MAXCHANNELS; n++) {
|
||||||
@ -625,8 +670,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
|||||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
|
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
|
||||||
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
|
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
|
||||||
if (graph->nChannels < netCount) {
|
if (graph->nChannels < netCount) {
|
||||||
int gpu;
|
int gpu = net->net.localGpu;
|
||||||
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
|
|
||||||
if (gpu != -1) {
|
if (gpu != -1) {
|
||||||
int duplicate = 0;
|
int duplicate = 0;
|
||||||
// check whether there is duplicate head when one GPU connects with multiple NICs
|
// check whether there is duplicate head when one GPU connects with multiple NICs
|
||||||
@ -643,13 +687,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (graph->nChannels > 0) {
|
if (graph->nChannels > 0 && graph->sameChannels == 1) {
|
||||||
// Try to replay the last channel
|
// Try to replay the last channel
|
||||||
int g;
|
int g;
|
||||||
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
|
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
|
||||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
|
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
|
||||||
}
|
} else {
|
||||||
if (graph->nChannels == 0 || graph->sameChannels == 0) {
|
|
||||||
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
|
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
|
||||||
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
|
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
|
||||||
int t = 1 << 10;
|
int t = 1 << 10;
|
||||||
@ -658,11 +701,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Then try the most local GPUs
|
// Then try the most local GPUs
|
||||||
|
int localGpu = net->net.localGpu;
|
||||||
|
if (localGpu != -1) {
|
||||||
|
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpu));
|
||||||
|
}
|
||||||
int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType;
|
int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType;
|
||||||
NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType));
|
NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType));
|
||||||
// if no GPUs are connected, skip this net
|
// if no GPUs are connected, skip this net
|
||||||
if (pathType == PATH_DIS) continue;
|
if (pathType == PATH_DIS) continue;
|
||||||
for (int g = 0; g < localGpuCount; ++g) {
|
for (int g = 0; g < localGpuCount; ++g) {
|
||||||
|
if (localGpus[g] == localGpu) continue; // We already tried this one
|
||||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g]));
|
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -749,8 +797,8 @@ struct kvDict kvDictLinkType[] = {
|
|||||||
{ "NVB", PATH_NVB },
|
{ "NVB", PATH_NVB },
|
||||||
{ "PIX", PATH_PIX },
|
{ "PIX", PATH_PIX },
|
||||||
{ "PXB", PATH_PXB },
|
{ "PXB", PATH_PXB },
|
||||||
{ "PXN", PATH_PXN },
|
|
||||||
{ "P2C", PATH_P2C },
|
{ "P2C", PATH_P2C },
|
||||||
|
{ "PXN", PATH_PXN },
|
||||||
{ "PHB", PATH_PHB },
|
{ "PHB", PATH_PHB },
|
||||||
{ "SYS", PATH_SYS },
|
{ "SYS", PATH_SYS },
|
||||||
{ NULL, 0 }
|
{ NULL, 0 }
|
||||||
@ -798,8 +846,10 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
|
|||||||
NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
|
NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
|
||||||
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
|
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
|
||||||
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
|
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
|
||||||
if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0;
|
|
||||||
const char* str;
|
const char* str;
|
||||||
|
NCCLCHECK(xmlGetAttr(xmlGraph, "latencyinter", &str));
|
||||||
|
if (!str) INFO(NCCL_GRAPH, "latencyinter not found in graph, using 0.0");
|
||||||
|
graph->latencyInter = str ? strtof(str, NULL) : 0.0;
|
||||||
NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
|
NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
|
||||||
NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
|
NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
|
||||||
NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
|
NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
|
||||||
@ -910,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0,
|
|||||||
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
|
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
|
||||||
|
|
||||||
float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 };
|
float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 };
|
||||||
float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||||
#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float))
|
#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float))
|
||||||
#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float))
|
#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float))
|
||||||
|
|
||||||
@ -1136,8 +1186,12 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
|||||||
offset = strlen(line);
|
offset = strlen(line);
|
||||||
}
|
}
|
||||||
for (int i=0; i<ngpus; i++) {
|
for (int i=0; i<ngpus; i++) {
|
||||||
sprintf(line+offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
|
int g;
|
||||||
|
ncclTopoRankToIndex(system, graph->intra[ngpus * c + i], &g, true);
|
||||||
|
int64_t topoId = system->nodes[GPU].nodes[g].id;
|
||||||
|
sprintf(line + offset, " %s/%lx-%lx", topoNodeTypeStr[GPU], NCCL_TOPO_ID_SYSTEM_ID(topoId), NCCL_TOPO_ID_LOCAL_ID(topoId));
|
||||||
offset = strlen(line);
|
offset = strlen(line);
|
||||||
|
if (graph->id == 3) break; // NVLS graphs only use the first GPU
|
||||||
}
|
}
|
||||||
if (system->nodes[NET].count > 0) {
|
if (system->nodes[NET].count > 0) {
|
||||||
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1]));
|
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1]));
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
|
|
||||||
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
||||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" };
|
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" };
|
||||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" };
|
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "P2C", "PXN", "PHB", "SYS", "NET", "DIS" };
|
||||||
|
|
||||||
/******************************************************************/
|
/******************************************************************/
|
||||||
/******************* Graph Creation Functions *********************/
|
/******************* Graph Creation Functions *********************/
|
||||||
@ -677,7 +677,14 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
|
|||||||
struct ncclXmlNode* node = topNode->subs[s];
|
struct ncclXmlNode* node = topNode->subs[s];
|
||||||
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
|
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
|
||||||
}
|
}
|
||||||
for (int systemId=0; systemId<system->nHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId;
|
|
||||||
|
int systemId = 0;
|
||||||
|
while (systemId < system->nHosts && system->hostHashes[systemId] != localHostHash) systemId++;
|
||||||
|
system->systemId = systemId;
|
||||||
|
if(systemId == system->nHosts){
|
||||||
|
WARN("localHostHash = 0x%lx not found in the list of system hostHashes",localHostHash);
|
||||||
|
return ncclInvalidArgument;
|
||||||
|
}
|
||||||
|
|
||||||
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
|
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
|
||||||
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
|
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
|
||||||
@ -1143,8 +1150,8 @@ struct kvDict nicPathKvList[] = {
|
|||||||
{ "PORT", PATH_PORT },
|
{ "PORT", PATH_PORT },
|
||||||
{ "PIX", PATH_PIX },
|
{ "PIX", PATH_PIX },
|
||||||
{ "PXB", PATH_PXB },
|
{ "PXB", PATH_PXB },
|
||||||
{ "PXN", PATH_PXN },
|
|
||||||
{ "P2C", PATH_P2C },
|
{ "P2C", PATH_P2C },
|
||||||
|
{ "PXN", PATH_PXN },
|
||||||
{ "PHB", PATH_PHB },
|
{ "PHB", PATH_PHB },
|
||||||
{ "SYS", PATH_SYS },
|
{ "SYS", PATH_SYS },
|
||||||
{ NULL, 0 }
|
{ NULL, 0 }
|
||||||
@ -1421,7 +1428,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Only update our topo tracking structure if we aren't dumping (separate steps)
|
// Only update our topo tracking structure if we aren't dumping (separate steps)
|
||||||
if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
|
if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, getHostHash()), ret, fail);
|
||||||
|
|
||||||
exit:
|
exit:
|
||||||
if (!comm->MNNVL && localRanks) free(localRanks);
|
if (!comm->MNNVL && localRanks) free(localRanks);
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
#define SM80_NVLINK_BW 20.0
|
#define SM80_NVLINK_BW 20.0
|
||||||
#define SM90_NVLINK_BW 20.6
|
#define SM90_NVLINK_BW 20.6
|
||||||
#define SM86_NVLINK_BW 12.0
|
#define SM86_NVLINK_BW 12.0
|
||||||
#define SM100_NVLINK_BW 40.0
|
#define SM100_NVLINK_BW 40.1
|
||||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||||
#define AMD_BW 16.0
|
#define AMD_BW 16.0
|
||||||
#define BDW_QPI_BW 6.0
|
#define BDW_QPI_BW 6.0
|
||||||
@ -76,11 +76,11 @@ extern const char* topoLinkTypeStr[];
|
|||||||
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
|
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
|
||||||
#define PATH_PXB 5
|
#define PATH_PXB 5
|
||||||
|
|
||||||
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
|
|
||||||
#define PATH_PXN 6
|
|
||||||
|
|
||||||
// Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC
|
// Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC
|
||||||
#define PATH_P2C 7
|
#define PATH_P2C 6
|
||||||
|
|
||||||
|
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
|
||||||
|
#define PATH_PXN 7
|
||||||
|
|
||||||
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
||||||
#define PATH_PHB 8
|
#define PATH_PHB 8
|
||||||
@ -143,6 +143,7 @@ struct ncclTopoNode {
|
|||||||
int gdrSupport;
|
int gdrSupport;
|
||||||
int collSupport;
|
int collSupport;
|
||||||
int maxChannels;
|
int maxChannels;
|
||||||
|
int localGpu;
|
||||||
}net;
|
}net;
|
||||||
struct {
|
struct {
|
||||||
int arch;
|
int arch;
|
||||||
|
@ -455,9 +455,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||||
int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p];
|
int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p];
|
||||||
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
|
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
|
||||||
// Enable LL128 by default only on Volta/Ampere/Hopper/Blackwell+NVLink. Other cases are not tested and may cause silent data corruption.
|
|
||||||
pEnable = 1;
|
pEnable = 1;
|
||||||
pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? PATH_P2C : PATH_PXN)));
|
if (ncclParamLl128C2c() && minCompCap >= 90) {
|
||||||
|
// Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN.
|
||||||
|
pEnable &= (graphs[a]->typeInter <= PATH_PXN);
|
||||||
|
} else {
|
||||||
|
// Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links.
|
||||||
|
pEnable &= (graphs[a]->typeInter <= PATH_PXB);
|
||||||
|
if (!ncclParamLl128C2c() && minCompCap >= 90)
|
||||||
|
INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). This ensures that no C2C link will be used by LL128.");
|
||||||
|
}
|
||||||
pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
|
pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
|
||||||
pEnable &= (minCompCap == maxCompCap);
|
pEnable &= (minCompCap == maxCompCap);
|
||||||
pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2));
|
pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2));
|
||||||
|
14
src/init.cc
14
src/init.cc
@ -1507,7 +1507,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
|||||||
int minCTAsEnv;
|
int minCTAsEnv;
|
||||||
int maxCTAsEnv;
|
int maxCTAsEnv;
|
||||||
int splitShareEnv;
|
int splitShareEnv;
|
||||||
int collnetEnableEnv;
|
const char* collnetEnableEnv;
|
||||||
int ctaPolicyEnv;
|
int ctaPolicyEnv;
|
||||||
int shrinkShareEnv;
|
int shrinkShareEnv;
|
||||||
int nvlsCTAsEnv;
|
int nvlsCTAsEnv;
|
||||||
@ -1561,9 +1561,15 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
|||||||
comm->config.shrinkShare = shrinkShareEnv;
|
comm->config.shrinkShare = shrinkShareEnv;
|
||||||
}
|
}
|
||||||
|
|
||||||
collnetEnableEnv = ncclParamCollnetEnable();
|
// NCCL_COLLNET_ENABLE needs to be reloaded each time for comm init
|
||||||
if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) {
|
// since users might change the env on the fly to enable/disable collnet
|
||||||
comm->config.collnetEnable = collnetEnableEnv;
|
collnetEnableEnv = ncclGetEnv("NCCL_COLLNET_ENABLE");
|
||||||
|
if (collnetEnableEnv != NULL) {
|
||||||
|
int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0);
|
||||||
|
if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) {
|
||||||
|
comm->config.collnetEnable = collnetEnableInt;
|
||||||
|
INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ctaPolicyEnv = ncclParamCtaPolicy();
|
ctaPolicyEnv = ncclParamCtaPolicy();
|
||||||
|
@ -52,6 +52,9 @@ ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
|
|||||||
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
|
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
|
||||||
cast = (void**)&funcptr; \
|
cast = (void**)&funcptr; \
|
||||||
*cast = dlvsym(handle, symbol, version); \
|
*cast = dlvsym(handle, symbol, version); \
|
||||||
|
if (*cast == NULL) { \
|
||||||
|
INFO(NCCL_NET, "dlvsym failed on %s - %s version %s", symbol, dlerror(), version); \
|
||||||
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported);
|
LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported);
|
||||||
|
@ -21,7 +21,6 @@ struct ncclStrongStreamCapture {
|
|||||||
cudaGraph_t graph;
|
cudaGraph_t graph;
|
||||||
unsigned long long graphId;
|
unsigned long long graphId;
|
||||||
cudaStream_t captureStream;
|
cudaStream_t captureStream;
|
||||||
cudaGraphNode_t lastRecord;
|
|
||||||
void* acquiredBy;
|
void* acquiredBy;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -216,7 +215,6 @@ ncclResult_t ncclStrongStreamAcquire(
|
|||||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
|
CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
|
||||||
}
|
}
|
||||||
cap->graphId = graph.graphId;
|
cap->graphId = graph.graphId;
|
||||||
cap->lastRecord = nullptr;
|
|
||||||
cap->acquiredBy = localThreadId();
|
cap->acquiredBy = localThreadId();
|
||||||
// Push to capturing list.
|
// Push to capturing list.
|
||||||
cap->next = ss->captureHead;
|
cap->next = ss->captureHead;
|
||||||
@ -296,16 +294,6 @@ ncclResult_t ncclStrongStreamRelease(
|
|||||||
cudaGraphNode_t recordNode;
|
cudaGraphNode_t recordNode;
|
||||||
CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent));
|
CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent));
|
||||||
|
|
||||||
// Make this record order after previous record on this stream.
|
|
||||||
if (cap->lastRecord != nullptr) {
|
|
||||||
#if CUDART_VERSION >= 13000
|
|
||||||
CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1));
|
|
||||||
#else
|
|
||||||
CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
cap->lastRecord = recordNode;
|
|
||||||
|
|
||||||
// Get current nodes from work stream so we can add them as dependencies.
|
// Get current nodes from work stream so we can add them as dependencies.
|
||||||
cudaStreamCaptureStatus status;
|
cudaStreamCaptureStatus status;
|
||||||
cudaGraphNode_t const* nodes;
|
cudaGraphNode_t const* nodes;
|
||||||
@ -338,6 +326,22 @@ ncclResult_t ncclStrongStreamRelease(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Make every future operation captured on cap->captureStream depend on 'recordNode'.
|
||||||
|
#if CUDART_VERSION >= 13000
|
||||||
|
CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(
|
||||||
|
cap->captureStream,
|
||||||
|
&recordNode, /* dependencies */
|
||||||
|
/*edges =*/ nullptr, /* no edge annotations */
|
||||||
|
1, /* count */
|
||||||
|
cudaStreamSetCaptureDependencies));
|
||||||
|
#else
|
||||||
|
CUDACHECK(cudaStreamUpdateCaptureDependencies(
|
||||||
|
cap->captureStream,
|
||||||
|
&recordNode,
|
||||||
|
1,
|
||||||
|
cudaStreamSetCaptureDependencies));
|
||||||
|
#endif
|
||||||
|
|
||||||
if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
|
if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
|
||||||
WARN("%s", launchRaceFatalMsg);
|
WARN("%s", launchRaceFatalMsg);
|
||||||
return ncclInvalidUsage;
|
return ncclInvalidUsage;
|
||||||
|
@ -61,8 +61,6 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
|
|||||||
char eNoEntNameList[PATH_MAX] = { 0 };
|
char eNoEntNameList[PATH_MAX] = { 0 };
|
||||||
|
|
||||||
if (libName && strlen(libName)) {
|
if (libName && strlen(libName)) {
|
||||||
// match names that start with 'lib' and end with '.so'
|
|
||||||
if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
|
|
||||||
snprintf(libName_, MAX_STR_LEN, "%s", libName);
|
snprintf(libName_, MAX_STR_LEN, "%s", libName);
|
||||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||||
if (libHandles[type]) {
|
if (libHandles[type]) {
|
||||||
@ -74,7 +72,9 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
|
|||||||
} else {
|
} else {
|
||||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
|
// libName can't be a relative or absolute path (start with '.' or contain any '/'). It can't be a library name either (start with 'lib' or end with '.so')
|
||||||
|
if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) {
|
||||||
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
|
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
|
||||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||||
if (libHandles[type]) {
|
if (libHandles[type]) {
|
||||||
|
@ -494,7 +494,9 @@ static int ibvSpeeds[] = {
|
|||||||
14000, /* FDR */
|
14000, /* FDR */
|
||||||
25000, /* EDR */
|
25000, /* EDR */
|
||||||
50000, /* HDR */
|
50000, /* HDR */
|
||||||
100000 /* NDR */ };
|
100000, /* NDR */
|
||||||
|
200000 /* XDR */
|
||||||
|
};
|
||||||
|
|
||||||
static int firstBitSet(int val, int max) {
|
static int firstBitSet(int val, int max) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@ -654,7 +656,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
|
|||||||
ibProvider = IB_PROVIDER_MLX5;
|
ibProvider = IB_PROVIDER_MLX5;
|
||||||
snprintf(dataDirectDevicePath, PATH_MAX, "/sys");
|
snprintf(dataDirectDevicePath, PATH_MAX, "/sys");
|
||||||
if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) {
|
if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) {
|
||||||
INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
|
INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
|
||||||
if(ncclParamIbDataDirect()) dataDirectSupported = 1;
|
if(ncclParamIbDataDirect()) dataDirectSupported = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user