nccl/ext-tuner/example/scripts/optimize_config.py
Kamil Iskra 3ea7eedf3b NCCL 2.27.5-1
Improvements for GB200 systems
* Optimize the network performance by alternating the direction of the
  rings and the NIC to GPU assignment across communicators to limit
  unnecessary sharing.
* Fix the detection of C2C links in case GPU Direct RDMA is disabled
  between a GPU and a NIC.
* Fix PXN support on MNNVL systems, where NCCL would try (and fail) to
  share regular host memory across multiple nodes.
* Fix P2C (PXN over C2C), which is now preferred over regular PXN.  This
  support is currently preliminary and is disabled by default; use
  NCCL_PXN_C2C=1 to enable.

Further reduce the overheads of CUDA graph capturing, which increased in
NCCL 2.26.2 for large graphs.

Optimize the network performance on DGX B200 systems by adjusting the
bandwidths provided to the graph search algorithm.

Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8.

Restore the plugin name handling logic to make it possible to specify a
path to the plugin (Issue #1732).

Restore the ability to change NCCL_COLLNET_ENABLE during execution
(Issue #1741).

Add an example tuner plugin with CSV-based overrides.

Remove an x86 dependency from the example profiler.
2025-06-18 10:34:47 -07:00


#!/usr/bin/env python3
"""
NCCL Tuner Configuration Optimizer
Reads a CSV file containing performance data across different tuning parameters
and generates optimal NCCL tuner configurations based on the best performing
combinations.
By default, creates growing size ranges that interpolate between the actual data sizes
for each unique dimension (node count, rank count combination). This ensures that
different cluster configurations get their own optimized size boundaries, as
performance characteristics often vary significantly between topologies.
Each dimension gets its own set of ranges starting from 0 and extending to the maximum
size for that dimension, with boundaries at midpoints between consecutive data sizes.
CSV Input Format:
collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us
Output Format (NCCL Tuner Config):
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
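
Example (hypothetical values): a single input row such as
  allreduce,1048576,ring,simple,4,2,16,1,0,85.20,120.50
would, under the default auto-ranging, yield an output entry such as
  allreduce,0,1048576,ring,simple,4,2,16,1,0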

Usage Examples:

# Auto-create dimension-specific interpolated ranges (default)
python3 optimize_config.py data.csv

# Use custom size ranges (applied to all topologies)
python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576"

# Use hardcoded default ranges (applied to all topologies)
python3 optimize_config.py data.csv --no-auto-ranges
"""
import csv
import argparse
import sys
import os
from collections import defaultdict
from typing import Dict, List, Tuple, Any


class PerformanceData:
    def __init__(self, row: Dict[str, str]):
        self.collective = row['collective']
        self.size_bytes = int(row['size_bytes'])
        self.algorithm = row['algorithm']
        self.protocol = row['protocol']
        self.channels = int(row['channels']) if row['channels'] != '-1' else -1
        self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1
        self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1
        self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1
        self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1
        # Performance metrics
        self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0))  # Higher is better
        self.latency_us = float(row.get('latency_us', 0))  # Lower is better

    def get_config_key(self) -> Tuple:
        """Generate a key for grouping similar configurations"""
        return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff)

    def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]:
        """Find which size range this data point belongs to for its dimension"""
        topology_key = (self.nodes, self.ranks)
        # Get size ranges for this dimension, or fall back to default
        if topology_key in topology_size_ranges:
            size_ranges = topology_size_ranges[topology_key]
        elif (-1, -1) in topology_size_ranges:
            size_ranges = topology_size_ranges[(-1, -1)]
        else:
            # Fallback to first available dimension ranges
            size_ranges = next(iter(topology_size_ranges.values()))
        for min_size, max_size in size_ranges:
            if min_size <= self.size_bytes <= max_size:
                return (min_size, max_size)
        # If no range found, create a single-point range
        return (self.size_bytes, self.size_bytes)


class ConfigOptimizer:
    def __init__(self, optimization_metric: str = 'latency_us'):
        self.optimization_metric = optimization_metric
        # Default size ranges - will be overridden by auto-detection
        self.size_ranges = [
            (0, 1024),
            (1025, 64*1024),
            (64*1024+1, 1024*1024),
            (1024*1024+1, 16*1024*1024),
            (16*1024*1024+1, 4*1024*1024*1024-1)
        ]
        self.auto_size_ranges = True

    def set_size_ranges(self, ranges: List[Tuple[int, int]]):
        """Set custom size ranges for optimization"""
        self.size_ranges = ranges
        self.auto_size_ranges = False

    def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
        """Create growing size ranges for each unique (nodes, ranks) dimension"""
        if not data:
            return {(-1, -1): self.size_ranges}
        # Group data by dimension (nodes, ranks)
        topology_data = defaultdict(list)
        for item in data:
            topology_key = (item.nodes, item.ranks)
            topology_data[topology_key].append(item)
        topology_ranges = {}
        for topology_key, items in topology_data.items():
            nodes, ranks = topology_key
            # Extract unique sizes for this dimension and sort them
            unique_sizes = sorted(set(item.size_bytes for item in items))
            if len(unique_sizes) <= 1:
                # Only one size, create a single range from 0 to that size
                size = unique_sizes[0] if unique_sizes else 0
                ranges = [(0, size)]
            else:
                # Create growing ranges that interpolate between data points
                ranges = []
                for i, size in enumerate(unique_sizes):
                    if i == 0:
                        # First range: 0 to midpoint between first and second size
                        if len(unique_sizes) > 1:
                            next_size = unique_sizes[i + 1]
                            max_size = (size + next_size) // 2
                        else:
                            max_size = size
                        min_size = 0
                    elif i == len(unique_sizes) - 1:
                        # Last range: previous max + 1 to current size (and beyond)
                        min_size = ranges[-1][1] + 1
                        max_size = size
                    else:
                        # Intermediate ranges: previous max + 1 to midpoint with next size
                        min_size = ranges[-1][1] + 1
                        next_size = unique_sizes[i + 1]
                        max_size = (size + next_size) // 2
                    ranges.append((min_size, max_size))
            topology_ranges[topology_key] = ranges
            print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:")
            for i, (min_size, max_size) in enumerate(ranges):
                # Count data points that fall in this range for this dimension
                count = sum(1 for item in items if min_size <= item.size_bytes <= max_size)
                actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size))
                if actual_sizes:
                    size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3])
                    if len(actual_sizes) > 3:
                        size_list += f", ... (+{len(actual_sizes)-3} more)"
                    print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})")
        return topology_ranges

    def load_data(self, csv_file: str) -> List[PerformanceData]:
        """Load performance data from CSV file"""
        data = []
        try:
            with open(csv_file, 'r') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    try:
                        data.append(PerformanceData(row))
                    except (ValueError, KeyError) as e:
                        print(f"Warning: Skipping invalid row: {row} - {e}")
        except FileNotFoundError:
            print(f"Error: File {csv_file} not found")
            sys.exit(1)
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")
            sys.exit(1)
        print(f"Loaded {len(data)} performance data points")
        # Auto-determine size ranges if enabled
        if self.auto_size_ranges and data:
            self.topology_size_ranges = self.auto_determine_size_ranges(data)
        else:
            # Use default ranges for all topologies
            self.topology_size_ranges = {(-1, -1): self.size_ranges}
        return data

    def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool:
        """Determine if new_data is better than current_best"""
        if self.optimization_metric == 'bandwidth_gbps':
            return new_data.bandwidth_gbps > current_best.bandwidth_gbps
        elif self.optimization_metric == 'latency_us':
            return new_data.latency_us < current_best.latency_us
        else:
            # Default to latency
            return new_data.latency_us < current_best.latency_us

    def optimize_configurations(self, data: List[PerformanceData]) -> List[str]:
        """Find optimal configurations and return as NCCL config strings"""
        # Group data by configuration key and size range
        grouped_data = defaultdict(lambda: defaultdict(list))
        for item in data:
            config_key = item.get_config_key()
            size_range = item.get_size_range_key(self.topology_size_ranges)
            grouped_data[config_key][size_range].append(item)
        # Store optimal configurations before combining ranges
        optimal_configs = []
        for config_key, size_ranges_dict in grouped_data.items():
            collective, nodes, ranks, pipeOps, regBuff = config_key
            for (min_size, max_size), items in size_ranges_dict.items():
                if not items:
                    continue
                # Find the best performing configuration for this size range
                best_item = items[0]
                for item in items[1:]:
                    if self.is_better(item, best_item):
                        best_item = item
                # Store the optimal configuration with its range
                optimal_configs.append({
                    'collective': collective,
                    'min_size': min_size,
                    'max_size': max_size,
                    'algorithm': best_item.algorithm,
                    'protocol': best_item.protocol,
                    'channels': best_item.channels,
                    'nodes': best_item.nodes,
                    'ranks': best_item.ranks,
                    'pipeOps': best_item.pipeOps,
                    'regBuff': best_item.regBuff,
                    'metric_value': getattr(best_item, self.optimization_metric)
                })
        # Combine sequential ranges with identical tunings
        combined_configs = self.combine_sequential_ranges(optimal_configs)
        # Generate config strings
        configs = []
        for config in combined_configs:
            config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}"
            configs.append(config_str)
            print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: "
                  f"{config['algorithm']}/{config['protocol']} channels={config['channels']} "
                  f"({self.optimization_metric}={config['metric_value']:.3f})")
        return configs

    def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
        """Combine sequential ranges that have identical tuning parameters"""
        if not configs:
            return configs
        # Group by collective and topology (nodes, ranks)
        topology_groups = defaultdict(list)
        for config in configs:
            topology_key = (config['collective'], config['nodes'], config['ranks'],
                            config['pipeOps'], config['regBuff'])
            topology_groups[topology_key].append(config)
        combined_configs = []
        for topology_key, topology_configs in topology_groups.items():
            # Sort by min_size to ensure proper ordering
            topology_configs.sort(key=lambda x: x['min_size'])
            # Group by tuning parameters (algorithm, protocol, channels)
            tuning_groups = defaultdict(list)
            for config in topology_configs:
                tuning_key = (config['algorithm'], config['protocol'], config['channels'])
                tuning_groups[tuning_key].append(config)
            # For each tuning group, combine sequential ranges
            for tuning_key, tuning_configs in tuning_groups.items():
                if not tuning_configs:
                    continue
                # Sort by min_size
                tuning_configs.sort(key=lambda x: x['min_size'])
                # Combine sequential ranges
                current_config = tuning_configs[0].copy()
                for next_config in tuning_configs[1:]:
                    # Check if ranges are adjacent or overlapping
                    if current_config['max_size'] + 1 >= next_config['min_size']:
                        # Extend the current range
                        current_config['max_size'] = max(current_config['max_size'], next_config['max_size'])
                        # Update metric value to the better one
                        if self.optimization_metric == 'bandwidth_gbps':
                            if next_config['metric_value'] > current_config['metric_value']:
                                current_config['metric_value'] = next_config['metric_value']
                        else:  # latency_us or default
                            if next_config['metric_value'] < current_config['metric_value']:
                                current_config['metric_value'] = next_config['metric_value']
                    else:
                        # Gap between ranges, save current and start new one
                        combined_configs.append(current_config)
                        current_config = next_config.copy()
                # Add the last configuration
                combined_configs.append(current_config)
        # Sort final configs by collective, nodes, ranks, then min_size
        combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size']))
        original_count = len(configs)
        combined_count = len(combined_configs)
        if combined_count < original_count:
            print(f"Combined {original_count} ranges into {combined_count} ranges "
                  f"(reduced by {original_count - combined_count})")
        return combined_configs

    def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True):
        """Append optimized configurations to NCCL tuner config file"""
        try:
            # Create directory if it doesn't exist
            config_dir = os.path.dirname(config_file)
            if config_dir and not os.path.exists(config_dir):
                os.makedirs(config_dir)
                print(f"Created directory: {config_dir}")
            # Check if file exists and has content
            file_exists = os.path.exists(config_file)
            add_separator = False
            if file_exists:
                with open(config_file, 'r') as f:
                    content = f.read().strip()
                    add_separator = len(content) > 0
                print(f"Appending to existing file: {config_file}")
            else:
                print(f"Creating new file: {config_file}")
            with open(config_file, 'a') as f:
                if add_separator:
                    f.write("\n\n")
                if add_header:
                    f.write(f"# Optimized configurations generated by optimize_config.py\n")
                    f.write(f"# Optimization metric: {self.optimization_metric}\n")
                    f.write(f"# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n")
                for config in configs:
                    f.write(f"{config}\n")
            if file_exists:
                print(f"Appended {len(configs)} optimized configurations to {config_file}")
            else:
                print(f"Created {config_file} with {len(configs)} optimized configurations")
        except PermissionError:
            print(f"Error: Permission denied writing to {config_file}")
            print("Try running with appropriate permissions or choose a different output location")
            sys.exit(1)
        except OSError as e:
            print(f"Error: Cannot create/write to {config_file}: {e}")
            print("Check that the path is valid and you have write permissions")
            sys.exit(1)
        except Exception as e:
            print(f"Unexpected error writing to {config_file}: {e}")
            sys.exit(1)


def main():
    parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data")
    parser.add_argument("csv_file", help="Input CSV file with performance data")
    parser.add_argument("-o", "--output", default="nccl_tuner.conf",
                        help="Output NCCL tuner config file (default: nccl_tuner.conf)")
    parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'],
                        default='latency_us', help="Optimization metric (default: latency_us)")
    parser.add_argument("--no-header", action="store_true",
                        help="Don't add header comments to output file")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print configurations without writing to file")
    parser.add_argument("--no-auto-ranges", action="store_true",
                        help="Disable automatic size range determination (use default ranges)")
    parser.add_argument("--size-ranges", type=str,
                        help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'")
    args = parser.parse_args()

    optimizer = ConfigOptimizer(args.metric)
    # Handle size range configuration
    if args.size_ranges:
        # Parse custom size ranges
        try:
            ranges = []
            for range_str in args.size_ranges.split(','):
                min_size, max_size = map(int, range_str.split('-'))
                ranges.append((min_size, max_size))
            optimizer.set_size_ranges(ranges)
            print(f"Using custom size ranges: {ranges}")
        except ValueError:
            print("Error: Invalid size ranges format. Use 'min1-max1,min2-max2,...'")
            sys.exit(1)
    elif args.no_auto_ranges:
        # Disable auto-ranging
        optimizer.auto_size_ranges = False
        print("Using default hardcoded size ranges")
    else:
        # Auto-ranging is enabled by default - creates one bucket per unique size
        optimizer.auto_size_ranges = True
        print("Auto-ranging enabled: will create one bucket per unique size in data")

    # Load and optimize data
    data = optimizer.load_data(args.csv_file)
    if not data:
        print("No valid data found in CSV file")
        sys.exit(1)
    configs = optimizer.optimize_configurations(data)
    if args.dry_run:
        print("\nGenerated configurations:")
        for config in configs:
            print(config)
    else:
        optimizer.append_to_config_file(configs, args.output, not args.no_header)


if __name__ == "__main__":
    main()