Add tree algorithms for allreduce to improve performance at scale. Add ncclCommAbort() and ncclCommGetAsyncError() to properly handle network errors and permit recovery. Detect initial CPU affinity and no longer escape it.
36 lines
840 B
Bash
36 lines
840 B
Bash
#!/bin/bash
|
|
#
|
|
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# See LICENSE.txt for license information
|
|
#
|
|
|
|
# To run from $BUILDDIR/
#
# Creates an xz-compressed source tarball of the source tree, named
#   nccl-src_<major>.<minor>.<patch><suffix>-<build>.txz
# and written to the directory that contains the source tree. Before
# archiving, the tree is cleaned interactively (git clean -x -i) and the user
# is asked to confirm if `git status -s` still reports changes.

# Step up from the build directory to the source tree root. Abort if that
# fails: git clean and tar below must not run against the wrong directory.
cd .. || exit 1
NCCLDIR=$(basename "$PWD")

echo "Checking for unclean directory ..."
# -x also offers ignored files for removal; -i makes git ask before deleting.
git clean -x -i
echo "Clean done."
echo "Checking for uncommited files ..."
# Any output from the short status means something is modified or untracked.
# Testing for non-empty output avoids depending on how `wc -l` pads its count.
if [[ -n "$(git status -s)" ]]; then
  git status -s
  echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
  # Wait for Enter; the input itself is ignored.
  read -r
fi

# Step up once more so the tree is archived under its directory name and the
# tarball lands next to the tree rather than inside it.
cd .. || exit 1

# Version components of the archive name.
# NOTE(review): the nccl:* / pkg:* tokens below look like placeholders that are
# substituted when this script is generated from a template - confirm.
NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
NCCL_BUILD=${pkg:Revision}

NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"

# Archive the tree without build output, git metadata, or the srctxz packaging
# directory. --transform renames the leading directory component to $NCCLNAME
# ($NCCLDIR is interpreted as a regular expression there); --owner/--group
# record every entry as uid/gid 0.
tar --exclude build \
    --exclude ".git*" \
    --exclude pkg/srctxz \
    --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$NCCLDIR"