From 79fb0326ac3889663d0f1bd38bb7a83a98ff0dfb Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 13 Sep 2022 16:05:21 -0400 Subject: [PATCH 1/3] Fix intermittent 11.6 builds: generate unique .cu file for each object file --- src/collectives/device/Makefile | 2 +- src/collectives/device/gen_rules.sh | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile index 04bce8e..a2498a0 100644 --- a/src/collectives/device/Makefile +++ b/src/collectives/device/Makefile @@ -29,7 +29,7 @@ all: $(STATICLIB) all_deps: $(DEPENDFILES) # Auto-generating the rules per op/reduction/datatype/algorithm -$(RULESFILE) : +$(RULESFILE) : gen_rules.sh @printf "Generating %-35s > %s\n" rules $@ @mkdir -p $(OBJDIR) @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh index aaf3685..a43f005 100755 --- a/src/collectives/device/gen_rules.sh +++ b/src/collectives/device/gen_rules.sh @@ -13,6 +13,9 @@ then datatypes+=" bf16" fi +echo "CURDIR := \$(dir \$(realpath \$(word \$(words \$(math\$(MAKEFILE_LIST))-1), \$(MAKEFILE_LIST))))" ++echo "" + targets="GENOBJS := \\\\\n" for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do @@ -21,10 +24,17 @@ for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do dtn=0 # Order must match that of the ncclDataType_t enum for dt in ${datatypes}; do - echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" + # Generate a unique filename for each compilation unit, + # otherwise the __nv_module_id may conflict at link time + echo "${dir}/${base}_${opn}_${dtn}.cu :" + echo " echo \"#include \\\"\$(CURDIR)${base}.cu\\\"\" > \$@" + echo "" + # Compile the file + echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${opn}_${dtn}.cu ${base}.cu ${dir}/${base}.dep" + echo " @printf \"Compiling %-35s > %s\\\\n\" 
${base}.cu ${dir}/${base}_${op}_${dt}.o" echo " mkdir -p ${dir}" - echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" + echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@" echo "" targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" dtn=$(($dtn + 1)) From f89fd4777d2ef9229c039ff750ae21da01626f52 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Wed, 14 Sep 2022 11:16:17 -0400 Subject: [PATCH 2/3] address review comments --- src/collectives/device/gen_rules.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh index a43f005..8c7387c 100755 --- a/src/collectives/device/gen_rules.sh +++ b/src/collectives/device/gen_rules.sh @@ -13,9 +13,6 @@ then datatypes+=" bf16" fi -echo "CURDIR := \$(dir \$(realpath \$(word \$(words \$(math\$(MAKEFILE_LIST))-1), \$(MAKEFILE_LIST))))" -+echo "" - targets="GENOBJS := \\\\\n" for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do @@ -26,11 +23,12 @@ for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do for dt in ${datatypes}; do # Generate a unique filename for each compilation unit, # otherwise the __nv_module_id may conflict at link time - echo "${dir}/${base}_${opn}_${dtn}.cu :" - echo " echo \"#include \\\"\$(CURDIR)${base}.cu\\\"\" > \$@" + echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu" + echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@" + echo " cp \$< \$@" echo "" # Compile the file - echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${opn}_${dtn}.cu ${base}.cu ${dir}/${base}.dep" + echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep" echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" echo " mkdir -p ${dir}" From ecab28a7c959af73cd2030b8c2ed281213dc906e Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: 
Thu, 22 Sep 2022 01:04:50 -0700 Subject: [PATCH 3/3] Fix potential deadlock during init in multi-thread mode. Make sure all functions calling cudaMalloc (including devCommSetup) are called before the last bootstrapBarrier. That way, we avoid calls to cudaMalloc being blocked by a NCCL kernel launched on another GPU by another thread which completed init faster. Resolve #623. --- src/init.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/init.cc b/src/init.cc index 25c8d5d..42c1090 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1037,6 +1037,8 @@ collnet_cleanup: } } + NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore); + /* Local intra-node barrier */ NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); @@ -1087,7 +1089,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { } NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); - NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); // update communicator state comm->initState = ncclSuccess;