From c7ba70ff90b357b40bf571ea3366d61e1249a0be Mon Sep 17 00:00:00 2001
From: Luke Yeager
Date: Tue, 7 Jan 2020 13:29:42 -0800
Subject: [PATCH 1/2] [build] Allow setting CXXFLAGS on the command line

---
 makefiles/common.mk | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/makefiles/common.mk b/makefiles/common.mk
index 37e81be..2e44826 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -42,9 +42,10 @@ else
 endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})
 
-CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
-CXXFLAGS += -I $(CUDA_INC)
+CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
+            -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \
+            -I $(CUDA_INC) \
+            $(CXXFLAGS)
 # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
 # 512 : 120, 640 : 96, 768 : 80, 1024 : 60
 # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.

From 7a18fe07847300fbe7fec8d5512b3b44d8bc1716 Mon Sep 17 00:00:00 2001
From: Luke Yeager
Date: Tue, 7 Jan 2020 13:29:57 -0800
Subject: [PATCH 2/2] [topology] remove NET links when trimming system

This fixes a memory leak.
---
 src/graph/paths.cc | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index ce1772c..eba1964 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -179,11 +179,18 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int
 // Remove/free paths for a given type
 static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
   for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+    // Remove links _to_ the given type
     for (int n=0; n<system->nodes[t].count; n++) {
       struct ncclTopoNode* node = system->nodes[t].nodes+n;
       free(node->paths[nodeType]);
       node->paths[nodeType] = NULL;
     }
+    // Remove links _from_ the given type
+    for (int n=0; n<system->nodes[nodeType].count; n++) {
+      struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
+      free(node->paths[t]);
+      node->paths[t] = NULL;
+    }
   }
 }
 
@@ -309,6 +316,22 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
     // Trim network
     ncclTopoRemovePathType(system, NET);
     system->nodes[NET].count = 0;
+    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+      for (int n=0; n<system->nodes[t].count; n++) {
+        struct ncclTopoNode* node = system->nodes[t].nodes+n;
+        for (int l=0; l<node->nlinks; l++) {
+          struct ncclTopoLink* link = &(node->links[l]);
+          if (link->remNode->type == NET) {
+            // Remove the link
+            for (int i=l; i<(node->nlinks-1); i++) {
+              memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
+            }
+            node->nlinks--;
+            l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
+          }
+        }
+      }
+    }
   }
   free(domains);
   free(ids);