From e1d9b273b00fa419e88295e63668f29e84c16a3d Mon Sep 17 00:00:00 2001 From: Ching-Hsiang Chu Date: Wed, 3 Aug 2022 20:47:40 -0700 Subject: [PATCH] fix NCCL_DEBUG_FILE Summary: NCCL_DEBUG_FILE does not work properly since the recent v2.13.4 update (https://github.com/NVIDIA/nccl/pull/682) because that update now sets `ncclDebugLevel` only after parsing `NCCL_DEBUG_FILE`. This patch moves the parsing of `NCCL_DEBUG` into `tempNcclDebugLevel` before the processing of `NCCL_DEBUG_FILE`, and checks `tempNcclDebugLevel` there, to ensure `NCCL_DEBUG_FILE` is parsed only when `NCCL_DEBUG > NCCL_LOG_VERSION` (same as the previous behavior). Differential Revision: D38415208 fbshipit-source-id: 5689bbb798e73efb9e8594557666987f07e89a30 --- src/debug.cc | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/debug.cc b/src/debug.cc index 1c184d0..e2d6f47 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -26,6 +26,20 @@ void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = getenv("NCCL_DEBUG"); + int tempNcclDebugLevel = -1; + if (nccl_debug == NULL) { + tempNcclDebugLevel = NCCL_LOG_NONE; + } else if (strcasecmp(nccl_debug, "VERSION") == 0) { + tempNcclDebugLevel = NCCL_LOG_VERSION; + } else if (strcasecmp(nccl_debug, "WARN") == 0) { + tempNcclDebugLevel = NCCL_LOG_WARN; + } else if (strcasecmp(nccl_debug, "INFO") == 0) { + tempNcclDebugLevel = NCCL_LOG_INFO; + } else if (strcasecmp(nccl_debug, "ABORT") == 0) { + tempNcclDebugLevel = NCCL_LOG_ABORT; + } else if (strcasecmp(nccl_debug, "TRACE") == 0) { + tempNcclDebugLevel = NCCL_LOG_TRACE; + } /* Parse the NCCL_DEBUG_SUBSYS env var * This can be a comma separated list such as INIT,COLL @@ -80,7 +94,7 @@ void ncclDebugInit() { * NCCL_DEBUG level is > VERSION */ const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE"); - if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { + if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 
0; char debugFn[PATH_MAX+1] = ""; char *dfn = debugFn; @@ -115,21 +129,6 @@ void ncclDebugInit() { } } - int tempNcclDebugLevel = -1; - if (nccl_debug == NULL) { - tempNcclDebugLevel = NCCL_LOG_NONE; - } else if (strcasecmp(nccl_debug, "VERSION") == 0) { - tempNcclDebugLevel = NCCL_LOG_VERSION; - } else if (strcasecmp(nccl_debug, "WARN") == 0) { - tempNcclDebugLevel = NCCL_LOG_WARN; - } else if (strcasecmp(nccl_debug, "INFO") == 0) { - tempNcclDebugLevel = NCCL_LOG_INFO; - } else if (strcasecmp(nccl_debug, "ABORT") == 0) { - tempNcclDebugLevel = NCCL_LOG_ABORT; - } else if (strcasecmp(nccl_debug, "TRACE") == 0) { - tempNcclDebugLevel = NCCL_LOG_TRACE; - } - ncclEpoch = std::chrono::steady_clock::now(); __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); pthread_mutex_unlock(&ncclDebugLock);