fix NCCL_DEBUG_FILE

Summary: NCCL_DEBUG_FILE has not worked properly since the recent v2.13.4 updates (https://github.com/NVIDIA/nccl/pull/682) because `ncclDebugLevel` is now set only after `NCCL_DEBUG_FILE` is parsed. This patch moves the parsing of `tempNcclDebugLevel` ahead of the `NCCL_DEBUG_FILE` processing, so that `NCCL_DEBUG_FILE` is parsed only when `NCCL_DEBUG > NCCL_LOG_VERSION` (the same as the previous behavior).

Differential Revision: D38415208

fbshipit-source-id: 5689bbb798e73efb9e8594557666987f07e89a30
This commit is contained in:
Ching-Hsiang Chu 2022-08-03 20:47:40 -07:00 committed by Sylvain Jeaugey
parent 19ab67d172
commit e1d9b273b0

View File

@ -26,6 +26,20 @@ void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock); pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG"); const char* nccl_debug = getenv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var /* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL * This can be a comma separated list such as INIT,COLL
@ -80,7 +94,7 @@ void ncclDebugInit() {
* NCCL_DEBUG level is > VERSION * NCCL_DEBUG level is > VERSION
*/ */
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE"); const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0; int c = 0;
char debugFn[PATH_MAX+1] = ""; char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn; char *dfn = debugFn;
@ -115,21 +129,6 @@ void ncclDebugInit() {
} }
} }
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
ncclEpoch = std::chrono::steady_clock::now(); ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock); pthread_mutex_unlock(&ncclDebugLock);