fix NCCL_DEBUG_FILE
Summary: NCCL_DEBUG_FILE does not work properly since the recent v2.13.4 updates (https://github.com/NVIDIA/nccl/pull/682) because `ncclDebugLevel` is now set only after `NCCL_DEBUG_FILE` has been parsed. This patch moves the parsing of `tempNcclDebugLevel` before the processing of `NCCL_DEBUG_FILE` to ensure that `NCCL_DEBUG_FILE` is parsed only when `NCCL_DEBUG > NCCL_LOG_VERSION` (same as the previous behavior). Differential Revision: D38415208 fbshipit-source-id: 5689bbb798e73efb9e8594557666987f07e89a30
This commit is contained in:
parent
19ab67d172
commit
e1d9b273b0
31
src/debug.cc
31
src/debug.cc
@ -26,6 +26,20 @@ void ncclDebugInit() {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
int tempNcclDebugLevel = -1;
|
||||
if (nccl_debug == NULL) {
|
||||
tempNcclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_VERSION;
|
||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_WARN;
|
||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_ABORT;
|
||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_TRACE;
|
||||
}
|
||||
|
||||
/* Parse the NCCL_DEBUG_SUBSYS env var
|
||||
* This can be a comma separated list such as INIT,COLL
|
||||
@ -80,7 +94,7 @@ void ncclDebugInit() {
|
||||
* NCCL_DEBUG level is > VERSION
|
||||
*/
|
||||
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
|
||||
if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
|
||||
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
|
||||
int c = 0;
|
||||
char debugFn[PATH_MAX+1] = "";
|
||||
char *dfn = debugFn;
|
||||
@ -115,21 +129,6 @@ void ncclDebugInit() {
|
||||
}
|
||||
}
|
||||
|
||||
int tempNcclDebugLevel = -1;
|
||||
if (nccl_debug == NULL) {
|
||||
tempNcclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_VERSION;
|
||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_WARN;
|
||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_ABORT;
|
||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||
tempNcclDebugLevel = NCCL_LOG_TRACE;
|
||||
}
|
||||
|
||||
ncclEpoch = std::chrono::steady_clock::now();
|
||||
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
|
Loading…
x
Reference in New Issue
Block a user