Add support for external network.

Dynamically load the external network from libnccl-net.so. Add an init function to networks. Move PCI scoring to net.cu and only ask the transport to provide a path. Simplify CUDA PCI path detection. Add a dummy external network.
2018-11-13 10:37:20 -08:00 · 2018-11-13 10:37:20 -08:00 · 0d3a20f96d
commit 0d3a20f96d
parent d7a58cfa58
29 changed files with 437 additions and 361 deletions
--- a/ext-net/dummy/Makefile
+++ b/ext-net/dummy/Makefile
@ -0,0 +1,17 @@
 #
 # Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
 # Locations of the NCCL build tree and of the CUDA toolkit. Both use ?= so
 # that a value from the environment or the make command line takes precedence.
 NCCL_HOME?=../../build/
 CUDA_HOME?=/usr/local/cuda
 INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
 PLUGIN_SO:=libnccl-net.so
 # default and clean are actions, not files: keep them runnable even if a file
 # with one of those names exists in this directory.
 .PHONY: default clean
 default: $(PLUGIN_SO)
 # Build the plugin as a position-independent shared object whose soname is the
 # library file name.
 $(PLUGIN_SO): plugin.c
 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
 clean:
 	rm -f $(PLUGIN_SO)
--- a/ext-net/dummy/plugin.c
+++ b/ext-net/dummy/plugin.c
@ -0,0 +1,44 @@
 /*************************************************************************
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
 #include <stdio.h>
 #include <nccl.h>
 #include <nccl_net.h>
 #define __hidden __attribute__ ((visibility("hidden")))
 /* Initialization entry of the dummy plugin. The signature matches the `init`
  * member of ncclNet_t, which receives a logging callback of type
  * ncclDebugLogger_t. The dummy plugin does not log, so the callback is
  * ignored. Always returns ncclSuccess. */
 __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { (void)logFunction; return ncclSuccess; }
 /* Report the number of devices offered by this plugin through *ndev.
  * The dummy plugin offers none, so the count is always 0. */
 __hidden ncclResult_t pluginDevices(int* ndev) {
  *ndev = 0;
  return ncclSuccess;
 }
 /* The remaining entry points are not implemented by the dummy plugin: each
  * one ignores its arguments, writes nothing through its output pointers and
  * returns ncclInternalError. */
 __hidden ncclResult_t pluginPciPath(int dev, char** path) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int type, void** request) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, int type, void** request) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginTest(void* request, int* done, int* size) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginCloseSend(void* sendComm) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginCloseRecv(void* recvComm) {
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginCloseListen(void* listenComm) {
  return ncclInternalError;
 }
 /* Plugin descriptor, defined under the name NCCL_PLUGIN_SYMBOL expands to.
  * It is the only definition in this file that is not marked __hidden.
  * The initializer is positional: the first entry is the network name and the
  * following entries are the plugin functions, so they must stay in the same
  * order as the members of ncclNet_t. */
 ncclNet_t NCCL_PLUGIN_SYMBOL = {
  "Dummy",
  pluginInit,
  pluginDevices,
  pluginPciPath,
  pluginPtrSupport,
  pluginListen,
  pluginConnect,
  pluginAccept,
  pluginIsend,
  pluginIrecv,
  pluginFlush,
  pluginTest,
  pluginCloseSend,
  pluginCloseRecv,
  pluginCloseListen
 };
--- a/src/Makefile
+++ b/src/Makefile
@ -8,7 +8,7 @@ include ../makefiles/common.mk
 include ../makefiles/version.mk
 ##### src files
-INCEXPORTS  := nccl.h
+INCEXPORTS  := nccl.h nccl_net.h
 LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
 		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
 		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
@ -80,6 +80,11 @@ $(INCDIR)/%.h : %.h
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 # Copy include/nccl_<name>.h to $(INCDIR)/nccl_<name>.h, creating $(INCDIR)
 # first if it does not exist yet.
 $(INCDIR)/nccl_%.h : include/nccl_%.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 $(OBJDIR)/%.o : %.cu
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cu
@ -217,7 +217,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
  int rank = state->rank;
  int nranks = state->nranks;
-  TRACE(INIT, "rank %d nranks %d size %d", rank, nranks, size);
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
  /* Simple ring based AllGather
   * At each step i receive data from (rank-i-1) from left
@ -233,7 +233,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
  }
-  TRACE(INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return ncclSuccess;
 }
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cu
@ -12,7 +12,7 @@
 ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff)
      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cu
@ -12,7 +12,7 @@
 ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff)
      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cu
@ -12,7 +12,7 @@
 ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff)
      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cu
@ -12,7 +12,7 @@
 ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff)
      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cu
@ -12,7 +12,7 @@
 ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff)
      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
--- a/src/include/core.h
+++ b/src/include/core.h
@ -271,7 +271,7 @@ struct ncclComm {
  while (ret == -1) { \
    SYSCHECKVAL(call, name, ret); \
    if (ret == -1) { \
-      INFO(ALL,"Got %s, retrying", strerror(errno));   \
+      INFO(NCCL_ALL,"Got %s, retrying", strerror(errno));   \
    }\
  } \
 } while (0);
@ -313,7 +313,7 @@ struct ncclComm {
  ncclResult_t res = call; \
  if (res != ncclSuccess) { \
    /* Print the back trace*/ \
-    INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
    return res; \
  } \
 } while (0);
@ -322,7 +322,7 @@ struct ncclComm {
  res = call; \
  if (res != ncclSuccess) { \
    /* Print the back trace*/ \
-    INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
    goto label; \
  } \
 } while (0);
--- a/src/include/debug.h
+++ b/src/include/debug.h
@ -16,65 +16,24 @@
 #include <limits.h>
 #include <string.h>
 #include "nccl.h"
 #include "nccl_net.h"
 #define gettid() (pid_t) syscall(SYS_gettid)
-typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
+extern int ncclDebugLevel;
 typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
 extern DebugLevel ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen);
-#define WARN(...) do {                                           \
+extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
  if (ncclDebugLevel >= WARN) {                                  \
    char hostname[1024];                                         \
    getHostName(hostname, 1024);                                 \
    int cudaDev;                                                 \
    cudaGetDevice(&cudaDev);                                     \
    pthread_mutex_lock(&ncclDebugOutputLock);                    \
    fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
    fprintf(ncclDebugFile,__VA_ARGS__);                          \
    fprintf(ncclDebugFile,"\n");                                 \
    fflush(ncclDebugFile);                                       \
    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
    if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
  }                                                              \
 } while(0)
-#define INFO(FLAGS, ...) do {                                    \
+#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
-  if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) {     \
+#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
    char hostname[1024];                                         \
    getHostName(hostname, 1024);                                 \
    int cudaDev;                                                 \
    cudaGetDevice(&cudaDev);                                     \
    pthread_mutex_lock(&ncclDebugOutputLock);                    \
    fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
    fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
    fflush(ncclDebugFile);                                       \
    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
  }                                                              \
 } while(0)
 #ifdef ENABLE_TRACE
-#define TRACE(FLAGS, ...) do {                                   \
+#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
  if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) {    \
    char hostname[1024];                                         \
    getHostName(hostname, 1024);                                 \
    int cudaDev;                                                 \
    cudaGetDevice(&cudaDev);                                     \
    pthread_mutex_lock(&ncclDebugOutputLock);                    \
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
    fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
    fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
    fflush(ncclDebugFile);                                       \
    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
  }                                                              \
 } while(0)
 extern std::chrono::high_resolution_clock::time_point ncclEpoch;
 #else
 #define TRACE(...)
 #endif
@ -84,17 +43,17 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
 static inline void initDebug() {
  const char* nccl_debug = getenv("NCCL_DEBUG");
  if (nccl_debug == NULL) {
-    ncclDebugLevel = NONE;
+    ncclDebugLevel = NCCL_LOG_NONE;
  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
-    ncclDebugLevel = VERSION;
+    ncclDebugLevel = NCCL_LOG_VERSION;
  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
-    ncclDebugLevel = WARN;
+    ncclDebugLevel = NCCL_LOG_WARN;
  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
-    ncclDebugLevel = INFO;
+    ncclDebugLevel = NCCL_LOG_INFO;
  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
-    ncclDebugLevel = ABORT;
+    ncclDebugLevel = NCCL_LOG_ABORT;
  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
-    ncclDebugLevel = TRACE;
+    ncclDebugLevel = NCCL_LOG_TRACE;
  }
  /* Parse the NCCL_DEBUG_SUBSYS env var
@ -109,17 +68,17 @@ static inline void initDebug() {
      uint64_t mask = 0;
      if (subsys[0] == '^') { invert = 1; subsys++; }
      if (strcasecmp(subsys, "INIT") == 0) {
-        mask = INIT;
+        mask = NCCL_INIT;
      } else if (strcasecmp(subsys, "COLL") == 0) {
-        mask = COLL;
+        mask = NCCL_COLL;
      } else if (strcasecmp(subsys, "P2P") == 0) {
-        mask = P2P;
+        mask = NCCL_P2P;
      } else if (strcasecmp(subsys, "SHM") == 0) {
-        mask = SHM;
+        mask = NCCL_SHM;
      } else if (strcasecmp(subsys, "NET") == 0) {
-        mask = NET;
+        mask = NCCL_NET;
      } else if (strcasecmp(subsys, "ALL") == 0) {
-        mask = ALL;
+        mask = NCCL_ALL;
      }
      if (mask) {
        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
@ -133,7 +92,7 @@ static inline void initDebug() {
   * NCCL_DEBUG level is > VERSION
   */
  const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
-  if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
+  if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
    int c = 0;
    char debug_fn[PATH_MAX+1] = "";
    char *dfn = debug_fn;
@ -164,7 +123,7 @@ static inline void initDebug() {
    if (debug_fn[0] != '\0') {
      FILE *file = fopen(debug_fn, "w");
      if (file != NULL) {
-        INFO(ALL,"DEBUG file is '%s'", debug_fn);
+        INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
        ncclDebugFile = file;
      }
    }
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@ -9,25 +9,25 @@
 #include "nccl.h"
 #define NCCL_NET_MAJOR 1
 #define NCCL_NET_MINOR 0
 #define NCCL_NET_HANDLE_MAXSIZE 64
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
-#define NCCL_MAX_SCORE 0x7
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
 typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
-  // Return the number of network devices along with their scores relative to the
+  // Initialize the network.
-  // current CUDA device. The per device score should be a value from 1-7 with a
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // higher score representing a better choice for performance.
+  // Return the number of adapters.
-  // This call should allocate the 'scores' array using malloc(3), and it
+  ncclResult_t (*devices)(int* ndev);
-  // will then be freed automatically by NCCL.
+  // Return the device path in /sys. NCCL will call free on this path.
-  ncclResult_t (*devices)(int* ndev, int** scores);
+  ncclResult_t (*pciPath)(int dev, char** path);
  // Return whether this device supports host pointers and/or CUDA pointers
  // as data from the current GPU. Supported types should be composed with
  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
@ -53,12 +53,10 @@ typedef struct {
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_t;
+} ncclNet_v1_t;
-extern
+typedef ncclNet_v1_t ncclNet_t;
-#ifdef __cplusplus
+
-"C"
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
 #endif
 ncclNet_t* ncclNet;
 #endif // end include guard
--- a/src/include/net.h
+++ b/src/include/net.h
@ -10,6 +10,7 @@
 #include "nccl.h"
 #include "nccl_net.h"
 extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 /* Socket Interface Selection type */
@ -19,7 +20,8 @@ typedef enum { findSubnetIf   = -1,
 // Translation to external API
 // Name reported by the network currently pointed to by ncclNet.
 static const char* ncclNetName() {
  return ncclNet->name;
 }
-static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
+static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
 // Thin wrappers forwarding to the network currently pointed to by ncclNet.
 // Each one runs the call through NCCLCHECK and returns ncclSuccess once the
 // check has passed.
 static ncclResult_t ncclNetPciPath(int dev, char** path) {
  NCCLCHECK(ncclNet->pciPath(dev, path));
  return ncclSuccess;
 }
 static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
  NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes));
  return ncclSuccess;
 }
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) {
  NCCLCHECK(ncclNet->listen(dev, handle, listenComm));
  return ncclSuccess;
 }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) {
  NCCLCHECK(ncclNet->connect(dev, handle, sendComm));
  return ncclSuccess;
 }
@ -32,7 +34,6 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
 // Forward the close requests to the network currently pointed to by ncclNet;
 // each call goes through NCCLCHECK before ncclSuccess is returned.
 static ncclResult_t ncclNetCloseRecv(void* recvComm) {
  NCCLCHECK(ncclNet->closeRecv(recvComm));
  return ncclSuccess;
 }
 static ncclResult_t ncclNetCloseListen(void* listenComm) {
  NCCLCHECK(ncclNet->closeListen(listenComm));
  return ncclSuccess;
 }
 extern bool ncclIbSupport();
 extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
--- a/src/include/param.h
+++ b/src/include/param.h
@ -67,10 +67,10 @@ int64_t ncclParam##name() { \
      errno = 0; \
      int64_t v = strtoll(str, NULL, 0); \
      if (errno) { \
-        INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
+        INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
      } else { \
        value = v; \
-        INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value);  \
+        INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value);  \
      } \
    } \
  } \
--- a/src/include/socket.h
+++ b/src/include/socket.h
@ -76,7 +76,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
    if (family != AF_INET && family != AF_INET6)
      continue;
-    TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+    TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
    /* Allow the caller to force the socket family type */
    if (sock_family != -1 && family != sock_family)
@ -106,7 +106,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
      // Store the IP address
      int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
      memcpy(addrs+found, interface->ifa_addr, salen);
-      INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+      INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
      found++;
    }
  }
@ -183,7 +183,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
    // Store the interface name
    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
-    INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
    found++;
    if (found == maxIfs) break;
  }
@ -333,7 +333,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
 #ifdef ENABLE_TRACE
  char line[1024];
-  TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
+  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
 #endif
  /* Put the socket in listen mode */
@ -363,7 +363,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
 #ifdef ENABLE_TRACE
  char line[1024];
-  TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
+  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
 #endif
  SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
@ -381,7 +381,7 @@ static ncclResult_t socketReceive(int fd, void* ptr, int size) {
      return ncclSystemError;
    }
    if (recvsize == -1) {
-      INFO(NET,"Recv : got retcode %d, retrying", errno);
+      INFO(NCCL_NET,"Recv : got retcode %d, retrying", errno);
      continue;
    }
    data += recvsize;
@ -397,7 +397,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
    int sendsize;
    SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
    if (sendsize == -1) {
-      INFO(NET,"Send : got retcode %d, retrying", errno);
+      INFO(NCCL_NET,"Send : got retcode %d, retrying", errno);
      continue;
    }
    data += sendsize;
--- a/src/include/topo.h
+++ b/src/include/topo.h
@ -8,53 +8,26 @@
 #define NCCL_TOPO_H_
 #include "nccl.h"
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
 #define MAXPATHSIZE 1024
 static ncclResult_t getCudaPath(int cudaDev, char** path) {
  char busId[16];
  CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
  for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/device";
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
-  char* cudaRpath = realpath(busPath, NULL);
+  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, sizeof("0000:00:00.0")-1);
-  char pathname[MAXPATHSIZE];
+  *path = realpath(busPath, NULL);
  strncpy(pathname, cudaRpath, MAXPATHSIZE);
  strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
  strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
  free(cudaRpath);
  *path = realpath(pathname, NULL);
  if (*path == NULL) {
-    WARN("Could not find real path of %s", pathname);
+    WARN("Could not find real path of %s", busPath);
    return ncclSystemError;
  }
  return ncclSuccess;
 }
-static ncclResult_t getMlxPath(char* ibName, char** path) {
+enum ncclPathDist {
  char devicepath[MAXPATHSIZE];
  snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
  *path = realpath(devicepath, NULL);
  if (*path == NULL) {
    WARN("Could not find real path of %s", devicepath);
    return ncclSystemError;
  }
  return ncclSuccess;
 }
 static ncclResult_t getSockPath(char* ifName, char** path) {
  char devicepath[MAXPATHSIZE];
  snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
  *path = realpath(devicepath, NULL);
  if (*path == NULL) {
    INFO(NET|INIT, "Could not find real path of %s", devicepath);
    return ncclSystemError;
  }
  return ncclSuccess;
 }
 enum ncclIbPathDist {
  PATH_PIX = 0,
  PATH_PXB = 1,
  PATH_PHB = 2,
@ -74,7 +47,7 @@ static int pciDistance(char* path1, char* path2) {
      if (same == 1) score++;
    }
  }
-  if (score == 3) return PATH_SOC;
+  if (score <= 3) return PATH_SOC;
  if (score == 4) return PATH_PHB;
  if (score == depth-1) return PATH_PIX;
  return PATH_PXB;
--- a/src/init.cu
+++ b/src/init.cu
@ -28,9 +28,13 @@
 #include <string.h>
 #include <errno.h>
 #include <assert.h>
 #include <dlfcn.h>
-DebugLevel ncclDebugLevel;
+#define STR2(v) #v
-uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
+#define STR(v) STR2(v)
 int ncclDebugLevel;
 uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
 pthread_mutex_t ncclDebugOutputLock;
 FILE *ncclDebugFile = stdout;
@ -48,7 +52,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
 NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
 extern "C" __attribute__ ((visibility("default")))
 ncclNet_t* ncclNet = NULL;
 // We define this as weak to let tests redefine their own
@ -69,13 +72,53 @@ int ncclCudaFullCompCap() {
  return ccMajor*10+ccMinor;
 }
-void initNet() {
+ncclResult_t initNet(ncclNet_t* net) {
-  if (ncclNet != NULL) {
+  int ndev;
-    INFO(INIT,"Using external Network %s", ncclNetName());
+  NCCLCHECK(net->init(ncclDebugLog));
-  } else {
+  NCCLCHECK(net->devices(&ndev));
-    ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
+  if (ndev <= 0) {
-    INFO(INIT,"Using internal Network %s", ncclNetName());
+    INFO(NCCL_INIT, "Net/%s: call to devices() returned 0 devices.", net->name);
    return ncclSystemError;
  }
  return ncclSuccess;
 }
 // Try to load an external network from libnccl-net.so.
 //
 // On success, *net is set to the descriptor exported by the library under the
 // NCCL_PLUGIN_SYMBOL name and the library stays loaded. If the library cannot
 // be opened, the symbol is missing, or initNet() rejects the plugin, *net is
 // left untouched and the library (if it was opened) is closed again.
 //
 // Always returns ncclSuccess: an absent or unusable plugin is not an error.
 ncclResult_t initNetPlugin(ncclNet_t** net) {
  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
  if (netPluginLib == NULL) {
    INFO(NCCL_INIT, "Unable to load libnccl-net.so : %s", dlerror());
    return ncclSuccess;
  }
  // Clear any stale error state so the dlerror() call below reports on dlsym() only.
  dlerror();
  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
  if (extNet == NULL) {
    // dlerror() returns NULL when the symbol exists but its value is NULL.
    const char* reason = dlerror();
    INFO(NCCL_INIT, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol : %s", reason != NULL ? reason : "symbol value is NULL");
  } else if (initNet(extNet) == ncclSuccess) {
    *net = extNet;
    return ncclSuccess;
  } else {
    INFO(NCCL_INIT, "NetPlugin: external network failed to initialize, unloading libnccl-net.so");
  }
  // The plugin is not usable: do not keep the library mapped.
  dlclose(netPluginLib);
  return ncclSuccess;
 }
 // Choose the network stored in ncclNet.
 // The socket network is initialized first, then the external plugin is tried.
 // If no plugin was installed, InfiniBand is used when its initialization
 // succeeds and sockets are used otherwise.
 ncclResult_t initNet() {
  // Always initialize sockets as we use it for bootstrap
  NCCLCHECK(initNet(&ncclNetSocket));
  NCCLCHECK(initNetPlugin(&ncclNet));
  if (ncclNet == NULL) {
    // No external network was installed: pick one of the internal ones.
    ncclNet = (initNet(&ncclNetIb) == ncclSuccess) ? &ncclNetIb : &ncclNetSocket;
    INFO(NCCL_INIT,"Using internal Network %s", ncclNetName());
  } else {
    INFO(NCCL_INIT, "Using external Network %s", ncclNetName());
  }
  return ncclSuccess;
 }
 NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
@ -171,7 +214,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  struct ncclComm* comm;
  NCCLCHECK(ncclCalloc(&comm, 1));
-  INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
  comm->rank = rank;
  comm->nRanks = ndev;
  cudaGetDevice(&comm->cudaDev);
@ -204,16 +247,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 }
 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
 #define STR2(v) #v
 #define STR(v) STR2(v)
 #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
 static void showVersion() {
  static int shown = 0;
-  if (shown == 0 && ncclDebugLevel >= VERSION) {
+  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
    printf("%s\n", VERSION_STRING);
    fflush(stdout);
    if (ncclDebugFile != stdout)
-      INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
+      INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
    shown = 1;
  }
 }
@ -294,12 +335,12 @@ void dumpMatrix(int* connectMatrix, int nranks) {
  line[STRLENGTH] = '\0';
  memset(line, ' ', STRLENGTH);
  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
-  INFO(INIT,"%s", line);
+  INFO(NCCL_INIT,"%s", line);
  for (int i=0; i<nranks; i++) {
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", i);
    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
-    INFO(INIT,"%s", line);
+    INFO(NCCL_INIT,"%s", line);
  }
 }
@ -308,12 +349,12 @@ void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
  line[STRLENGTH] = '\0';
  memset(line, ' ', STRLENGTH);
  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
-  INFO(INIT,"%s", line);
+  INFO(NCCL_INIT,"%s", line);
  for (int i=0; i<nranks; i++) {
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", i);
    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
-    INFO(INIT,"%s", line);
+    INFO(NCCL_INIT,"%s", line);
  }
 }
@ -325,7 +366,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
  memset(line, ' ', STRLENGTH);
  strncpy(line, prefix, PREFIXLEN);
  for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
-  INFO(INIT,"%s", line);
+  INFO(NCCL_INIT,"%s", line);
 }
 static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
@ -477,7 +518,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
  for (int i=0; i<nranks; i++)
    comm->nThreads = std::max(allData[i], comm->nThreads);
-  if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
+  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
  // Determine the minimum CUDA Compute capability of all GPUs
  int myCompCap = ncclCudaCompCap();
@ -486,7 +527,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
  for (int i=0; i<nranks; i++)
    minCompCap = std::min(allData[i], minCompCap);
-  if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
+  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
  // Find min nrings across ranks
  allData[rank] = nrings;
@ -547,7 +588,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
      multiNode = 1;
    }
  }
-  TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
      rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
  if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
@ -596,7 +637,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-  INFO(INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
  return ncclSuccess;
 cleanup:
@ -615,7 +656,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
  NCCLCHECK(ncclInit());
  if (myrank == 0) showVersion();
-  INFO(INIT,"rank %d nranks %d", myrank, nranks);
+  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
  // Make sure the CUDA runtime is initialized.
  CUDACHECK(cudaFree(NULL));
@ -679,8 +720,8 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
  free(prev);
  free(next);
-  INFO(INIT,"Using %d threads", nthreads);
+  INFO(NCCL_INIT,"Using %d threads", nthreads);
-  INFO(INIT,"Min Comp Cap %d", minCompCap);
+  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
  int* rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
@ -733,7 +774,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  NCCLCHECK(wrapNvmlInit());
  showVersion();
-  INFO(INIT,"nranks %d", ndev);
+  INFO(NCCL_INIT,"nranks %d", ndev);
  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
  if (ndev < 1) {
@ -793,7 +834,7 @@ cleanup:
 final:
  if(wrapNvmlShutdown() != ncclSuccess)
-    INFO(INIT,"NCCL did not shutdown nvml properly");
+    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
  cudaSetDevice(savedDevice);
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
--- a/src/misc/enqueue.cu
+++ b/src/misc/enqueue.cu
@ -175,7 +175,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
  // We can't print the CG mode before the first barrier happened.
  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
    *comm->intraCGMode ^= 0x10;
-    INFO(INIT,"Launch mode %s%s%s",
+    INFO(NCCL_INIT,"Launch mode %s%s%s",
        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
        *comm->intraCGMode ? "/CGMD" : "",
        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
--- a/src/misc/group.cu
+++ b/src/misc/group.cu
@ -58,7 +58,7 @@ ncclResult_t ncclSetDevice(int cudaDev) {
 #define CHECK(a) do { \
  if ((args->ret = (a)) != ncclSuccess) { \
-    INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
    return args; \
  } \
 } while(0)
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cu
@ -61,7 +61,7 @@ ncclResult_t wrapNvmlSymbols(void) {
    cast = (void**)&funcptr;                             \
    tmp = dlsym(handle, symbol);                         \
    if (tmp == NULL) {                                   \
-      INFO(INIT,"dlsym failed on %s, ignoring", symbol); \
+      INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
    }                                                    \
    *cast = tmp;                                         \
  } while (0)
@ -208,7 +208,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
  }
  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
  if (ret != NVML_SUCCESS) {
-    INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+    INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
        nvmlInternalErrorString(ret));
    return ncclSystemError;
  }
@ -223,7 +223,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
  if (ret != NVML_SUCCESS) {
    if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
          nvmlInternalErrorString(ret));
    return ncclSystemError;
  }
@ -239,7 +239,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
  if (ret != NVML_SUCCESS) {
    if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
          nvmlInternalErrorString(ret));
    return ncclSystemError;
  }
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cu
@ -5,9 +5,10 @@
 ************************************************************************/
 #include "core.h"
 #include "net.h"
 #include "param.h"
 #define NCCL_MAX_SCORE 7
 /* Parse user defined rings. Format is like :
 * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
 * Rings with a non-matching number of ranks are ignored so we can provide
@ -188,11 +189,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
  if (str && strlen(str)>0) {
    int ret = parseRings(str, nrings, nranks, prev, next);
    if (ret == ncclSuccess && *nrings > 0) {
-      if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings);
+      if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
      NCCLCHECK(getEnvThreads(nthreads));
      return ncclSuccess;
    }
-    if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring");
+    if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
    *nrings = 0;
  }
@ -333,13 +334,13 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
    minNrings = MAXRINGS;
  }
  if (maxNrings > 0 && maxNrings <= *nrings) {
-    if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings);
+    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
    *nrings = maxNrings;
  } else {
    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
    if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
    if (minNrings > 0 && minNrings > *nrings) {
-      if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings);
+      if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
      for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
        for (int i=0; i<nranks; i++) {
          prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cu
@ -6,8 +6,10 @@
 #include "utils.h"
 #include "debug.h"
 #include "nccl_net.h"
 #include <unistd.h>
 #include <string.h>
 #include <stdarg.h>
 ncclResult_t getHostName(char* hostname, int maxlen) {
  if (gethostname(hostname, maxlen) != 0) {
@ -20,6 +22,53 @@ ncclResult_t getHostName(char* hostname, int maxlen) {
  return ncclSuccess;
 }
 /* Common logging function used by the INFO, WARN and TRACE macros
 * Also exported to the dynamically loadable Net transport modules so
 * they can share the debugging mechanisms and output files
 *
 * level    : severity of the message (NCCL_LOG_WARN, NCCL_LOG_INFO, NCCL_LOG_TRACE)
 * flags    : subsystem bits, matched against ncclDebugMask for INFO and TRACE
 * filefunc : file or function name of the call site (printed for WARN, TRACE and ABORT)
 * line     : line number of the call site
 * fmt, ... : printf-style format string and its arguments
 *
 * A message which does not pass the level/mask filter is silently dropped.
 */
 void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel <= NCCL_LOG_NONE) return;
  char hostname[1024];
  getHostName(hostname, 1024);
  // Logging is best effort : if the current device cannot be queried, print -1
  // rather than an indeterminate value.
  int cudaDev = -1;
  if (cudaGetDevice(&cudaDev) != cudaSuccess) cudaDev = -1;
  char buffer[1024];
  // Number of characters snprintf wanted to write for the prefix.
  // Stays negative when the message is filtered out (or if snprintf fails).
  int prefix = -1;
  pthread_mutex_lock(&ncclDebugOutputLock);
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    prefix = snprintf(buffer, sizeof(buffer),
                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
    prefix = snprintf(buffer, sizeof(buffer),
                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
 #ifdef ENABLE_TRACE
  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    prefix = snprintf(buffer, sizeof(buffer),
                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
  }
 #endif
  if (prefix > 0) {
    // snprintf returns the length it *would* have written : on truncation this is
    // >= sizeof(buffer). Clamp it so that buffer+len and sizeof(buffer)-len stay
    // inside the buffer instead of wrapping around.
    size_t len = (size_t)prefix;
    if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
    va_list vargs;
    va_start(vargs, fmt);
    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
    va_end(vargs);
    fprintf(ncclDebugFile,"%s\n", buffer);
    fflush(ncclDebugFile);
  }
  pthread_mutex_unlock(&ncclDebugOutputLock);
  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
            hostname, getpid(), gettid(), cudaDev, filefunc, line);
    abort();
  }
 }
 uint64_t getHash(const char* string) {
  // Based on DJB2, result = result * 33 + char
  uint64_t result = 5381;
@ -51,7 +100,7 @@ uint64_t getHostHash(void) {
  offset += len;
  // Trailing '\0'
  uname[offset]='\0';
-  TRACE(INIT,"unique hostname '%s'", uname);
+  TRACE(NCCL_INIT,"unique hostname '%s'", uname);
  return getHash(uname);
 }
@ -71,7 +120,7 @@ uint64_t getPidHash(void) {
  if (len < 0) len = 0;
  pname[plen+len]='\0';
-  TRACE(INIT,"unique PID '%s'", pname);
+  TRACE(NCCL_INIT,"unique PID '%s'", pname);
  return getHash(pname);
 }
--- a/src/ring.cu
+++ b/src/ring.cu
@ -26,7 +26,7 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
  NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
  ring->devMemRecv = recvMem;
-  TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
+  TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
  // Pre-configure send/recv pointers. Those are the default, they may change later.
  ring->recv.conn.buff = recvMem->buff;
--- a/src/transport.cu
+++ b/src/transport.cu
@ -113,8 +113,8 @@ ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRoun
  int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
  int nsteps = nstepsPerRound * nrounds * substeps;
-  TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
+  TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
-  TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
+  TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
  for (int r=0; r<nrings; r++) {
    struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
    struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
@ -159,7 +159,7 @@ ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclCo
  struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
  threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
  if (proxyfunc) {
-    TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
+    TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
    struct transportProxyInfo* info;
    NCCLCHECK(ncclCalloc(&info, 1));
    connector->proxyInfo = info;
--- a/src/transport/net.cu
+++ b/src/transport/net.cu
@ -19,11 +19,21 @@
 #define NET_BITS_PER_IF 3
 #define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
 static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
 /* Pack one score per network device into a single ncclTvalue_t.
 * distances : distance of each device, indexed by device
 * ndev      : number of entries of distances to encode
 * Device d occupies bits [NET_BITS_PER_IF*d, NET_BITS_PER_IF*(d+1)) of the result ;
 * its score is 1 + PATH_SOC - distances[d], so a smaller distance gives a larger score.
 */
 static ncclTvalue_t getTvalue(short* distances, int ndev) {
  ncclTvalue_t tvalue = 0;
  for (int d=0; d<ndev; d++) {
    int score = 1 + PATH_SOC - distances[d];
    // Keep 3 bits of score info per dev.
    // Widen to ncclTvalue_t *before* shifting : the shift amount can exceed the
    // width of an int, in which case shifting the int operand would overflow.
    tvalue |= ((ncclTvalue_t)(score & NET_BITS_PER_IF_MASK))<<(NET_BITS_PER_IF*d);
  }
  return tvalue;
 }
 struct netInfo {
  int rank;
  int ndev;
-  short scores[NET_MAX_IFS];
+  ncclTvalue_t tValue;
  short distances[NET_MAX_IFS];
 };
 struct netConnectInfo {
@ -38,7 +48,7 @@ struct netSendResources {
  struct ncclRecvMem* devHostRecvMem;
  struct ncclSendMem* hostDevMem;
  int netDev;
-  bool cudaSupport;
+  int useGdr;
  struct ncclRecvMem* devNetMem;
  uint64_t llStep;
  uint64_t llLastCleaning;
@ -53,7 +63,7 @@ struct netRecvResources {
  struct ncclRecvMem* devHostRecvMem;
  struct ncclRecvMem* hostDevMem;
  int netDev;
-  bool cudaSupport;
+  int useGdr;
  uint64_t llStep;
  uint64_t llLastCleaning;
 };
@ -64,26 +74,37 @@ ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
  struct netInfo* info = (struct netInfo*)opaqueInfo;
  static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
  info->rank = rank;
-  int *scores;
+  NCCLCHECK(ncclNetDevices(&info->ndev));
  NCCLCHECK(ncclNetDevices(&info->ndev, &scores));
  if (info->ndev == 0) {
    WARN("Error : Network returned 0 device");
    return ncclSystemError;
  }
  if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
-  for (int d=0; d<info->ndev; d++) info->scores[d] = scores[d];
+
-  free(scores);
+  // Find distance with current GPU
  int cudaDev;
  cudaGetDevice(&cudaDev);
  char* cudaPath;
  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
  char line[1024];
  sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
  for (int d=0; d<info->ndev; d++) {
    char* nicPath;
    ncclResult_t err = ncclNetPciPath(d, &nicPath);
    info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
    sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
    if (err == ncclSuccess) free(nicPath);
  }
  INFO(NCCL_INIT|NCCL_NET, "%s", line);
  free(cudaPath);
  return ncclSuccess;
 }
 /* Determine if we can communicate with the peer */
 ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
  ret[0] = 0;
  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  for (int d=0; d<myInfo->ndev; d++) {
+  ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
    // Keep 3 bits of score info per dev
    ret[0] |= ((myInfo->scores[d] & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
  }
  return ncclSuccess;
 }
@ -175,13 +196,13 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
  return ncclSuccess;
 }
-int getDev(int ringId, int nDev, short* scores) {
+int getDev(int ringId, int nDev, short* distances) {
-  int maxScore = 0;
+  int minDistance = PATH_SOC;
-  for (int d=0; d<nDev; d++) if (scores[d] > maxScore) maxScore = scores[d];
+  for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
  int skip = ringId+1;
  while (skip) {
    for (int d=0; d<nDev; d++) {
-      if (scores[d] == maxScore) {
+      if (distances[d] == minDistance) {
        skip--;
        if (skip == 0) return d;
      }
@ -191,6 +212,40 @@ int getDev(int ringId, int nDev, short* scores) {
 }
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 /* Decide whether GPU Direct RDMA should be used between the current CUDA
 * device and network device dev.
 * dev      : network device index (passed to ncclNetPtrSupport)
 * distance : distance between the GPU and that network device
 * read     : non-zero when the NIC reads from GPU memory (send side), 0 for the receive side
 * useGdr   : output, set to 1 when GDR should be used, 0 otherwise
 * Returns ncclSuccess unless a CUDA or network call fails.
 */
 static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
  *useGdr = 0;
  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
    else if (gdrReadParam < 0) { // default : enable only on DGX2
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
      int nvlinks = getNumNvlinks(busId);
      if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
    }
  }
  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = ncclParamNetGdrLevel();
  if (distance >= netGdrLevel) {
    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
    return ncclSuccess;
  }
  // Finally, check if the NIC supports it
  int flags;
  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
  // Parentheses are required : == binds tighter than &, so the unparenthesized
  // form tested flags & (NCCL_PTR_CUDA == 0), which is always 0.
  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
  *useGdr = 1;
  // We only get here when distance < netGdrLevel.
  INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
  return ncclSuccess;
 }
 /* Determine if we will use this transport for this peer and return connect
 * information for this peer */
@ -200,34 +255,11 @@ ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
  ring->send.transportResources = resources;
  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
+  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  resources->cudaSupport = false;
+  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
  // Get user's GDR READ setting
  int gdrReadParam = ncclParamNetGdrRead();
  // Determine whether the GPU has NVLink
  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
  int nvlinks = getNumNvlinks(busId);
  // Enable GDR read when:
  // 1) user sets it, or
  // 2) we are on a NVSwitch platform (i.e. no P2P traffic over PCI-E switch) AND the GPU is Volta
  bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0);
  if (enableGdrRead) {
    int flags;
    NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
    if (flags & NCCL_PTR_CUDA)
      resources->cudaSupport = true;
  }
  if (resources->cudaSupport)
    INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank);
  int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  if (resources->cudaSupport) {
+  if (resources->useGdr) {
    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
  }
@ -243,10 +275,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
  ring->recv.transportResources = resources;
  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
+  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  int flags;
+  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
  NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
  resources->cudaSupport = (flags & NCCL_PTR_CUDA) ? true : false;
  int sendSize = sizeof(struct ncclSendMem);
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@ -255,8 +285,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
  struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
-  INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
-      resources->cudaSupport ? "/GDRDMA" : "",
+      resources->useGdr ? "/GDRDMA" : "",
      (resources->hostDevMem != NULL) ? "/GDCopy" : "");
  struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
  NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@ -267,7 +297,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  // Setup device pointers
  struct netSendResources* resources = (struct netSendResources*)send->transportResources;
-  if (resources->cudaSupport) {
+  if (resources->useGdr) {
    send->conn.buff = resources->devNetMem->buff;
    // We don't use devMem for llMode because the CPU has to read the data
    send->conn.llBuff = resources->devHostRecvMem->llBuff;
@ -299,7 +329,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  recv->conn.head = &resources->devHostSendMem->head;
  recv->conn.llHead = &resources->devHostSendMem->llHead;
-  if (resources->cudaSupport == false) {
+  if (resources->useGdr == 0) {
    recv->conn.buff = resources->devHostRecvMem->buff;
    recv->conn.llBuff = resources->devHostRecvMem->llBuff;
  }
@ -320,7 +350,7 @@ ncclResult_t netSendFree(void* transportResources) {
  struct netSendResources* resources = (struct netSendResources*)transportResources;
  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
-  if (resources->cudaSupport)
+  if (resources->useGdr)
    CUDACHECK(cudaFree(resources->devNetMem));
  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
  free(resources);
@ -344,9 +374,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
  volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
  struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
  uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
-  struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem;
+  struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
  char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
-  int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
  volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
  int sliceSize = buffSize / args->substeps;
@ -362,8 +392,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
  if (!args->needProxy) goto nextColl;
-  TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
  // Update in case we skipped some collectives
  if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
@ -440,10 +470,10 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
  int llMode = args->llMode;
  volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
-  struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem;
+  struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
  char* localBuff = llMode ? localMem->llBuff : localMem->buff;
-  char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
+  char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
-  int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
  uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
@ -458,8 +488,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
  if (!args->needProxy) goto nextColl;
-  TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
  if (llMode == 0) {
    // Waiting for next opCount is only needed before writing nextTail.
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cu
@ -82,8 +82,12 @@ static void* ncclIbAsyncThreadMain(void* args) {
  return NULL;
 }
-static void initDevices() {
+NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
-  if(wrap_ibv_symbols() != ncclSuccess) { return; }
+
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
  if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
  if (ncclParamIbDisable()) return ncclInternalError;
  if (ncclNIbDevs == -1) {
    pthread_mutex_lock(&ncclIbLock);
    wrap_ibv_fork_init();
@ -91,9 +95,9 @@ static void initDevices() {
      ncclNIbDevs = 0;
      if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
        WARN("NET/IB : No IP interface found.");
-        return;
+        return ncclInternalError;
      }
-      INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
+      INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
      // Detect IB cards
      int nIbDevs;
@ -105,7 +109,7 @@ static void initDevices() {
      bool searchNot = userIbEnv && userIbEnv[0] == '^';
      int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
-      if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return;
+      if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
      for (int d=0; d<nIbDevs; d++) {
        struct ibv_context * context;
@ -134,7 +138,7 @@ static void initDevices() {
            if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
              continue;
            }
-            INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+            INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
                portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
            ncclIbDevs[ncclNIbDevs].device = d;
            ncclIbDevs[ncclNIbDevs].port = port;
@ -145,38 +149,29 @@ static void initDevices() {
            pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
          }
-          if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } }
+          if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
        }
      }
-      if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; };
+      if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
    }
    pthread_mutex_unlock(&ncclIbLock);
  }
  return ncclSuccess;
 }
-ncclResult_t ncclIbDevices(int* ndev, int** scores) {
+ncclResult_t ncclIbDevices(int* ndev) {
  initDevices();
  *ndev = ncclNIbDevs;
-  int cudaDev;
+  return ncclSuccess;
-  cudaGetDevice(&cudaDev);
+}
-  char* cudaPath;
+
-  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
+ncclResult_t ncclIbPciPath(int dev, char** path) {
-  int* sc;
+  char devicepath[PATH_MAX];
-  NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs));
+  snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
-  char line[1024];
+  *path = realpath(devicepath, NULL);
-  sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev);
+  if (*path == NULL) {
-  for (int d=0; d<ncclNIbDevs; d++) {
+    WARN("Could not find real path of %s", devicepath);
-    char* mlxPath;
+    return ncclSystemError;
    ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath);
    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
    sprintf(line+strlen(line), "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]);
    sc[d] = 1+PATH_SOC-distance;
    if (err2 == ncclSuccess) free(mlxPath);
  }
  INFO(INIT|NET,"%s", line);
  if (err1 == ncclSuccess) free(cudaPath);
  *scores = sc;
  return ncclSuccess;
 }
@ -207,45 +202,21 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
  return ret;
 }
 NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2);
 NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2);
 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
  initDevices();
  *supportedTypes = NCCL_PTR_HOST;
  int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess;
+  CUDACHECK(cudaGetDevice(&cudaDev));
-  int ibGdrLevel = PATH_PHB;
+  if (ncclIbGdrSupport(dev) != ncclSuccess) {
-  if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0;
+    INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
-  if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel();
+    return ncclSuccess;
  if (ibGdrLevel > 0) {
    int gdrSupport = ncclIbGdrSupport(dev);
    if (gdrSupport > 0) {
      INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU");
      ibGdrLevel = 0;
    }
  }
  if (ibGdrLevel <= 0) return ncclSuccess;
  char* cudaPath;
  if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess;
  char* mlxPath;
  if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; }
  int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
  free(mlxPath); free(cudaPath);
  if (distance < ibGdrLevel) {
    *supportedTypes |= NCCL_PTR_CUDA;
  } else {
    INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel);
  }
  *supportedTypes |= NCCL_PTR_CUDA;
  return ncclSuccess;
 }
 /* Copy the address of the IP interface selected for IB sideband communication
 * (ncclIbIfAddr) into *addr. Always returns ncclSuccess.
 */
 static ncclResult_t GetSocketAddr(union socketAddress* addr) {
  // ncclNIbDevs == -1 means the devices have not been initialized yet : do it now
  // NOTE(review): the surrounding change replaces initDevices() with ncclIbInit() — confirm this call still resolves
  if (ncclNIbDevs == -1) initDevices();
  memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
  return ncclSuccess;
 }
@ -442,7 +413,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
  *sendComm = comm;
  // IB Setup
  initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/
  ibv_context* ctx = ncclIbDevs[dev].context;
  NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
  uint8_t ib_port = ncclIbDevs[dev].port;
@ -464,13 +434,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
  // RoCE support
  qpInfo.lid = portAttr.lid;
  if (qpInfo.lid) { // IB
-    INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
  } else { // RoCE
    union ibv_gid gid;
    NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
    qpInfo.spn = gid.global.subnet_prefix;
    qpInfo.iid = gid.global.interface_id;
-    INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
  }
  NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@ -649,7 +619,7 @@ ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct
  NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
  *mrRet = verbs->mrPool+elem;
  verbs->mrPool[elem].refcnt++;
-  TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+  TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
  return ncclSuccess;
 }
@ -903,7 +873,9 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
 ncclNet_t ncclNetIb = {
  "IB",
  ncclIbInit,
  ncclIbDevices,
  ncclIbPciPath,
  ncclIbPtrSupport,
  ncclIbListen,
  ncclIbConnect,
@ -917,10 +889,3 @@ ncclNet_t ncclNetIb = {
  ncclIbCloseListen
 };
 NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
 // Report whether the IB transport can be used: false when the IB_DISABLE
 // parameter is non-zero, otherwise true only if initDevices() left a
 // positive count in ncclNIbDevs.
 bool ncclIbSupport() {
  if (ncclParamIbDisable() != 0) {
    return false;
  }
  initDevices();
  const bool haveDevices = (ncclNIbDevs > 0);
  return haveDevices;
 }
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@ -8,67 +8,58 @@
 #include "core.h"
 #include "socket.h"
 #include "net.h"
 #include "topo.h"
 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <poll.h>
 #include <limits.h>
 /* Init functions */
 static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
 static union socketAddress ncclNetIfAddrs[MAX_IFS];
 static int ncclNetIfs = -1;
 pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
 // Discover the socket network interfaces once and cache the result in
 // ncclNetIfNames / ncclNetIfAddrs / ncclNetIfs.
 //
 // ncclNetIfs is tested once without the lock and again while holding
 // ncclSocketLock, so findInterfaces() is only called while ncclNetIfs is -1.
 //
 // logFunction is not used by this function body.
 //
 // Returns ncclInternalError when the discovery performed by this call finds
 // no interface (findInterfaces() result <= 0), ncclSuccess otherwise.
 ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
  ncclResult_t ret = ncclSuccess;
  if (ncclNetIfs == -1) {
    pthread_mutex_lock(&ncclSocketLock);
    if (ncclNetIfs == -1) {
      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
      if (ncclNetIfs <= 0) {
        WARN("NET/Socket : no interface found");
        // Record the failure instead of returning here: returning directly
        // would leave ncclSocketLock held and block every later caller.
        ret = ncclInternalError;
      }
    }
    pthread_mutex_unlock(&ncclSocketLock);
  }
  return ret;
 }
 // Report the pointer types accepted by the socket transport.
 // *supportedTypes is set to NCCL_PTR_HOST only; dev is not consulted.
 // Always returns ncclSuccess.
 ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
  const int hostOnly = NCCL_PTR_HOST;
  *supportedTypes = hostOnly;
  return ncclSuccess;
 }
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+ncclResult_t ncclSocketDevices(int* ndev) {
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
+  *ndev = ncclNetIfs;
-static int ncclNetIfs = -1;
+  return ncclSuccess;
 pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
 // Populate ncclNetIfNames / ncclNetIfAddrs / ncclNetIfs via findInterfaces().
 // Does nothing when ncclNetIfs is already different from -1; otherwise the
 // value is re-tested while holding ncclSocketLock before discovery runs.
 // A result <= 0 is only reported with WARN; no error is returned.
 static void initDevices() {
  if (ncclNetIfs != -1) {
    return;
  }
  pthread_mutex_lock(&ncclSocketLock);
  if (ncclNetIfs == -1) {
    ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
    INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
    const bool noneFound = (ncclNetIfs <= 0);
    if (noneFound) {
      WARN("NET/Socket : no interface found");
    }
  }
  pthread_mutex_unlock(&ncclSocketLock);
 }
-ncclResult_t ncclSocketDevices(int* ndev, int** scores) {
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
-  initDevices();
+  char devicepath[PATH_MAX];
-  *ndev = ncclNetIfs;
+  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
-  int cudaDev;
+  *path = realpath(devicepath, NULL);
-  cudaGetDevice(&cudaDev);
+  if (*path == NULL) {
-  char* cudaPath;
+    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
-  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
+    return ncclSystemError;
  int* sc;
  NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
  char line[1024];
  sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
  for (int i=0; i<ncclNetIfs; i++) {
    char* sockPath;
    ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
    sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
    sc[i] = 1+PATH_SOC-distance;
    if (err2 == ncclSuccess) free(sockPath);
  }
  INFO(INIT|NET,"%s", line);
  if (err1 == ncclSuccess) free(cudaPath);
  *scores = sc;
  return ncclSuccess;
 }
 static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
  if (ncclNetIfs == -1) initDevices();
  if (dev >= ncclNetIfs) return ncclInternalError;
  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
  return ncclSuccess;
@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
 ncclNet_t ncclNetSocket = {
  "Socket",
  ncclSocketInit,
  ncclSocketDevices,
  ncclSocketPciPath,
  ncclSocketPtrSupport,
  ncclSocketListen,
  ncclSocketConnect,
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cu
@ -85,7 +85,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
  // See if CUDA can do P2P
  int p2p;
  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
-    INFO(INIT|P2P,"peer query failed between dev %d and dev %d",
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
        myInfo->cudaDev, peerInfo->cudaDev);
    return ncclSuccess;
  }
@ -454,7 +454,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
    info.direct = 1;
    info.directPtr = ring->devMemSend;
    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
    } else {
      // Enable P2P access
      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
@ -465,7 +465,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
            peerInfo->cudaDev, err, cudaGetErrorString(err));
        return ncclInternalError;
      }
-      INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
          ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
    }
  } else {
@ -477,7 +477,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
      return ncclInternalError;
    }
-    INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
    //TRACE_DUMP_IPC(&info.devIpc);
  }
@ -495,7 +495,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
    info.direct = 1;
    info.directPtr = ring->devMemRecv;
    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
    } else {
      // Enable P2P access
      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
@ -506,7 +506,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
            peerInfo->cudaDev, err, cudaGetErrorString(err));
        return ncclInternalError;
      }
-      TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
    }
  } else {
    info.direct = 0;
@ -517,7 +517,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
      return ncclInternalError;
    }
-    TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
    //TRACE_DUMP_IPC(&info.devIpc);
  }
  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cu
@ -168,10 +168,10 @@ ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
  char shmName[MAX_SHM_NAME_LEN];
  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
  info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
-  INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
  static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
  memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
@ -189,7 +189,7 @@ ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
  char shmName[MAX_SHM_NAME_LEN];
  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
@ -207,7 +207,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  char shmName[MAX_SHM_NAME_LEN];
  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
  resources->remShmSize = info->shmSize;
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
  // Remove the file to ensure proper clean-up
  NCCLCHECK(shmUnlink(shmName));
@ -231,7 +231,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  char shmName[MAX_SHM_NAME_LEN];
  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
  resources->remShmSize = info->shmSize;
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
  NCCLCHECK(shmUnlink(shmName));
  recv->conn.head = &resources->devRemHostMem->head;