nccl/src/misc/nvmlwrap.cc

/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "nvmlwrap.h"
#include "checks.h"
#include "debug.h"

#include <initializer_list>
#include <memory>
#include <mutex>

// Global NVML tables. They are written by ncclNvmlEnsureInitialized() and
// read by the accessor functions later in this file.
// Number of devices reported by nvmlDeviceGetCount[_v2]().
int ncclNvmlDeviceCount = 0;
// Per-device data (NVML handle and CUDA compute capability), indexed by NVML
// device index. Only the first ncclNvmlDeviceCount entries are filled in.
ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
// Per-pair P2P read/write status, indexed as [deviceA][deviceB] by NVML
// device index. Only the ncclNvmlDeviceCount x ncclNvmlDeviceCount corner is
// filled in.
ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];

// NCCL_NVML_FN(name, rettype, arglist) defines a function pointer called
// pfn_<name> with signature `rettype arglist` for one NVML entry point.
//  - NCCL_NVML_DIRECT set: the pointer is constexpr and bound directly to the
//    symbol `name`, so that symbol must be available at build/link time.
//  - otherwise: the pointer starts out null and is filled in with dlsym() by
//    ncclNvmlEnsureInitialized().
#if NCCL_NVML_DIRECT
  #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name;
#else
  #include <dlfcn.h>
  #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr;
#endif

// File-local state: one pfn_<name> function pointer per NVML entry point used
// in this file, plus the bookkeeping for the one-time initialization.
namespace {
  // Each line below expands to the definition of pfn_<name> (see NCCL_NVML_FN).
  NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
  NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
  NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
  NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
  NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
  NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
  NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device))
  NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
  NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
  NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive))
  NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci))
  NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult))
  NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
  NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
  NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))

  // Held during initialization and around every direct NVML call made by the
  // wrappers in this file.
  std::mutex lock; // NVML has had some thread safety bugs
  // Set (under `lock`) as soon as the first initialization attempt starts; a
  // failed attempt is therefore never retried.
  bool initialized = false;
  // Per-thread flag that lets a thread which already went through
  // ncclNvmlEnsureInitialized() once skip taking `lock` on later calls.
  thread_local bool threadInitialized = false;
  // Outcome of the one-time initialization, returned by every call to
  // ncclNvmlEnsureInitialized().
  ncclResult_t initResult;
}

/**
 * Performs the one-time, process-wide NVML setup and returns its cached result.
 *
 * The first call resolves the NVML entry points (through dlopen()/dlsym()
 * unless NCCL_NVML_DIRECT is set), calls nvmlInit[_v2]() and fills the global
 * tables ncclNvmlDeviceCount, ncclNvmlDevices and ncclNvmlDevicePairs. Every
 * later call, from any thread, returns the result recorded by that first
 * attempt; a failed initialization is not retried.
 *
 * Returns:
 *   ncclSuccess        the tables are populated and can be read.
 *   ncclSystemError    libnvidia-ml.so.1 could not be opened, it lacks a
 *                      symbol needed for initialization, or an NVML call
 *                      failed.
 *   ncclInternalError  NVML reported more devices than ncclNvmlMaxDevices.
 */
ncclResult_t ncclNvmlEnsureInitialized() {
  // Optimization to avoid repeatedly grabbing the lock when we only want to
  // read from the global tables.
  if (threadInitialized) return initResult;
  threadInitialized = true;

  std::lock_guard<std::mutex> locked(lock);

  // Another thread already ran (or is the one that ran) the initialization.
  if (initialized) return initResult;
  initialized = true;

  #if !NCCL_NVML_DIRECT
  if (pfn_nvmlInit == nullptr) {
    void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
    if (libhandle == nullptr) {
      WARN("Failed to open libnvidia-ml.so.1");
      initResult = ncclSystemError;
      return initResult;
    }

    // Resolve every entry point; a symbol the library does not export simply
    // leaves its pointer null.
    struct Symbol { void **ppfn; char const *name; };
    std::initializer_list<Symbol> symbols = {
      {(void**)&pfn_nvmlInit, "nvmlInit"},
      {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
      {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
      {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
      {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
      {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
      {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
      {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
      {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
      {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
      {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
      {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
      {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
      {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
      {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}
    };
    for(Symbol sym: symbols) {
      *sym.ppfn = dlsym(libhandle, sym.name);
    }
  }
  #endif

  #if NCCL_NVML_DIRECT
    bool have_v2 = true;
  #else
    bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null
    // The calls below go through these pointers unconditionally; fail cleanly
    // instead of calling a null pointer when the library lacks one of them.
    if (pfn_nvmlErrorString == nullptr ||
        (!have_v2 && pfn_nvmlInit == nullptr) ||
        (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount) == nullptr) {
      WARN("libnvidia-ml.so.1 does not export all NVML symbols required for initialization");
      initResult = ncclSystemError;
      return initResult;
    }
  #endif
  nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
  if (res1 != NVML_SUCCESS) {
    WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
    initResult = ncclSystemError;
    return initResult;
  }

  unsigned int ndev;
  res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
  if (res1 != NVML_SUCCESS) {
    WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" :"", pfn_nvmlErrorString(res1));
    initResult = ncclSystemError;
    return initResult;
  }

  ncclNvmlDeviceCount = int(ndev);
  if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) {
    WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices);
    initResult = ncclInternalError;
    return initResult;
  }

  #if !NCCL_NVML_DIRECT
    // The per-device queries are only issued when at least one device exists,
    // so only then are their entry points required.
    if (ncclNvmlDeviceCount > 0 &&
        (pfn_nvmlDeviceGetHandleByIndex == nullptr ||
         pfn_nvmlDeviceGetCudaComputeCapability == nullptr ||
         pfn_nvmlDeviceGetP2PStatus == nullptr)) {
      WARN("libnvidia-ml.so.1 does not export all NVML symbols required for initialization");
      initResult = ncclSystemError;
      return initResult;
    }
  #endif

  // Cache the handle and compute capability of every device.
  for(int a=0; a < ncclNvmlDeviceCount; a++) {
    res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle);
    if (res1 != NVML_SUCCESS) {
      WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
      initResult = ncclSystemError;
      return initResult;
    }

    res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor);
    if (res1 != NVML_SUCCESS) {
      WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
      initResult = ncclSystemError;
      return initResult;
    }
  }

  // Cache the P2P read and write status of every ordered device pair
  // (including a device paired with itself).
  for(int a=0; a < ncclNvmlDeviceCount; a++) {
    for(int b=0; b < ncclNvmlDeviceCount; b++) {
      nvmlDevice_t da = ncclNvmlDevices[a].handle;
      nvmlDevice_t db = ncclNvmlDevices[b].handle;

      res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead);
      if (res1 != NVML_SUCCESS) {
        WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
        initResult = ncclSystemError;
        return initResult;
      }

      res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite);
      if (res1 != NVML_SUCCESS) {
        // Report the index that was actually queried (WRITE, not READ).
        WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1));
        initResult = ncclSystemError;
        return initResult;
      }
    }
  }

  initResult = ncclSuccess;
  return initResult;
}

// NVMLCHECK(name, args...): calls pfn_<name>(args...). On any result other
// than NVML_SUCCESS it emits a WARN containing the NVML error string and
// makes the *enclosing function* return ncclSystemError.
// The pointer is not checked for null before the call.
// NOTE(review): the unusual local name e44241808 presumably exists to avoid
// clashing with identifiers used in the macro arguments - confirm.
#define NVMLCHECK(name, ...) do { \
  nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
  if (e44241808 != NVML_SUCCESS) { \
    WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
    return ncclSystemError; \
  } \
} while(0)

// NVMLTRY(name, args...): calls pfn_<name>(args...) for entry points that may
// be unavailable, reporting problems quietly. It returns from the *enclosing
// function* in these cases:
//  - pfn_<name> is null (only possible when NCCL_NVML_DIRECT is not set):
//    returns ncclInternalError without logging anything.
//  - the call returns NVML_ERROR_NOT_SUPPORTED: returns ncclSystemError
//    without logging anything.
//  - the call returns any other failure: logs it at INFO(NCCL_INIT) level and
//    returns ncclSystemError.
#define NVMLTRY(name, ...) do { \
  if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \
    return ncclInternalError; /* missing symbol is not a warned error */ \
  nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
  if (e44241808 != NVML_SUCCESS) { \
    if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \
      INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
    return ncclSystemError; \
  } \
} while(0)

// Asks NVML for the device handle matching `pciBusId` and stores it in
// `*device`. The NVML call is made while holding the file-level mutex.
// Returns the initialization error if NVML setup failed, ncclSystemError
// (with a WARN) if the NVML call fails, ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  {
    std::lock_guard<std::mutex> guard(lock);
    NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
  }
  return ncclSuccess;
}

// Returns, in `*device`, the NVML handle cached for the device at NVML index
// `index`. No NVML call is made; the handle comes from the table filled in by
// ncclNvmlEnsureInitialized().
// Returns the initialization error if NVML setup failed, ncclInvalidArgument
// if `index` does not name one of the ncclNvmlDeviceCount known devices,
// ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  // Only the first ncclNvmlDeviceCount table entries hold valid handles, and
  // anything at or beyond ncclNvmlMaxDevices would be read out of bounds.
  if (index >= static_cast<unsigned int>(ncclNvmlDeviceCount)) return ncclInvalidArgument;
  *device = ncclNvmlDevices[index].handle;
  return ncclSuccess;
}

// Finds the NVML index of `device` by searching the cached handle table and
// stores it in `*index`. No NVML call is made.
// Returns the initialization error if NVML setup failed, ncclInvalidArgument
// if the handle is not in the table, ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  int slot = 0;
  while (slot < ncclNvmlDeviceCount) {
    if (ncclNvmlDevices[slot].handle == device) {
      // First matching entry wins.
      *index = slot;
      return ncclSuccess;
    }
    ++slot;
  }
  return ncclInvalidArgument;
}

// Queries NVML for the enable state of NVLink `link` on `device`, writing it
// to `*isActive`. The NVML call is made while holding the file-level mutex.
// Returns the initialization error if NVML setup failed, ncclInternalError if
// the entry point is unavailable, ncclSystemError if the NVML call fails,
// ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  {
    std::lock_guard<std::mutex> guard(lock);
    NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
  }
  return ncclSuccess;
}

// Queries NVML for the PCI info of the remote end of NVLink `link` on
// `device`, writing it to `*pci`. The NVML call is made while holding the
// file-level mutex.
// Returns the initialization error if NVML setup failed, ncclInternalError if
// the entry point is unavailable, ncclSystemError if the NVML call fails,
// ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  {
    std::lock_guard<std::mutex> guard(lock);
    NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
  }
  return ncclSuccess;
}

// Queries NVML for `capability` of NVLink `link` on `device`, writing the
// answer to `*capResult`. The NVML call is made while holding the file-level
// mutex.
// Returns the initialization error if NVML setup failed, ncclInternalError if
// the entry point is unavailable, ncclSystemError if the NVML call fails,
// ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetNvLinkCapability(
    nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability,
    unsigned int *capResult
  ) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  {
    std::lock_guard<std::mutex> guard(lock);
    NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
  }
  return ncclSuccess;
}

// Reports the CUDA compute capability of `device` from the table cached at
// initialization, writing it to `*major` / `*minor`. No NVML call is made.
// Returns the initialization error if NVML setup failed, ncclInvalidArgument
// if the handle is not in the table, ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  NCCLCHECK(ncclNvmlEnsureInitialized());

  for (int slot = 0; slot < ncclNvmlDeviceCount; ++slot) {
    ncclNvmlDeviceInfo const &info = ncclNvmlDevices[slot];
    if (device != info.handle) continue;
    // First matching entry wins.
    *major = info.computeCapabilityMajor;
    *minor = info.computeCapabilityMinor;
    return ncclSuccess;
  }
  return ncclInvalidArgument;
}

// Reports the P2P status between `device1` and `device2` for capability
// `p2pIndex`, writing it to `*p2pStatus`.
//  - READ and WRITE capabilities are answered from the table cached at
//    initialization; ncclInvalidArgument is returned if either handle is not
//    in the device table.
//  - Any other capability is forwarded to NVML while holding the file-level
//    mutex; ncclSystemError (with a WARN) is returned if that call fails.
// Returns the initialization error if NVML setup failed.
ncclResult_t ncclNvmlDeviceGetP2PStatus(
    nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,
    nvmlGpuP2PStatus_t* p2pStatus
  ) {
  NCCLCHECK(ncclNvmlEnsureInitialized());

  bool const wantRead = (p2pIndex == NVML_P2P_CAPS_INDEX_READ);
  bool const wantWrite = (p2pIndex == NVML_P2P_CAPS_INDEX_WRITE);

  if (!wantRead && !wantWrite) {
    // Not cached: ask NVML directly.
    std::lock_guard<std::mutex> guard(lock);
    NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
    return ncclSuccess;
  }

  // Translate both handles to table indices. The scan does not stop early, so
  // the last matching entry is the one used.
  int first = -1;
  int second = -1;
  for (int slot = 0; slot < ncclNvmlDeviceCount; ++slot) {
    if (device1 == ncclNvmlDevices[slot].handle) first = slot;
    if (device2 == ncclNvmlDevices[slot].handle) second = slot;
  }
  if (first == -1 || second == -1) return ncclInvalidArgument;

  ncclNvmlDevicePairInfo const &pair = ncclNvmlDevicePairs[first][second];
  if (wantRead) {
    *p2pStatus = pair.p2pStatusRead;
  } else {
    *p2pStatus = pair.p2pStatusWrite;
  }
  return ncclSuccess;
}

// Forwards a field-value query for `device` to NVML, passing `valuesCount`
// and the `values` array through unchanged. The NVML call is made while
// holding the file-level mutex.
// Returns the initialization error if NVML setup failed, ncclInternalError if
// the entry point is unavailable, ncclSystemError if the NVML call fails,
// ncclSuccess otherwise.
ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  {
    std::lock_guard<std::mutex> guard(lock);
    NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
  }
  return ncclSuccess;
}