/*************************************************************************
 * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "core.h"
#include "libwrap.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <string.h>
#include <errno.h>

DebugLevel ncclDebugLevel;

extern "C" DSOGLOBAL
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
  pid_t pid = getpid();
  static int count = 0;
  int commId = __sync_fetch_and_add(&count, 1);
  int len = snprintf(out->internal, NCCL_UNIQUE_ID_BYTES, "nccl-%d-%d", pid, commId);
  if (strlen(out->internal) < len) {
    WARN("ncclUniqueId truncated");
    return ncclInternalError;
  }
  return ncclSuccess;
}
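/* Illustrative only, not part of the library: the intended multi-process use
 * of the id generated above, assuming some out-of-band transport (here a
 * hypothetical bcastToAllRanks()) distributes it, e.g. over MPI or sockets.
 *
 *   ncclUniqueId id;
 *   ncclComm_t comm;
 *   if (rank == 0) ncclGetUniqueId(&id);        // one rank generates the id
 *   bcastToAllRanks(&id, sizeof(id));           // hypothetical broadcast
 *   cudaSetDevice(localGpu);                    // one GPU per rank
 *   ncclCommInitRank(&comm, nranks, id, rank);  // collective over all ranks
 *   // ... issue collectives on comm ...
 *   ncclCommDestroy(comm);
 */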
static ncclResult_t shmOpen(const char* shmname, size_t bytes, void** ptr) {
  int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
  if (fd == -1) {
    WARN("shm_open failed to open %s", shmname);
    return ncclSystemError;
  }

  if (ftruncate(fd, bytes) == -1) {
    WARN("ftruncate failed to allocate %ld bytes", bytes);
    shm_unlink(shmname);
    close(fd);
    return ncclSystemError;
  }

  *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (*ptr == MAP_FAILED) {
    WARN("failure in mmap");
    shm_unlink(shmname);
    close(fd);
    return ncclSystemError;
  }

  close(fd);
  return ncclSuccess;
}

static ncclResult_t shmUnlink(const char* shmname) {
  if (shm_unlink(shmname) == -1) {
    WARN("shm_unlink failed");
    return ncclSystemError;
  } else {
    return ncclSuccess;
  }
}

static ncclResult_t shmUnmap(void* ptr, size_t bytes) {
  if (munmap(ptr, bytes) == -1) {
    WARN("munmap failed");
    return ncclSystemError;
  } else {
    return ncclSuccess;
  }
}

typedef struct {
  int rank;
  int ndev;
  int cudaDev;
  int ncclId;
  pid_t pid;
  ncclMem* hostptr;
  ncclMem* devptr;
  CUipcMemHandle devipc;
  size_t buffSize;
} RankEntry;

static int compRanks(const void* a, const void* b) {
  const RankEntry* A = (const RankEntry*)a;
  const RankEntry* B = (const RankEntry*)b;
  if (A->ncclId < B->ncclId) return -1;
  if (A->ncclId > B->ncclId) return  1;
  return 0;
}

static void orderRanks(RankEntry* ranks, int count) {
  qsort(ranks, count, sizeof(RankEntry), compRanks);
  for(int i=0; i<count; ++i)
    ranks[i].ncclId = i;
}

typedef struct {
  union {
    struct {
      volatile int bar;   // shared barrier counter
      int ringDirectFail; // set if any rank cannot write directly to recv ptrs
    };
    char pad[16];
  };
  RankEntry ranks[1];     // one entry per rank, allocated past the struct
} RankGather;

static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
    int ndev, int rank, RankEntry myInfo) {
  size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
  RankGather* tmp = NULL;
  ncclResult_t res = shmOpen(commId.internal, bytes, (void**)&tmp);
  if (res != ncclSuccess) {
    WARN("rank %d failed to open shm segment for gather", rank);
    return res;
  }

  tmp->ranks[rank] = myInfo;

  int bar_tmp = tmp->bar - 1;
  bool swapped;
  do {
    bar_tmp += 1;
    if (bar_tmp == ndev-1) { // everyone is done
      ncclResult_t res = shmUnlink(commId.internal);
      if (res != ncclSuccess) {
        WARN("rank %d failed to unlink shm segment for gather", rank);
        shmUnmap(tmp, bytes);
        return res;
      }

      orderRanks(tmp->ranks, ndev);
    }
    swapped = __sync_bool_compare_and_swap(&tmp->bar, bar_tmp, bar_tmp+1);
  } while(!swapped);

  while (tmp->bar < ndev)
    sched_yield();
  __sync_synchronize();

  *gather = tmp;
  return ncclSuccess;
}

static void syncRingDirect(RankGather* gather, int* ringDirectOk) {
  int bar_tmp = gather->bar - 1;
  int ndev = gather->ranks[0].ndev;
  bool swapped;
  do {
    bar_tmp += 1;
    swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
  } while(!swapped);

  while (gather->bar < 2*ndev) // Wait for all ranks to arrive at this second barrier
    sched_yield();
  __sync_synchronize();

  *ringDirectOk = gather->ringDirectFail ? 0 : 1;
}

static ncclResult_t closeGather(RankGather* gather, int ndev) {
  int bar_tmp = gather->bar - 1;
  bool swapped;
  do {
    bar_tmp += 1;
    swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
  } while(!swapped);

  while (gather->bar < 3*ndev) // Wait for all ranks to arrive at this third barrier
    sched_yield();
  __sync_synchronize();

  size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
  ncclResult_t res = shmUnmap(gather, bytes);
  if (res != ncclSuccess) {
    WARN("failed to unmap %ld bytes of gather", bytes);
    return res;
  }

  return ncclSuccess;
}
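/* Illustrative only: initGather/syncRingDirect/closeGather above reuse one
 * shared counter as three successive barriers (thresholds ndev, 2*ndev and
 * 3*ndev). A condensed sketch of that pattern, assuming bar points into
 * memory mapped by all ndev processes and starts at zero:
 *
 *   static void shmBarrier(volatile int* bar, int ndev, int phase) {
 *     int seen = *bar - 1;
 *     bool swapped;
 *     do {                        // atomically claim the next increment
 *       seen += 1;
 *       swapped = __sync_bool_compare_and_swap(bar, seen, seen+1);
 *     } while (!swapped);
 *     while (*bar < phase*ndev)   // spin until every process has arrived
 *       sched_yield();
 *     __sync_synchronize();       // order later reads after the barrier
 *   }
 */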
static ncclResult_t allocDevMem(ncclMem** ptr, size_t buffSize) {
  size_t size = offsetof(struct ncclMem, buff) + buffSize;
  cudaError_t res = cudaMalloc((void**)ptr, size);
  if (res != cudaSuccess) {
    *ptr = NULL;
    WARN("failed to allocate %lu byte device buffer", size);
    return ncclCudaMallocFailed;
  }
  if (cudaMemset(*ptr, 0, size) != cudaSuccess) {
    WARN("failed to memset device buffer.");
    cudaFree(*ptr);
    *ptr = NULL;
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}

static const int ShmMapped = 1;
static const int ShmLinked = 2;

static ncclResult_t allocHostMem(ncclMem** ptr, size_t buffSize) {
  size_t size = offsetof(struct ncclMem, buff) + buffSize;
  cudaError_t res = cudaMallocHost((void**)ptr, size);
  if (res != cudaSuccess) {
    *ptr = NULL;
    WARN("failed to allocate %lu byte host buffer", size);
    return ncclSystemError;
  }
  memset(*ptr, 0, size);
  return ncclSuccess;
}

static ncclResult_t openHostMemShm(const char* shmname, ncclMem** ptr, size_t buffSize) {
  size_t size = offsetof(struct ncclMem, buff) + buffSize;
  ncclResult_t res = shmOpen(shmname, size, (void**)ptr);
  if (res != ncclSuccess) {
    WARN("failed to allocate %lu byte shm buffer", size);
    *ptr = NULL;
    return res;
  }

  if (cudaHostRegister(*ptr, size, cudaHostRegisterMapped) != cudaSuccess) {
    WARN("failed to register host buffer");
    shmUnlink(shmname);
    shmUnmap(*ptr, size);
    *ptr = NULL;
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}

static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm) {
  char busId[13];
  nvmlDevice_t nvmlHandle;
  cudaError_t res = cudaDeviceGetPCIBusId(busId, 13, comm->cudaDev);
  if (res == cudaErrorInvalidDevice) {
    WARN("rank %d attempted to access an invalid cuda device %d", rank, comm->cudaDev);
    return ncclInvalidDeviceIndex;
  } else if (res != cudaSuccess) {
    WARN("rank %d failed to get PCI Bus Id for device %d", rank, comm->cudaDev);
    return ncclUnhandledCudaError;
  }
  INFO("rank %d using device %d (%s)", rank, comm->cudaDev, busId);

  if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
    WARN("rank %d failed to get nvml handle for device %s", rank, busId);
    return ncclUnhandledCudaError;
  }
  // Order by nvml index
  if (wrapNvmlDeviceGetIndex(nvmlHandle, (unsigned*)&info->ncclId) != ncclSuccess) {
    WARN("rank %d failed to get nvml device index for device %d", rank, comm->cudaDev);
    return ncclUnhandledCudaError;
  }

  info->rank = rank;
  info->ndev = comm->nDev;
  info->cudaDev = comm->cudaDev;
  info->pid = getpid();
  info->buffSize = comm->buffSize;
  info->hostptr = comm->hostMem;
  info->devptr = comm->devMem;
  if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) {
    WARN("rank %d failed to open CUDA IPC handle", rank);
    return ncclUnhandledCudaError;
  }

  return ncclSuccess;
}

static const int CLEANUP_NONE  = 0;
static const int CLEANUP_CUIPC = 1;
static const int CLEANUP_UNMAP = 2;

static ncclResult_t commClearMaps(ncclComm_t comm) {
  ncclResult_t res, retval = ncclSuccess;
  cudaError_t cures;

  for(int d=0; d<comm->nDev; ++d) {
    switch (comm->ptrs[d].remoteCleanup) {
      case CLEANUP_NONE:
        break;
      case CLEANUP_CUIPC:
        res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle);
        if (res != ncclSuccess) {
          WARN("rank %d failed to close IPC handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
          retval = (retval == ncclSuccess) ? res : retval;
        }
        break;
      case CLEANUP_UNMAP:
        cures = cudaHostUnregister(comm->ptrs[d].cleanupHandle);
        if (cures != cudaSuccess) {
          WARN("rank %d failed to unregister handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
        }
        res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
        if (res != ncclSuccess) {
          WARN("rank %d failed to unmap handle to rank %d",
            comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
          retval = (retval == ncclSuccess) ? res : retval;
        }
        break;
      default:
        WARN("Unknown cleanup type %d", comm->ptrs[d].remoteCleanup);
    }
    comm->ptrs[d].remoteCleanup = CLEANUP_NONE;
    comm->ptrs[d].cleanupHandle = NULL;
  }

  if (comm->userFromRing != NULL)
    memset(comm->userFromRing, 0, sizeof(int)*comm->nDev);
  if (comm->ringFromUser != NULL)
    memset(comm->ringFromUser, 0, sizeof(int)*comm->nDev);

  if (comm->devUserFromRing != NULL) {
    cudaError_t err = cudaMemset(comm->devUserFromRing, 0, sizeof(int)*comm->nDev);
    if (err != cudaSuccess) {
      WARN("Failed to clear dev map: %s", cudaGetErrorString(err));
      retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
    }
  }
  return retval;
}
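/* Illustrative only: openHostMemShm() above maps a POSIX shm segment and pins
 * it with cudaHostRegister(cudaHostRegisterMapped) so the GPU can address it
 * directly; commClearMaps() above (and commFree() later in this file) undo
 * that mapping. A minimal sketch of the same register/map/unregister cycle
 * for an ordinary heap buffer:
 *
 *   void* host = malloc(bytes);
 *   void* dev  = NULL;
 *   cudaHostRegister(host, bytes, cudaHostRegisterMapped);  // pin + map
 *   cudaHostGetDevicePointer(&dev, host, 0);  // device-visible alias of host
 *   // ... kernels may read and write through dev ...
 *   cudaHostUnregister(host);                 // must happen before free()
 *   free(host);
 */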
static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank,
    RankEntry* ranks, int* ringDirectFailed) {
  int ndev = comm->nDev;
  for(int i=0; i<ndev; ++i) {
    // Check for inconsistencies between ranks
    if (ranks[i].buffSize != comm->buffSize || ranks[i].ndev != comm->nDev) {
      commClearMaps(comm);
      return ncclRankMismatch;
    }

    // Create rank<->nccl maps
    int iRank = ranks[i].rank;
    comm->userFromRing[i] = iRank;
    comm->ringFromUser[iRank] = i;
  }

  if (cudaMemcpy(comm->devUserFromRing, comm->userFromRing, ndev*sizeof(int),
      cudaMemcpyHostToDevice) != cudaSuccess) {
    WARN("rank %d failed to copy maps to device", rank);
    commClearMaps(comm);
    return ncclUnhandledCudaError;
  }

  int myId = -1;
  for (int i=0; i<ndev; ++i) {
    if (ranks[i].rank == rank) {
      myId = i;
      break;
    }
  }
  if (myId == -1) {
    WARN("rank %d not found in communicator", rank);
    commClearMaps(comm);
    return ncclInvalidRank;
  }
  comm->ncclId = myId;

  int myDev = ranks[myId].cudaDev;
  pid_t myPid = ranks[myId].pid;
  comm->useRemoteRecv = 1; // Assume we directly write to result ptrs.

  // The order that we link with peers must ensure that
  // P2P slots are used for high-priority links first.
  for (int j=0; j<ndev; ++j) {
    int i = (myId - 1 + ndev + j) % ndev;
    int iRank = ranks[i].rank;
    int iDev = ranks[i].cudaDev;
    pid_t iPid = ranks[i].pid;
    int canpeer = 0;

    if (cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
      INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)",
        rank, myDev, iRank, iDev);
      canpeer = 0;
    }

    if (iPid == myPid) { // same process: share device pointers directly
      if (canpeer || myDev == iDev) {
        INFO("rank access %d -> %d via P2P device mem", rank, iRank);
        comm->ptrs[i].local  = ranks[myId].devptr;
        comm->ptrs[i].remote = ranks[i].devptr;
        comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
      } else { // go through hostmem
        INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
        if (j <= 2)
          *ringDirectFailed = 1;
        if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
          WARN("rank %d failed to map zero copy buffer to device", rank);
          commClearMaps(comm);
          return ncclUnhandledCudaError;
        }
        if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
          WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
          commClearMaps(comm);
          return ncclUnhandledCudaError;
        }
        comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
      }
    } else { // multi-process!
      *ringDirectFailed = 1;
      if (canpeer || myDev == iDev) {
        INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
        comm->ptrs[i].local = ranks[myId].devptr;
        if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
            ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
          WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
          commClearMaps(comm);
          return ncclUnhandledCudaError;
        }
        comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
        comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
      } else { // go through hostmem
        INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
        if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
          WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
          commClearMaps(comm);
          return ncclUnhandledCudaError;
        }
        char rankname[1024];
        sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
        if (openHostMemShm(rankname, (ncclMem**)&comm->ptrs[i].cleanupHandle,
            ranks[i].buffSize) != ncclSuccess) {
          WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
          commClearMaps(comm);
          return ncclUnhandledCudaError;
        }
        if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, comm->ptrs[i].cleanupHandle, 0) != cudaSuccess) {
          WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
          commClearMaps(comm);
          return ncclUnhandledCudaError;
        }
        comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
      }
    }
  }
  return ncclSuccess;
}
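/* Illustrative only: commBuildMaps() above picks, per peer, between direct
 * P2P device pointers (same process), CUDA IPC (same node, other process),
 * and zero-copy host shared memory. A minimal sketch of the IPC leg using
 * the runtime API (this file goes through the driver-API wrappers wrapCuIpc*):
 *
 *   // exporting process: devBuf comes from cudaMalloc
 *   cudaIpcMemHandle_t handle;
 *   cudaIpcGetMemHandle(&handle, devBuf);
 *   // ... send the handle bytes to the peer process out-of-band ...
 *
 *   // importing process
 *   void* peerBuf = NULL;
 *   cudaIpcOpenMemHandle(&peerBuf, handle, cudaIpcMemLazyEnablePeerAccess);
 *   // ... kernels may now dereference peerBuf ...
 *   cudaIpcCloseMemHandle(peerBuf);
 */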
static void initDebug() {
  const char* nccl_debug = getenv("NCCL_DEBUG");
  if (nccl_debug == NULL) {
    ncclDebugLevel = NONE;
  } else if (strcmp(nccl_debug, "WARN") == 0) {
    ncclDebugLevel = WARN;
  } else if (strcmp(nccl_debug, "INFO") == 0) {
    ncclDebugLevel = INFO;
    INFO("NCCL debug level set to INFO");
  } else if (strcmp(nccl_debug, "ABORT") == 0) {
    ncclDebugLevel = ABORT;
    INFO("NCCL debug level set to ABORT");
  }
}

static void commFree(ncclComm_t comm) {
  if (comm == NULL)
    return;

  for(int i=0; i<MAXQUEUE; ++i) {
    if (comm->events.isDone[i] != NULL)
      if (cudaEventDestroy(comm->events.isDone[i]) != cudaSuccess)
        INFO("failed to destroy cuda event %d", i);
  }

  ncclResult_t res = commClearMaps(comm);
  if (res != ncclSuccess)
    INFO("failed to cleanup comm maps");

  if (comm->userFromRing != NULL)
    free(comm->userFromRing);

  if (comm->devUserFromRing != NULL)
    if (cudaFree(comm->devUserFromRing) != cudaSuccess)
      INFO("commFree failed to free dev maps");

  if (comm->ringFromUser != NULL)
    free(comm->ringFromUser);

  if (comm->devMem != NULL && cudaFree(comm->devMem) != cudaSuccess)
    INFO("Failed to free devMap");

  if (comm->hostMem != NULL) {
    if (comm->hostMemState & ShmMapped) {
      if (cudaHostUnregister(comm->hostMem) != cudaSuccess)
        INFO("Failed to unregister hostMem");
      size_t size = offsetof(ncclMem, buff) + comm->buffSize;
      if (shmUnmap(comm->hostMem, size) != ncclSuccess)
        INFO("Failed to unmap hostMem");
      comm->hostMemState ^= ShmMapped;
    } else {
      cudaFreeHost(comm->hostMem);
    }
  }
  free(comm);
}

static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, const ncclUniqueId* commId, int rank) {
  if (ndev < 1) {
    WARN("invalid device count (%d) requested", ndev);
    return ncclUnsupportedDeviceCount;
  }
  if (rank >= ndev || rank < 0) {
    WARN("rank %d exceeds ndev=%d", rank, ndev);
    return ncclInvalidRank;
  }

  size_t commBytes = offsetof(ncclComm, ptrs) + ndev*sizeof(ncclNodeRef);
  struct ncclComm* comm = (struct ncclComm*)malloc(commBytes);
  if (comm == NULL) {
    WARN("comm allocation failed");
    return ncclSystemError;
  }
  memset(comm, 0, commBytes);

  comm->nDev = ndev;
  cudaGetDevice(&comm->cudaDev);

  const char* str = getenv("NCCL_BUFFSIZE");
  if (str != NULL) {
    errno = 0;
    comm->buffSize = strtol(str, NULL, 10);
    if (errno == ERANGE || comm->buffSize == 0) {
      INFO("rank %d invalid NCCL_BUFFSIZE: %s, using default %lu",
        rank, str, DEFAULT_BUFFER_SIZE_BYTES);
      comm->buffSize = DEFAULT_BUFFER_SIZE_BYTES;
    }
  } else {
    comm->buffSize = DEFAULT_BUFFER_SIZE_BYTES;
  }
  INFO("rank %d using buffSize = %lu", rank, comm->buffSize);

  ncclResult_t res;
  res = allocDevMem(&comm->devMem, comm->buffSize);
  if (res != ncclSuccess) {
    WARN("rank %d failed to allocate device buffer", rank);
    commFree(comm);
    return res;
  }

  if (cudaMalloc(&comm->devUserFromRing, ndev*sizeof(int)) != cudaSuccess) {
    WARN("rank %d failed to allocate device maps", rank);
    commFree(comm);
    return ncclCudaMallocFailed;
  }

  comm->userFromRing = (int*)malloc(ndev*sizeof(int));
  if (comm->userFromRing == NULL) {
    WARN("rank %d failed to allocate host maps", rank);
    commFree(comm);
    return ncclSystemError;
  }

  comm->ringFromUser = (int*)malloc(ndev*sizeof(int));
  if (comm->ringFromUser == NULL) {
    WARN("rank %d failed to allocate host maps", rank);
    commFree(comm);
    return ncclSystemError;
  }

  EventQueue* eq = &comm->events;
  for(int i=0; i<MAXQUEUE; ++i) {
    if (cudaEventCreateWithFlags(eq->isDone+i, cudaEventDisableTiming) != cudaSuccess) {
      WARN("rank %d failed to create nccl event %d", rank, i);
      commFree(comm);
      return ncclUnhandledCudaError;
    }
  }

  if (commId == NULL) {
    comm->hostMemState = 0;
    res = allocHostMem(&comm->hostMem, comm->buffSize);
  } else {
    char rankname[1024];
    sprintf(rankname, "%s-%d", commId->internal, rank);
    res = openHostMemShm(rankname, &comm->hostMem, comm->buffSize);
    comm->hostMemState = ShmMapped | ShmLinked;
  }
  if (res != ncclSuccess) {
    WARN("rank %d failed to allocate host buffer", rank);
    commFree(comm);
    return res;
  }

  *comret = comm;
  return ncclSuccess;
}

static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int rank) {
  char rankname[1024];
  sprintf(rankname, "%s-%d", commId.internal, rank);
  if (comm->hostMemState & ShmLinked)
    comm->hostMemState ^= ShmLinked;
  return shmUnlink(rankname);
}
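/* Illustrative only: commAlloc() above sizes its staging buffer from the
 * NCCL_BUFFSIZE environment variable (decimal bytes) and initDebug() reads
 * NCCL_DEBUG (WARN, INFO or ABORT). A sketch of overriding both from the
 * application before the first init call; the values are arbitrary examples:
 *
 *   setenv("NCCL_DEBUG", "INFO", 1);
 *   setenv("NCCL_BUFFSIZE", "4194304", 1);  // 4 MiB staging buffer
 *   ncclCommInitAll(comms, ndev, devlist);  // env is read during init
 */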
extern "C" DSOGLOBAL
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
  if (strlen(commId.internal) < 1 ||
      strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) {
    WARN("rank %d invalid commId", myrank);
    return ncclInvalidArgument;
  }

  initDebug();
  ncclResult_t res;
  RankEntry myStuff;
  RankGather* gath = NULL;

  res = wrapSymbols();
  if (res != ncclSuccess) {
    WARN("NCCL failed to initialize client libs");
    return res;
  }

  res = wrapNvmlInit();
  if (res != ncclSuccess) {
    WARN("rank %d failed to initialize nvml", myrank);
    return res;
  }

  res = commAlloc(newcomm, ndev, &commId, myrank);
  if (res != ncclSuccess) {
    WARN("rank %d failed to allocate communicator", myrank);
    return res;
  }

  res = populateRankInfo(&myStuff, myrank, *newcomm);
  if (res != ncclSuccess) {
    WARN("rank %d failed to obtain rank info", myrank);
    goto cleanup;
  }

  res = initGather(&gath, commId, ndev, myrank, myStuff);
  if (res != ncclSuccess) {
    WARN("rank %d failed to gather rank info", myrank);
    goto cleanup;
  }

  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->ringDirectFail);
  if (res != ncclSuccess) {
    WARN("rank %d failed to build comm maps", myrank);
    goto cleanup;
  }

  syncRingDirect(gath, &((*newcomm)->useRemoteRecv));
  INFO("PushToRecv algos are %s\n", (*newcomm)->useRemoteRecv ? "enabled" : "disabled");

  res = closeGather(gath, ndev); // includes a barrier
  gath = NULL;
  if (res != ncclSuccess) {
    WARN("rank %d failed to close gather", myrank);
    goto cleanup;
  }

  goto final;

  cleanup:
  if (gath != NULL)
    closeGather(gath, ndev);
  if ((*newcomm)->hostMemState & ShmLinked) {
    if (commUnlinkHostMem(*newcomm, commId, myrank) != ncclSuccess)
      INFO("rank %d failed to unlink host mem shm segment", myrank);
  }
  commFree(*newcomm);
  *newcomm = NULL; // do not hand the caller a freed communicator

  final:
  if (*newcomm != NULL && ((*newcomm)->hostMemState & ShmLinked)) {
    if (commUnlinkHostMem(*newcomm, commId, myrank) != ncclSuccess)
      INFO("rank %d failed to unlink host mem shm segment", myrank);
  }

  if (wrapNvmlShutdown() != ncclSuccess)
    INFO("rank %d did not shutdown nvml properly", myrank);
  return res;
}
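/* Illustrative only: every public entry point here returns an ncclResult_t.
 * A typical application-side check, using ncclGetErrorString() defined below;
 * the NCCLCHECK macro name is just an example, not part of the library:
 *
 *   #define NCCLCHECK(call) do {                            \
 *     ncclResult_t r = (call);                              \
 *     if (r != ncclSuccess) {                               \
 *       fprintf(stderr, "NCCL failure %s:%d '%s'\n",        \
 *               __FILE__, __LINE__, ncclGetErrorString(r)); \
 *       exit(EXIT_FAILURE);                                 \
 *     }                                                     \
 *   } while (0)
 *
 *   NCCLCHECK(ncclCommInitRank(&comm, nranks, id, rank));
 */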
extern "C" DSOGLOBAL
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
  initDebug();
  ncclResult_t res;
  int savedDevice;
  RankEntry* ranks = NULL;
  int rank, cudaDev;
  ncclComm_t comm = NULL;
  char busId[13];
  nvmlDevice_t nvmlHandle;
  int affinity_set = 0;
  int ringDirectFail = 0; // Assume direct access to recv ptr OK

  res = wrapSymbols();
  if (res != ncclSuccess) {
    WARN("NCCL failed to initialize client libs");
    return res;
  }

  cudaGetDevice(&savedDevice);
  ranks = (RankEntry*)malloc(ndev*sizeof(RankEntry));
  if (ranks == NULL) {
    WARN("NCCL allocation failed");
    return ncclSystemError;
  }
  memset(ranks, 0, ndev*sizeof(RankEntry));

  res = wrapNvmlInit();
  if (res != ncclSuccess) {
    WARN("nccl failed to initialize nvml");
    return res;
  }

  for(rank=0; rank<ndev; ++rank)
    comms[rank] = NULL;

  for(rank=0; rank<ndev; ++rank) {
    cudaDev = (devlist == NULL) ? rank : devlist[rank];
    if (cudaSetDevice(cudaDev) != cudaSuccess) {
      WARN("rank %d failed to set cuda device %d", rank, cudaDev);
      res = ncclInvalidDeviceIndex;
      goto cleanup;
    }

    // Pin the host thread near this GPU while allocating its resources
    affinity_set = 0;
    if (cudaDeviceGetPCIBusId(busId, 13, cudaDev) == cudaSuccess
        && wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) == ncclSuccess
        && wrapNvmlDeviceSetCpuAffinity(nvmlHandle) == ncclSuccess) {
      affinity_set = 1;
    } else {
      INFO("rank %d failed to set cpu affinity", rank);
    }

    res = commAlloc(&comm, ndev, NULL, rank);
    if (res != ncclSuccess) {
      WARN("rank %d failed to allocate communicator", rank);
      goto cleanup;
    }
    comms[rank] = comm;

    if (affinity_set && wrapNvmlDeviceClearCpuAffinity(nvmlHandle) != ncclSuccess)
      INFO("rank %d set but failed to clear cpu affinity", rank);

    res = populateRankInfo(ranks+rank, rank, comm);
    if (res != ncclSuccess) {
      WARN("rank %d failed to obtain rank info", rank);
      goto cleanup;
    }
  }

  orderRanks(ranks, ndev);
  for(rank=0; rank<ndev; ++rank) {
    comm = comms[rank];
    cudaSetDevice(comm->cudaDev);
    res = commBuildMaps(comm, NULL, rank, ranks, &ringDirectFail);
    if (res != ncclSuccess) {
      WARN("rank %d failed to build comm maps", rank);
      goto cleanup;
    }
  }

  INFO("PushToRecv algos are %s\n", (ringDirectFail) ? "disabled" : "enabled");
  for(rank=0; rank<ndev; ++rank) {
    comms[rank]->useRemoteRecv = ringDirectFail ? 0 : 1;
  }

  free(ranks);
  ranks = NULL;
  res = ncclSuccess;
  goto final;

  cleanup:
  if (ranks != NULL)
    free(ranks);
  for(rank=0; rank<ndev; ++rank) {
    if (comms[rank] != NULL) {
      commFree(comms[rank]);
      comms[rank] = NULL;
    }
  }

  final:
  if (wrapNvmlShutdown() != ncclSuccess)
    INFO("NCCL did not shutdown nvml properly");
  cudaSetDevice(savedDevice);
  return res;
}

extern "C" DSOGLOBAL
void ncclCommDestroy(ncclComm_t comm) {
  if (comm == NULL)
    return;

  int savedDevice;
  cudaGetDevice(&savedDevice);
  int commDevice = comm->cudaDev;

  if (savedDevice != commDevice) {
    CUDACHECK(cudaSetDevice(commDevice));
  }

  commFree(comm);

  if (savedDevice != commDevice)
    cudaSetDevice(savedDevice);
}

extern "C" DSOGLOBAL
const char* ncclGetErrorString(ncclResult_t code) {
  switch (code) {
  case ncclSuccess                : return "no error";
  case ncclUnhandledCudaError     : return "unhandled cuda error";
  case ncclSystemError            : return "system error";
  case ncclInternalError          : return "internal error";
  case ncclInvalidDevicePointer   : return "invalid device pointer";
  case ncclInvalidRank            : return "invalid rank";
  case ncclUnsupportedDeviceCount : return "unsupported device count";
  case ncclDeviceNotFound         : return "device not found";
  case ncclInvalidDeviceIndex     : return "invalid device index";
  case ncclLibWrapperNotSet       : return "lib wrapper not initialized";
  case ncclCudaMallocFailed       : return "cuda malloc failed";
  case ncclRankMismatch           : return "parameter mismatch between ranks";
  case ncclInvalidArgument        : return "invalid argument";
  case ncclInvalidType            : return "invalid data type";
  case ncclInvalidOperation       : return "invalid reduction operation";
  }
  return "unknown result code";
}

extern "C" DSOGLOBAL
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
  *count = comm->nDev;
  return ncclSuccess;
}

extern "C" DSOGLOBAL
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
  *devid = comm->cudaDev;
  return ncclSuccess;
}

extern "C" DSOGLOBAL
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
  *rank = comm->userFromRing[comm->ncclId];
  return ncclSuccess;
}
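/* Illustrative only: a minimal single-process, multi-GPU setup and teardown
 * using the entry points defined above; four GPUs (devices 0-3) are assumed.
 *
 *   int devs[4] = {0, 1, 2, 3};
 *   ncclComm_t comms[4];
 *   ncclCommInitAll(comms, 4, devs);   // one communicator per GPU
 *   // ... launch one host thread or stream per GPU and issue collectives ...
 *   for (int i = 0; i < 4; ++i)
 *     ncclCommDestroy(comms[i]);
 */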