/************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "param.h" #define NCCL_MAX_SCORE 7 /* Parse user defined rings. Format is like : * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" * Rings with a non-matching number of ranks are ignored so we can provide * rings for multiple cases. */ #define MAX_ENV_RANKS 512 /* parseRings walks str one character at a time. Consecutive decimal digits are accumulated into ranks[rank]; any other character ends the current number, and parsing jumps to the end label once MAX_ENV_RANKS numbers have been stored. A pipe character or the end of the string closes the current ring, which is skipped when its number of ranks differs from nranks. NOTE(review): ranks[rank-1] is read before the rank != nranks check, so an empty segment (rank == 0) reads ranks[-1] - confirm and guard. NOTE(review): this copy of the file looks damaged (line breaks collapsed and several spans of code missing, e.g. the loop headers below); restore it from the pristine source before relying on it. */ static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) { int ranks[MAX_ENV_RANKS]; int nrings = 0; int rank = 0; int offset = 0; int status = 0; // 0 : between numbers, 1 : inside number do { int digit = str[offset] - '0'; if (digit >= 0 && digit <= 9) { if (status == 0) { ranks[rank] = digit; status = 1; } else { ranks[rank] = ranks[rank]*10+digit; } } else { if (status == 1) { rank++; if (rank == MAX_ENV_RANKS) goto end; } status = 0; if (str[offset] == '|' || str[offset] == '\0') { int prevRank = ranks[rank-1]; // Ignore rings if nranks doesn't match if (rank != nranks) goto newring; for (int r=0; r= nranks) goto newring; // Ignore rings with duplicate ranks for (int i=0; i * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 * * Connecting P2P domains with shared memory <13> * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 * * Connecting ranks (only inside the P2P domain) <13> * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 
-1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 * * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net * which always sees the full world). That way, P2P can bruteforce all combinations inside the node without * risking to explode in terms of combinations, and we scale better. * * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until * we get at least one ring. */ /* recIsConnected marks connected[rank] and then iterates over ranks; the rest of its body is not visible in this copy. */ static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) { connected[rank] = 1; for (int r=0; r MAXCHANNELS) newNrings = MAXCHANNELS; for (int r=nrings; r0) { int ret = parseRings(str, nrings, nranks, prev, next); /* When parseRings succeeds and yields at least one ring, rank 0 logs the ring count and getEnvThreads(nthreads) is called. */ if (ret == ncclSuccess && *nrings > 0) { if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings); NCCLCHECK(getEnvThreads(nthreads)); for (int r = 0; r<*nrings; r++) { for (int i = 0; iindex tables int* coords, *globalIdxToRank, *globalRankToIdx; NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS)); for (int i=0; i=0; t--) { for (int i=0; i t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0; if (!sameLocal) continue; groups[nidx] = coords[r*NTRANSPORTS+t]; subgroups[nidx] = t ? 
coords[r*NTRANSPORTS+t-1] : nidx; rankToIdx[r] = nidx; idxToRank[nidx] = r; nidx++; } int ngroups = groups[nidx-1] + 1; // Coords should be ordered ncclTvalue_t* subvalues; int *subprev, *subnext; NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx)); NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp)); NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp)); if (ngroups > 1) { /* Extract subvalues */ for (int i=0; i *nrings) { *nrings = nringsTmp; for (int i=0; i 1 && nvlink) { *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut); } /* Fallback when no ring was built: use one ring in which this rank is linked to rank-1 and rank+1, modulo nranks. */ if (*nrings == 0) { WARN("Could not create rings, falling back on simple ring"); *nrings = 1; prev[rank] = (rank-1+nranks) % nranks; next[rank] = (rank+1)%nranks; } /* Apply the min/max ring counts returned by ncclParamMinNrings and ncclParamMaxNrings: a minimum above the maximum is ignored, a minimum above MAXCHANNELS is clamped, a positive maximum not above the current count truncates it, and otherwise rings are duplicated with copyRings up to the effective minimum (at least 2 when ncclCudaCompCap() returns 3, else 1). */ int maxNrings = ncclParamMaxNrings(); int minNrings = ncclParamMinNrings(); if (maxNrings > 0 && minNrings > maxNrings) { if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS"); minNrings = 0; } if (minNrings > MAXCHANNELS) { if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS); minNrings = MAXCHANNELS; } if (maxNrings > 0 && maxNrings <= *nrings) { if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); *nrings = maxNrings; } else { int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1; if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; if (minNrings > 0 && minNrings > *nrings) { if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings); *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut); } } NCCLCHECK(getEnvThreads(nthreads)); return ncclSuccess; }