nccl/src/misc/rings.cc
David Addison 0ceaec9cee NCCL 2.4.7-1
Performance tweaks for PowerPC builds only;
      Set default NCCL_MIN_NRINGS to 4
      Disable PCI-E NUMA distance detection
2019-05-10 13:52:16 -07:00

392 lines
14 KiB
C++

/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "param.h"
#define NCCL_MAX_SCORE 7
/* Parse user defined rings. Format is like :
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
* Rings with a non-matching number of ranks are ignored so we can provide
* rings for multiple cases.
*/
#define MAX_ENV_RANKS 512
static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
int ranks[MAX_ENV_RANKS];
int nrings = 0;
int rank = 0;
int offset = 0;
int status = 0; // 0 : between numbers, 1 : inside number
do {
int digit = str[offset] - '0';
if (digit >= 0 && digit <= 9) {
if (status == 0) {
ranks[rank] = digit;
status = 1;
} else {
ranks[rank] = ranks[rank]*10+digit;
}
} else {
if (status == 1) {
rank++;
if (rank == MAX_ENV_RANKS) goto end;
}
status = 0;
if (str[offset] == '|' || str[offset] == '\0') {
int prevRank = ranks[rank-1];
// Ignore rings if nranks doesn't match
if (rank != nranks) goto newring;
for (int r=0; r<nranks; r++) {
int rank = ranks[r];
// Ignore rings with ranks out of bounds
if (rank < 0 || rank >= nranks) goto newring;
// Ignore rings with duplicate ranks
for (int i=0; i<r; i++)
if (ranks[i] == rank) goto newring;
next[nrings*nranks+prevRank] = rank;
prev[nrings*nranks+rank] = prevRank;
prevRank = rank;
}
nrings++;
newring:
rank = 0;
}
}
} while (str[offset++] != 0);
end:
*nringsRet = nrings;
return ncclSuccess;
}
/*
* Ring creation algorithm
*
* First, we establish hierarchical coordinates depending on the way ranks can
* communicate. After fillCoords, we have for each rank a unique 3-int array
* { node, pci_domain, rank } corresponding to the three transports :
* { 2[NET], 1[SHM], 0[P2P] }.
* Also, we renumber ranks (to indexes) based on their growing coordinates.
*
* Then, we ask transports to connect groups together. We start with net, then
* shm, then p2p. We maintain two arrays, prev and next, where values are equal
* to -1 when ranks are not yet connected, and a rank otherwise. We never
* connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
* ranks, if we are rank 13, we should see something like (provided we have a
* single net interface, hence a single ring) :
*
* Connecting all nodes <13>
* 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
*
* Connecting P2P domains with shared memory <13>
* 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
*
* Connecting ranks (only inside the P2P domain) <13>
* 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
*
* Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
* which always sees the full world). That way, P2P can bruteforce all combinations inside the node without
* risking to explode in terms of combinations, and we scale better.
*
* Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
* we get at least one ring.
*/
static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
connected[rank] = 1;
for (int r=0; r<nranks; r++) {
if (connected[r] == 0 && matrix[rank*nranks+r] == transport) {
recIsConnected(r, connected, nranks, matrix, transport);
}
}
}
static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
for (int r=0; r<nranks; r++) connected[r] = 0;
recIsConnected(rank, connected, nranks, matrix, transport);
}
#define NEW_IDX(rank) do { \
rankToIdx[rank] = idx; \
idxToRank[idx] = rank; \
for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
idx++; \
} while (0)
int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
for (int r=0; r<nranks; r++) {
if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r;
}
return -1;
}
static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
int current[NTRANSPORTS];
int* p2pConnected;
NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
int curRank = 0, idx = 0;
while (1) {
// P2P is handled separately as there is no level below it and we need to
// cover the case of being connected to another GPU indirectly.
// So we detect all GPUs in the same P2P domain once and add them all at
// once.
isConnected(curRank, p2pConnected, nranks, matrix, 0);
for (int r=0; r<nranks; r++) {
if (p2pConnected[r]) {
NEW_IDX(r);
curRank = r;
current[0]++;
}
}
current[0] = 0;
if (idx == nranks) {
free(p2pConnected);
return ncclSuccess;
}
// Find next group, either connected through SHM or NET.
int rank;
int transport = 1;
while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
current[transport] = 0;
transport++;
if (transport == NTRANSPORTS) {
WARN("Error : Could not find transport to connect next group\n");
free(p2pConnected);
return ncclInternalError; }
}
curRank = rank;
current[transport]++;
}
}
#ifdef __PPC__
// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
#define DEFAULT_MIN_NRINGS 4
#else
#define DEFAULT_MIN_NRINGS 0
#endif
NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
/* Users can force the number of threads with an environment variable */
NCCL_PARAM(Nthreads, "NTHREADS", -2);
ncclResult_t getEnvThreads(int* nthreads) {
int64_t nt = ncclParamNthreads();
if (nt != -2)
*nthreads = nt;
return ncclSuccess;
}
static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
for (int r=nrings; r<newNrings; r++) {
for (int i=0; i<nranks; i++) {
a[r*nranks+i] = a[(r-nrings)*nranks+i];
b[r*nranks+i] = b[(r-nrings)*nranks+i];
c[r*nranks+i] = c[(r-nrings)*nranks+i];
d[r*nranks+i] = d[(r-nrings)*nranks+i];
}
}
return newNrings;
}
/* Main ring creation function */
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
*nrings = 0;
if (nranks == 1) return ncclSuccess;
char* str = getenv("NCCL_RINGS");
if (str && strlen(str)>0) {
int ret = parseRings(str, nrings, nranks, prev, next);
if (ret == ncclSuccess && *nrings > 0) {
if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
NCCLCHECK(getEnvThreads(nthreads));
for (int r = 0; r<*nrings; r++) {
for (int i = 0; i<nranks; i++) {
if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
}
}
return ncclSuccess;
}
if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
*nrings = 0;
}
// Compute hierarchical topology groups, indexes, and rank<->index tables
int* coords, *globalIdxToRank, *globalRankToIdx;
NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
// Start with a high score, then decrease until we find rings
int minScore = NCCL_MAX_SCORE;
int nringsTmp;
int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&idxToRank, nranks));
NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
NCCLCHECK(ncclCalloc(&groups, nranks));
NCCLCHECK(ncclCalloc(&subgroups, nranks));
int nThreads;
do {
nThreads = *nthreads;
for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
nringsTmp = MAXCHANNELS;
// Loop over transports to connect groups
for (int t=NTRANSPORTS-1; t>=0; t--) {
for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
int nidx = 0;
for (int i=0; i<nranks; i++) {
// Extract only ranks in the same local area as rank
// We need to extract them in the topological order, hence we iterate over indexes, not ranks
int r = globalIdxToRank[i];
int sameLocal = 1;
for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
if (!sameLocal) continue;
groups[nidx] = coords[r*NTRANSPORTS+t];
subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
rankToIdx[r] = nidx;
idxToRank[nidx] = r;
nidx++;
}
int ngroups = groups[nidx-1] + 1; // Coords should be ordered
ncclTvalue_t* subvalues;
int *subprev, *subnext;
NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
if (ngroups > 1) {
/* Extract subvalues */
for (int i=0; i<nidx; i++) {
for (int j=0; j<nidx; j++) {
if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
else
subvalues[i*nidx+j] = 0;
}
}
/* Extract subprev/subnext */
for (int i=0; i<nidx*nringsTmp; i++) {
subprev[i] = subnext[i] = -1;
}
for (int r=0; r<nringsTmp; r++) {
int start = -1, end = -1;
for (int i=0; i<nranks; i++) {
if (rankToIdx[i] == -1) continue;
if (prevTmp[r*nranks+i] != -1) start = i;
if (nextTmp[r*nranks+i] != -1) end = i;
}
if (start != -1 && end != -1) {
subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
}
}
/* Get rings */
NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
/* Merge subprev/subnext into prev/next */
for (int r=0; r<nringsTmp; r++) {
for (int i=0; i<nidx; i++) {
if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
if (t == NTRANSPORTS-1) {
// Save node-level masters for trees
treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
}
}
}
//for (int r=0; r<nringsTmp; r++) {
//printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
//printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
//}
}
free(subvalues);
free(subprev);
free(subnext);
if (nringsTmp == 0) break;
}
minScore--;
if (nringsTmp > *nrings) {
*nrings = nringsTmp;
for (int i=0; i<nranks*(*nrings); i++) {
prev[i] = prevTmp[i];
next[i] = nextTmp[i];
}
}
} while (nringsTmp == 0 && minScore);
free(coords);
free(globalRankToIdx);
free(globalIdxToRank);
free(prevTmp);
free(nextTmp);
free(idxToRank);
free(rankToIdx);
free(groups);
free(subgroups);
*nthreads = nThreads;
/* Duplicate the rings in case of multinode+NVLink */
int nnodes = 0;
for (int r=0; r<nranks; r++) nnodes += treeIn[r];
int nvlink;
NCCLCHECK(ncclNvlinkGpu(&nvlink));
if (nnodes > 1 && nvlink) {
*nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
}
if (*nrings == 0) {
WARN("Could not create rings, falling back on simple ring");
*nrings = 1;
prev[rank] = (rank-1+nranks) % nranks;
next[rank] = (rank+1)%nranks;
}
int maxNrings = ncclParamMaxNrings();
int minNrings = ncclParamMinNrings();
if (maxNrings > 0 && minNrings > maxNrings) {
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
minNrings = 0;
}
if (minNrings > MAXCHANNELS) {
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
minNrings = MAXCHANNELS;
}
if (maxNrings > 0 && maxNrings <= *nrings) {
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
*nrings = maxNrings;
} else {
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
if (minNrings > 0 && minNrings > *nrings) {
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
*nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
}
}
NCCLCHECK(getEnvThreads(nthreads));
return ncclSuccess;
}