nccl/test/single/reduce_scatter_test.cu
2016-09-22 11:58:28 -07:00

282 lines
8.3 KiB
Plaintext

/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
************************************************************************/
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
#include "nccl.h"
#include "test_utilities.h"
int errors = 0;
double min_bw = 10000.0;
bool is_reduction = true;
template<typename T>
void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
const ncclRedOp_t op, ncclComm_t* const comms, const std::vector<int>& dList) {
// initialize data
int nDev = 0;
NCCLCHECK(ncclCommCount(comms[0], &nDev));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
T* buffer = (T*)malloc(N * nDev * sizeof(T));
T* result = (T*)malloc(N * nDev * sizeof(T));
memset(buffer, 0, N * nDev * sizeof(T));
memset(result, 0, N * nDev * sizeof(T));
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaStreamCreate(s+i));
CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
Randomize(sendbuff[i], N * nDev, i);
if (i == 0) {
CUDACHECK(cudaMemcpy(result, sendbuff[i], N * nDev * sizeof(T),
cudaMemcpyDeviceToHost));
} else {
Accumulate<T>(result, sendbuff[i], N * nDev, op);
}
}
// warm up GPU
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaStreamSynchronize(s[i]));
}
// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
{
int n = N;
printf("%12i %12i %6s %6s", (int)(n * sizeof(T)), n,
TypeName(type).c_str(), OperationName(op).c_str());
// do out-of-place reduction first
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
op, comms[i], s[i]));
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaStreamSynchronize(s[i]));
}
auto stop = std::chrono::high_resolution_clock::now();
double elapsedSec =
std::chrono::duration_cast<std::chrono::duration<double>>(
stop - start).count();
double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
double busbw = algbw * (double)(nDev - 1);
double maxDelta = 0.0;
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
double tmpDelta = CheckDelta<T>(recvbuff[i], result+i*n, n);
maxDelta = std::max(tmpDelta, maxDelta);
}
printf(" %7.3f %5.2f %5.2f %7.0le", elapsedSec * 1.0E3, algbw, busbw,
maxDelta);
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
if (busbw < min_bw) min_bw = busbw;
}
{
// now do in-place reduction
int n = N;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
op, comms[i], s[i]));
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaStreamSynchronize(s[i]));
}
auto stop = std::chrono::high_resolution_clock::now();
double elapsedSec =
std::chrono::duration_cast<std::chrono::duration<double>>(
stop - start).count();
double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
double busbw = algbw * (double)(nDev - 1);
double maxDelta = 0.0;
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
double tmpDelta = CheckDelta<T>(sendbuff[i], result+i*n, n);
maxDelta = std::max(tmpDelta, maxDelta);
}
printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
maxDelta);
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
if (busbw < min_bw) min_bw = busbw;
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaStreamDestroy(s[i]));
}
free(s);
free(buffer);
free(result);
}
template<typename T>
void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
const std::vector<int>& dList) {
int nDev = 0;
NCCLCHECK(ncclCommCount(comms[0], &nDev));
T** sendbuff = (T**)malloc(nDev * sizeof(T*));
T** recvbuff = (T**)malloc(nDev * sizeof(T*));
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaMalloc(sendbuff + i, N * nDev * sizeof(T)));
CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
}
for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
// for (ncclRedOp_t op : { ncclSum }) {
RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList);
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(dList[i]));
CUDACHECK(cudaFree(sendbuff[i]));
CUDACHECK(cudaFree(recvbuff[i]));
}
free(sendbuff);
free(recvbuff);
}
void usage() {
printf("Tests nccl ReduceScatter with user supplied arguments.\n"
" Usage: all_reduce_test <data size in bytes> [number of GPUs] "
"[GPU 0] [GPU 1] ...\n\n");
}
int main(int argc, char* argv[]) {
int nVis = 0;
CUDACHECK(cudaGetDeviceCount(&nVis));
int N = 0;
if (argc > 1) {
int t = sscanf(argv[1], "%d", &N);
if (t == 0) {
printf("Error: %s is not an integer!\n\n", argv[1]);
usage();
exit(EXIT_FAILURE);
}
} else {
printf("Error: must specify at least data size in bytes!\n\n");
usage();
exit(EXIT_FAILURE);
}
int nDev = nVis;
if (argc > 2) {
int t = sscanf(argv[2], "%d", &nDev);
if (t == 0) {
printf("Error: %s is not an integer!\n\n", argv[1]);
usage();
exit(EXIT_FAILURE);
}
}
std::vector<int> dList(nDev);
for (int i = 0; i < nDev; ++i)
dList[i] = i % nVis;
if (argc > 3) {
if (argc - 3 != nDev) {
printf("Error: insufficient number of GPUs in list\n\n");
usage();
exit(EXIT_FAILURE);
}
for (int i = 0; i < nDev; ++i) {
int t = sscanf(argv[3 + i], "%d", dList.data() + i);
if (t == 0) {
printf("Error: %s is not an integer!\n\n", argv[2 + i]);
usage();
exit(EXIT_FAILURE);
}
}
}
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
printf("# Using devices\n");
for (int g = 0; g < nDev; ++g) {
int cudaDev;
int rank;
cudaDeviceProp prop;
NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
NCCLCHECK(ncclCommUserRank(comms[g], &rank));
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
prop.pciBusID, prop.name);
}
printf("\n");
printf("# %10s %12s %6s %6s out-of-place "
"in-place\n", "", "", "", "");
printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n",
"bytes", "N", "type", "op", "time", "algbw", "busbw", "delta", "time",
"algbw", "busbw", "delta");
RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
#ifdef CUDA_HAS_HALF
RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
#endif
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
printf("\n");
for(int i=0; i<nDev; ++i)
ncclCommDestroy(comms[i]);
free(comms);
char* str = getenv("NCCL_TESTS_MIN_BW");
double check_min_bw = str ? atof(str) : -1;
printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
printf(" Min bus bandwidth : %g %s\n", min_bw, check_min_bw == -1 ? "" : (min_bw < check_min_bw ? "FAILED" : "OK"));
printf("\n");
if (errors || min_bw < check_min_bw)
exit(EXIT_FAILURE);
else
exit(EXIT_SUCCESS);
}