Make tests check for deltas and report bandwidth
This commit is contained in:
parent
cabd6848e4
commit
6c77476cc1
@ -9,6 +9,7 @@
|
||||
#define SRC_TEST_UTILITIES_H_
|
||||
|
||||
#include <curand.h>
|
||||
#include <cerrno>
|
||||
#include <string>
|
||||
|
||||
#define CUDACHECK(cmd) do { \
|
||||
@ -135,6 +136,27 @@ void Randomize<half>(half* const dest, const int N, const int randomSeed) {
|
||||
}
|
||||
#endif
|
||||
|
||||
void makeRandom(void* ptr, int n, ncclDataType_t type, int seed) {
|
||||
if (type == ncclChar)
|
||||
Randomize<char>((char*)ptr, n, seed);
|
||||
else if (type == ncclInt)
|
||||
Randomize<int>((int*)ptr, n, seed);
|
||||
#ifdef CUDA_HAS_HALF
|
||||
else if (type == ncclHalf)
|
||||
Randomize<half>((half*)ptr, n, seed);
|
||||
#endif
|
||||
else if (type == ncclFloat)
|
||||
Randomize<float>((float*)ptr, n, seed);
|
||||
else if (type == ncclDouble)
|
||||
Randomize<double>((double*)ptr, n, seed);
|
||||
else if (type == ncclInt64)
|
||||
Randomize<long long>((long long*)ptr, n, seed);
|
||||
else if (type == ncclUint64)
|
||||
Randomize<unsigned long long>((unsigned long long*)ptr, n, seed);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
template<typename T, int OP> __global__ static
|
||||
void accumKern(T* acum, const T* contrib, int N) {
|
||||
int tid = threadIdx.x + blockIdx.x*blockDim.x;
|
||||
@ -200,22 +222,44 @@ void accumKern<half, ncclMin>(half* acum, const half* contrib, int N) {
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
void accVecType(void* out, void* in, int n, ncclRedOp_t op) {
|
||||
switch(op) {
|
||||
case ncclSum: accumKern<T, ncclSum> <<<256,256>>>((T*)out, (T*)in, n); break;
|
||||
case ncclProd: accumKern<T, ncclProd><<<256,256>>>((T*)out, (T*)in, n); break;
|
||||
case ncclMax: accumKern<T, ncclMax> <<<256,256>>>((T*)out, (T*)in, n); break;
|
||||
case ncclMin: accumKern<T, ncclMin> <<<256,256>>>((T*)out, (T*)in, n); break;
|
||||
default:
|
||||
printf("Unknown reduction operation.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op) {
|
||||
|
||||
T* devdest;
|
||||
CUDACHECK(cudaHostRegister(dest, N*sizeof(T), 0));
|
||||
CUDACHECK(cudaHostGetDevicePointer(&devdest, dest, 0));
|
||||
switch(op) {
|
||||
case ncclSum: accumKern<T, ncclSum> <<<256,256>>>(devdest, contrib, N); break;
|
||||
case ncclProd: accumKern<T, ncclProd><<<256,256>>>(devdest, contrib, N); break;
|
||||
case ncclMax: accumKern<T, ncclMax> <<<256,256>>>(devdest, contrib, N); break;
|
||||
case ncclMin: accumKern<T, ncclMin> <<<256,256>>>(devdest, contrib, N); break;
|
||||
accVecType<T>((void*)devdest, (void*)contrib, N, op);
|
||||
CUDACHECK(cudaHostUnregister(dest));
|
||||
}
|
||||
|
||||
void accVec(void* out, void* in, int n, ncclDataType_t type, ncclRedOp_t op) {
|
||||
switch (type) {
|
||||
case ncclChar: accVecType<char> (out, in, n, op); break;
|
||||
case ncclInt: accVecType<int> (out, in, n, op); break;
|
||||
#ifdef CUDA_HAS_HALF
|
||||
case ncclHalf: accVecType<half> (out, in, n, op); break;
|
||||
#endif
|
||||
case ncclFloat: accVecType<float> (out, in, n, op); break;
|
||||
case ncclDouble: accVecType<double> (out, in, n, op); break;
|
||||
case ncclInt64: accVecType<long long> (out, in, n, op); break;
|
||||
case ncclUint64: accVecType<unsigned long long> (out, in, n, op); break;
|
||||
default:
|
||||
printf("Unknown reduction operation.\n");
|
||||
printf("Unknown reduction type.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
CUDACHECK(cudaHostUnregister(dest));
|
||||
}
|
||||
|
||||
template<typename T> __device__
|
||||
@ -270,6 +314,22 @@ double CheckDelta(const T* results, const T* expected, int N) {
|
||||
return maxerr;
|
||||
}
|
||||
|
||||
void maxDiff(double* max, void* first, void* second, int n, ncclDataType_t type, cudaStream_t s) {
|
||||
switch (type) {
|
||||
case ncclChar: deltaKern<char, 512> <<<1,512,0,s>>>((char*)first, (char*)second, n, max); break;
|
||||
case ncclInt: deltaKern<int, 512> <<<1,512,0,s>>>((int*)first, (int*)second, n, max); break;
|
||||
#ifdef CUDA_HAS_HALF
|
||||
case ncclHalf: deltaKern<half, 512> <<<1,512,0,s>>>((half*)first, (half*)second, n, max); break;
|
||||
#endif
|
||||
case ncclFloat: deltaKern<float, 512> <<<1,512,0,s>>>((float*)first, (float*)second, n, max); break;
|
||||
case ncclDouble: deltaKern<double, 512> <<<1,512,0,s>>>((double*)first, (double*)second, n, max); break;
|
||||
case ncclInt64: deltaKern<long long, 512> <<<1,512,0,s>>>((long long*)first, (long long*)second, n, max); break;
|
||||
case ncclUint64: deltaKern<unsigned long long, 512><<<1,512,0,s>>>((unsigned long long*)first, (unsigned long long*)second, n, max); break;
|
||||
default:
|
||||
printf("Unknown reduction type.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
std::string TypeName(const ncclDataType_t type) {
|
||||
switch (type) {
|
||||
@ -296,5 +356,83 @@ std::string OperationName(const ncclRedOp_t op) {
|
||||
}
|
||||
}
|
||||
|
||||
ncclDataType_t strToType(const char* s) {
|
||||
if (strcmp(s, "char") == 0)
|
||||
return ncclChar;
|
||||
if (strcmp(s, "int") == 0)
|
||||
return ncclInt;
|
||||
#ifdef CUDA_HAS_HALF
|
||||
if (strcmp(s, "half") == 0)
|
||||
return ncclHalf;
|
||||
#endif
|
||||
if (strcmp(s, "float") == 0)
|
||||
return ncclFloat;
|
||||
if (strcmp(s, "double") == 0)
|
||||
return ncclDouble;
|
||||
if (strcmp(s, "int64") == 0)
|
||||
return ncclInt64;
|
||||
if (strcmp(s, "uint64") == 0)
|
||||
return ncclUint64;
|
||||
|
||||
return nccl_NUM_TYPES;
|
||||
}
|
||||
|
||||
size_t wordSize(ncclDataType_t type) {
|
||||
switch(type) {
|
||||
case ncclChar: return sizeof(char);
|
||||
case ncclInt: return sizeof(int);
|
||||
#ifdef CUDA_HAS_HALF
|
||||
case ncclHalf: return sizeof(short);
|
||||
#endif
|
||||
case ncclFloat: return sizeof(float);
|
||||
case ncclDouble: return sizeof(double);
|
||||
case ncclInt64: return sizeof(long long);
|
||||
case ncclUint64: return sizeof(unsigned long long);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
double deltaMaxValue(ncclDataType_t type, bool is_reduction) {
|
||||
if (is_reduction) {
|
||||
switch(type) {
|
||||
#ifdef CUDA_HAS_HALF
|
||||
case ncclHalf: return 5e-2;
|
||||
#endif
|
||||
case ncclFloat: return 1e-5;
|
||||
case ncclDouble: return 1e-12;
|
||||
}
|
||||
}
|
||||
return 1e-200;
|
||||
}
|
||||
|
||||
ncclRedOp_t strToOp(const char* s) {
|
||||
if (strcmp(s, "sum") == 0)
|
||||
return ncclSum;
|
||||
if (strcmp(s, "prod") == 0)
|
||||
return ncclProd;
|
||||
if (strcmp(s, "max") == 0)
|
||||
return ncclMax;
|
||||
if (strcmp(s, "min") == 0)
|
||||
return ncclMin;
|
||||
|
||||
return nccl_NUM_OPS;
|
||||
}
|
||||
|
||||
int strToPosInt(const char* s) {
|
||||
errno = 0;
|
||||
long temp = strtol(s, NULL, 10);
|
||||
if (errno != 0 || temp > INT_MAX || temp < 0)
|
||||
return 0;
|
||||
return (int)temp;
|
||||
}
|
||||
|
||||
int strToNonNeg(const char* s) {
|
||||
errno = 0;
|
||||
long temp = strtol(s, NULL, 10);
|
||||
if (errno != 0 || temp > INT_MAX || temp < 0)
|
||||
return -1;
|
||||
return (int)temp;
|
||||
}
|
||||
|
||||
#endif // SRC_TEST_UTILITIES_H_
|
||||
|
@ -13,6 +13,9 @@
|
||||
#include "nccl.h"
|
||||
#include "test_utilities.h"
|
||||
|
||||
int errors = 0;
|
||||
double min_bw = 10000.0;
|
||||
bool is_reduction = false;
|
||||
|
||||
template<typename T>
|
||||
void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
@ -84,6 +87,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
|
||||
printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
|
||||
maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
}
|
||||
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
@ -197,7 +203,7 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
|
||||
RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
|
||||
#if CUDART_VERSION >= 7050
|
||||
#ifdef CUDA_HAS_HALF
|
||||
RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
|
||||
#endif
|
||||
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
|
||||
@ -211,6 +217,15 @@ int main(int argc, char* argv[]) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
free(comms);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
char* str = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_min_bw = str ? atof(str) : -1;
|
||||
|
||||
printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
|
||||
printf(" Min bus bandwidth : %g %s\n", min_bw, check_min_bw == -1 ? "" : (min_bw < check_min_bw ? "FAILED" : "OK"));
|
||||
printf("\n");
|
||||
if (errors || min_bw < check_min_bw)
|
||||
exit(EXIT_FAILURE);
|
||||
else
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
@ -15,6 +15,9 @@
|
||||
#include <nvToolsExt.h>
|
||||
|
||||
int csv = false;
|
||||
int errors = 0;
|
||||
double min_bw = 10000.0;
|
||||
bool is_reduction = true;
|
||||
|
||||
template<typename T>
|
||||
void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
@ -95,6 +98,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le",
|
||||
elapsedSec * 1.0E3, algbw, busbw, maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
|
||||
nvtxRangePop();
|
||||
}
|
||||
|
||||
@ -138,6 +144,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n",
|
||||
elapsedSec * 1.0E3, algbw, busbw, maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
|
||||
nvtxRangePop();
|
||||
}
|
||||
|
||||
@ -260,7 +269,7 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
|
||||
RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
|
||||
#if CUDART_VERSION >= 7050
|
||||
#ifdef CUDA_HAS_HALF
|
||||
RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
|
||||
#endif
|
||||
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
|
||||
@ -274,6 +283,15 @@ int main(int argc, char* argv[]) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
free(comms);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
char* str = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_min_bw = str ? atof(str) : -1;
|
||||
|
||||
printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
|
||||
printf(" Min bus bandwidth : %g %s\n", min_bw, check_min_bw == -1 ? "" : (min_bw < check_min_bw ? "FAILED" : "OK"));
|
||||
printf("\n");
|
||||
if (errors || min_bw < check_min_bw)
|
||||
exit(EXIT_FAILURE);
|
||||
else
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,9 @@
|
||||
#include "nccl.h"
|
||||
#include "test_utilities.h"
|
||||
|
||||
int errors = 0;
|
||||
double min_bw = 10000.0;
|
||||
bool is_reduction = false;
|
||||
|
||||
template<typename T>
|
||||
void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
|
||||
@ -86,6 +89,9 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
|
||||
|
||||
printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
|
||||
maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
}
|
||||
|
||||
for(int i=0; i < nDev; ++i) {
|
||||
@ -211,6 +217,15 @@ int main(int argc, char* argv[]) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
free(comms);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
char* str = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_min_bw = str ? atof(str) : -1;
|
||||
|
||||
printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
|
||||
printf(" Min bus bandwidth : %g %s\n", min_bw, check_min_bw == -1 ? "" : (min_bw < check_min_bw ? "FAILED" : "OK"));
|
||||
printf("\n");
|
||||
if (errors || min_bw < check_min_bw)
|
||||
exit(EXIT_FAILURE);
|
||||
else
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,9 @@
|
||||
#include "nccl.h"
|
||||
#include "test_utilities.h"
|
||||
|
||||
int errors = 0;
|
||||
double min_bw = 10000.0;
|
||||
bool is_reduction = true;
|
||||
|
||||
template<typename T>
|
||||
void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
@ -90,6 +93,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
|
||||
printf(" %7.3f %5.2f %5.2f %7.0le", elapsedSec * 1.0E3, algbw, busbw,
|
||||
maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
}
|
||||
|
||||
{
|
||||
@ -126,6 +132,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
|
||||
printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
|
||||
maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
}
|
||||
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
@ -258,6 +267,15 @@ int main(int argc, char* argv[]) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
free(comms);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
char* str = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_min_bw = str ? atof(str) : -1;
|
||||
|
||||
printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
|
||||
printf(" Min bus bandwidth : %g %s\n", min_bw, check_min_bw == -1 ? "" : (min_bw < check_min_bw ? "FAILED" : "OK"));
|
||||
printf("\n");
|
||||
if (errors || min_bw < check_min_bw)
|
||||
exit(EXIT_FAILURE);
|
||||
else
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
@ -15,6 +15,9 @@
|
||||
#include <nvToolsExt.h>
|
||||
|
||||
int csv = false;
|
||||
int errors = 0;
|
||||
double min_bw = 10000.0;
|
||||
bool is_reduction = true;
|
||||
|
||||
template<typename T>
|
||||
void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
@ -94,6 +97,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le",
|
||||
elapsedSec * 1.0E3, algbw, busbw, maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
|
||||
nvtxRangePop();
|
||||
}
|
||||
|
||||
@ -133,6 +139,9 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
|
||||
printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n",
|
||||
elapsedSec * 1.0E3, algbw, busbw, maxDelta);
|
||||
|
||||
if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
|
||||
if (busbw < min_bw) min_bw = busbw;
|
||||
|
||||
nvtxRangePop();
|
||||
}
|
||||
|
||||
@ -272,6 +281,15 @@ int main(int argc, char* argv[]) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
free(comms);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
char* str = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_min_bw = str ? atof(str) : -1;
|
||||
|
||||
printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
|
||||
printf(" Min bus bandwidth : %g %s\n", min_bw, check_min_bw == -1 ? "" : (min_bw < check_min_bw ? "FAILED" : "OK"));
|
||||
printf("\n");
|
||||
if (errors || min_bw < check_min_bw)
|
||||
exit(EXIT_FAILURE);
|
||||
else
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user