#include "benchmark_tests.h"
#include "benchmarks.h"

// NOTE(review): in the reviewed copy of this file every "<...>" span was
// unreadable (system header names and template arguments).  They are restored
// here from usage: element type double, size checks via size_t, timings in
// microseconds.  Confirm against the declarations in benchmark_tests.h.
#include <chrono>
#include <cmath>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

using namespace std::chrono;

namespace {

// 2^30; all sizes and rates in this file are reported relative to this factor.
constexpr double kGiB = 1024.0 * 1024.0 * 1024.0;

// Row whose entries define the input vector/matrix in tests (B) and (C) and
// whose result entry is verified afterwards.
constexpr std::size_t kCheckRow = 17;

// Monotonic clock: unlike system_clock it is never adjusted while a benchmark
// is running, so differences of time points are reliable intervals.
using BenchClock = steady_clock;

// Returns the average duration of one loop iteration in seconds, measured with
// microsecond resolution between @p start and @p stop.
double seconds_per_loop(const BenchClock::time_point &start,
                        const BenchClock::time_point &stop,
                        const std::size_t nloops)
{
    const auto elapsed = duration_cast<microseconds>(stop - start);
    const double total_seconds = static_cast<double>(elapsed.count()) / 1e6;
    return total_seconds / nloops;
}

// Prints the common timing/performance report (blank line, then four values
// with two significant digits).  Leaves the stream precision at 2.
void print_performance(const double t_diff, const std::size_t nloops,
                       const double gflops, const double mem_bandwidth)
{
    std::cout << std::endl;
    std::cout.precision(2);
    std::cout << "Total duration : " << t_diff*nloops << std::endl;
    std::cout << "Timing in sec. : " << t_diff << std::endl;
    std::cout << "GFLOPS : " << gflops << std::endl;
    std::cout << "GiByte/s : " << mem_bandwidth << std::endl;
}

} // namespace

/// Benchmark (A): inner product of two vectors of length N, followed by a
/// timing of sqrt(scalar(x, x)).
///
/// The data is chosen as x_i = i % 219 + 1 and y_i = 1/x_i, so the expected
/// inner product is N.
///
/// @param NLOOPS           number of repetitions of the timed call
/// @param N                vector length
/// @param scalar_function  inner product implementation under test
/// @return {seconds per loop, GFLOPS, GiByte/s} of the inner product part;
///         both rates are scaled by 2^30 (not 10^9).
std::vector<double> test_A(const std::size_t &NLOOPS, const std::size_t &N,
                           const std::function<double(const std::vector<double>&, const std::vector<double>&)>& scalar_function)
{
    std::cout << "#################### (A) ####################" << std::endl;
    std::cout << "\nLOOPS = " << NLOOPS << std::endl;
    std::cout << "\nN = " << N << std::endl;

    // Memory allocation
    std::cout << "Memory allocation\n";
    std::vector<double> x(N), y(N);
    std::cout.precision(2);
    std::cout << 2.0*N *sizeof(x[0]) / kGiB << " GByte Memory allocated\n";
    std::cout.precision(6);

    // Data initialization
    // Special: x_i = i % 219 + 1;  y_i = 1/x_i  ==>  inner product == N
    for (std::size_t i = 0; i < N; ++i) {
        x[i] = i % 219 + 1;
        y[i] = 1.0/x[i];
    }

    std::cout << "\nStart Benchmarking scalar\n";
    double check = 0.0;
    double ss = 0.0;
    const auto t1 = BenchClock::now();                  // start timer
    for (std::size_t i = 0; i < NLOOPS; ++i) {
        check = scalar_function(x, y);
        ss += check;            // keeps every call's result in use
    }
    const auto t2 = BenchClock::now();                  // stop timer
    const double t_diff = seconds_per_loop(t1, t2, NLOOPS);

    // Check the correct result (value is truncated to an integer first)
    std::cout << "\n = " << check << std::endl;
    if (static_cast<std::size_t>(check) != N)
        std::cout << " !! W R O N G result !!\n";
    std::cout << std::endl;

    // Timings and Performance: 2 flops and 2 loaded values per vector entry
    const double Gflops = 2.0*N / t_diff / kGiB;
    const double MemBandwidth = 2.0*N / t_diff / kGiB * sizeof(x[0]);
    print_performance(t_diff, NLOOPS, Gflops, MemBandwidth);

    //##########################################################################
    std::cout << "\nStart Benchmarking norm\n";
    double ss2 = 0.0;
    const auto t3 = BenchClock::now();                  // start timer
    for (std::size_t i = 0; i < NLOOPS; ++i) {
        const auto sk1 = std::sqrt(scalar(x, x));
        ss2 += sk1;             // keeps every call's result in use
    }
    const auto t4 = BenchClock::now();                  // stop timer
    const double t_diff2 = seconds_per_loop(t3, t4, NLOOPS);

    // Stream precision is still 2 here (set by the performance report).
    std::cout << "ss(norm): " << ss2 << std::endl;
    std::cout << "Timing in sec. : " << t_diff2 << std::endl;

    return std::vector<double>{t_diff, Gflops, MemBandwidth};
}

/// Benchmark (B): matrix-vector product with an M x N matrix stored row-wise.
///
/// A(i,j) = (i + j) % 219 + 1 and x_j = 1/A(17,j), so entry 17 of the result
/// is expected to be N.
///
/// Precondition: M > 17, because row 17 of A is read during initialization
/// and entry 17 of the result is read for the check.
///
/// @param NLOOPS           number of repetitions of the timed call
/// @param N                number of matrix columns / length of x
/// @param M                number of matrix rows
/// @param MatVec_function  matrix-vector implementation under test
/// @return {seconds per loop, GFLOPS, GiByte/s}; both rates are scaled by
///         2^30 (not 10^9).
std::vector<double> test_B(const std::size_t &NLOOPS, const std::size_t &N, const std::size_t &M,
                           const std::function<std::vector<double>(const std::vector<double>&, const std::vector<double>&)>& MatVec_function)
{
    std::cout << "#################### (B) ####################" << std::endl;
    std::cout << "\nLOOPS = " << NLOOPS << std::endl;
    std::cout << "\nN = " << N << std::endl;
    std::cout << "\nM = " << M << std::endl;

    // Memory allocation
    std::cout << "Memory allocation\n";
    std::vector<double> A(M*N);
    std::vector<double> x(N);
    std::cout.precision(2);
    std::cout << (1.0*M*N + N) * sizeof(x[0]) / kGiB << " GByte Memory allocated\n";
    std::cout.precision(6);

    // Data initialization
    for (std::size_t i = 0; i < M; ++i)
        for (std::size_t j = 0; j < N; ++j)
            A[N*i + j] = (i + j) % 219 + 1;

    for (std::size_t j = 0; j < N; ++j) {
        x[j] = 1.0/A[N*kCheckRow + j];
    }

    std::cout << "\nStart Benchmarking MatVec\n";
    std::vector<double> b(M);       // allocated before the timer starts
    const auto t1 = BenchClock::now();                  // start timer
    for (std::size_t i = 0; i < NLOOPS; ++i) {
        b = MatVec_function(A, x);
    }
    const auto t2 = BenchClock::now();                  // stop timer
    const double t_diff = seconds_per_loop(t1, t2, NLOOPS);

    // Check the correct result (value is truncated to an integer first)
    std::cout << "\n = " << b[kCheckRow] << std::endl;
    if (static_cast<std::size_t>(b[kCheckRow]) != N) {
        std::cout << " !! W R O N G result !!\n";
    }
    std::cout << std::endl;

    // Timings and Performance
    const double Gflops = (2.0*N*M) / t_diff / kGiB;
    const double MemBandwidth = (2.0*N*M + M)/ t_diff / kGiB * sizeof(x[0]);
    print_performance(t_diff, NLOOPS, Gflops, MemBandwidth);

    return std::vector<double>{t_diff, Gflops, MemBandwidth};
}

/// Benchmark (C): matrix-matrix product of an M x L matrix A with an L x N
/// matrix B, both stored row-wise.
///
/// A(i,k) = (i + k) % 219 + 1 and B(k,j) = 1/A(17,k), so entry (17,0) of the
/// product is expected to be L.
///
/// Precondition: M > 17, because row 17 of A is read during initialization
/// and element N*17 of the result is read for the check.
///
/// @param NLOOPS           number of repetitions of the timed call
/// @param L                shared (inner) dimension
/// @param M                number of rows of A
/// @param N                number of columns of B
/// @param MatMat_function  matrix-matrix implementation under test; receives
///                         A, B and the shared dimension L
/// @return {seconds per loop, GFLOPS, GiByte/s}; both rates are scaled by
///         2^30 (not 10^9).
std::vector<double> test_C(const std::size_t &NLOOPS, const std::size_t &L, const std::size_t &M, const std::size_t &N,
                           const std::function<std::vector<double>(const std::vector<double>&, const std::vector<double>&, std::size_t const &shared_dim)>& MatMat_function)
{
    std::cout << "#################### (C) ####################" << std::endl;
    std::cout << "\nLOOPS = " << NLOOPS << std::endl;
    std::cout << "\nL = " << L << std::endl;
    std::cout << "\nM = " << M << std::endl;
    std::cout << "\nN = " << N << std::endl;

    // Memory allocation
    std::cout << "Memory allocation\n";
    std::vector<double> A(M*L);
    std::vector<double> B(L*N);
    std::cout.precision(2);
    std::cout << (1.0*M*L + L*N) *sizeof(A[0]) / kGiB << " GByte Memory allocated\n";
    std::cout.precision(6);

    // Data initialization
    for (std::size_t i = 0; i < M; ++i)
        for (std::size_t k = 0; k < L; ++k)
            A[L*i + k] = (i + k) % 219 + 1;

    for (std::size_t k = 0; k < L; ++k)
        for (std::size_t j = 0; j < N; ++j)
            B[N*k + j] = 1.0/A[L*kCheckRow + k];

    std::cout << "\nStart Benchmarking MatMat\n";
    std::vector<double> C(M*N);     // allocated before the timer starts
    double check = 0.0;             // was uninitialized: read below even if NLOOPS == 0
    double check_sum = 0.0;         // was uninitialized: accumulating into it was UB
    const auto t1 = BenchClock::now();                  // start timer
    for (std::size_t i = 0; i < NLOOPS; ++i) {
        C = MatMat_function(A, B, L);
        check = C[N*kCheckRow];
        check_sum += check;     // keeps every call's result in use
    }
    const auto t2 = BenchClock::now();                  // stop timer
    const double t_diff = seconds_per_loop(t1, t2, NLOOPS);

    // Printed only after the timer has stopped so console I/O is not measured.
    std::cout << check_sum;

    // Check the correct result (value is truncated to an integer first)
    std::cout << "\n C[17,0] = " << check << std::endl;
    if (static_cast<std::size_t>(check) != L) {
        std::cout << " !! W R O N G result !!, should be " << L <<"\n";
    }
    std::cout << std::endl;

    // Timings and Performance
    const double Gflops = (2.0*L*N*M) / t_diff / kGiB;
    const double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / kGiB * sizeof(A[0]);
    print_performance(t_diff, NLOOPS, Gflops, MemBandwidth);

    return std::vector<double>{t_diff, Gflops, MemBandwidth};
}

/// Benchmark (D): evaluation of a polynomial of degree p at N points via
/// poly(a, x).
///
/// Coefficients alternate in sign, poly(x) = 1 - x + x^2 - x^3 + ..., and
/// x_j = j, so the value at x_0 = 0 is expected to be 1.
///
/// Precondition: N >= 1, because x[0] and the first result entry are read
/// for the check.
///
/// @param NLOOPS  number of repetitions of the timed call
/// @param N       number of evaluation points
/// @param p       polynomial degree (p + 1 coefficients)
/// @return {seconds per loop, GFLOPS, GiByte/s}; both rates are scaled by
///         2^30 (not 10^9).
std::vector<double> test_D(const std::size_t &NLOOPS, const std::size_t &N, const std::size_t &p)
{
    std::cout << "#################### (D) ####################" << std::endl;
    std::cout << "\nLOOPS = " << NLOOPS << std::endl;
    std::cout << "\nN = " << N << std::endl;
    std::cout << "\np = " << p << std::endl;

    // Memory allocation
    std::cout << "Memory allocation\n";
    std::vector<double> a(p + 1, 0.0);
    std::vector<double> x(N);
    std::cout.precision(2);
    std::cout << (1.0*(p + 1) + N) *sizeof(x[0]) / kGiB << " GByte Memory allocated\n";
    std::cout.precision(6);

    // Data initialization
    for (std::size_t j = 0; j < N; ++j)
        x[j] = 1.0*j;

    for (std::size_t k = 0; k < p + 1; ++k)
        a[k] = (k % 2 == 0) ? 1.0 : -1.0;   // (-1)^k: poly(x) = 1 - x + x^2 - x^3 + x^4 - ...

    std::cout << "\nStart Benchmarking poly\n";
    std::vector<double> y(N);       // allocated before the timer starts
    double check = 0.0;             // was uninitialized: read below even if NLOOPS == 0
    double check_sum = 0.0;         // was uninitialized: accumulating into it was UB
    const auto t1 = BenchClock::now();                  // start timer
    for (std::size_t i = 0; i < NLOOPS; ++i) {
        y = poly(a, x);
        check = y[0];
        check_sum += check;     // keeps every call's result in use
    }
    const auto t2 = BenchClock::now();                  // stop timer
    const double t_diff = seconds_per_loop(t1, t2, NLOOPS);

    // Check the correct result
    std::cout << "\n poly(" << x[0] << ") = " << check << std::endl;
    if (std::abs(check - 1.0) > 1.0/1e6) {
        std::cout << " !! W R O N G result !!\n";
    }
    std::cout << std::endl;

    // Timings and Performance
    const double Gflops = (N*(p + 1)*3.0) / t_diff / kGiB;
    const double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / kGiB * sizeof(x[0]);
    print_performance(t_diff, NLOOPS, Gflops, MemBandwidth);

    return std::vector<double>{t_diff, Gflops, MemBandwidth};
}