diff --git a/BSP_3_2to7/Makefile b/BSP_3_2to7/Makefile new file mode 100644 index 0000000..c2a6a5e --- /dev/null +++ b/BSP_3_2to7/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp bsp_3_lib_bench.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/BSP_3_2to7/bsp_3_lib_bench.cpp b/BSP_3_2to7/bsp_3_lib_bench.cpp new file mode 100644 index 0000000..f3bff9a --- /dev/null +++ b/BSP_3_2to7/bsp_3_lib_bench.cpp @@ -0,0 +1,796 @@ +#include "bsp_3_lib_bench.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace std::chrono; // timing + +double scalar(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + double sum = 0.0; + for (size_t i = 0; i < N; ++i) + { + sum += x[i] * y[i]; + //sum += exp(x[i])*log(y[i]); + } + return sum; +} + + +double scalar_cblas(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + + double sum = cblas_ddot(N, x.data(), 1, y.data(), 1); + + return sum; +} + + +double scalar_kahan(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + double sum = 0.0; + double c = 0.0; + for (size_t i = 0; i < N; ++i) + { + double yk = x[i] * y[i] - c; + double t = sum + yk; + c = t - sum - yk; + sum = t; + //sum += exp(x[i])*log(y[i]); + } + return sum; +} + + +double norm_eucl(std::vector const &x) +{ + size_t const N = x.size(); + double sum = 0.0; + for (size_t i = 0; i < N; ++i) + { + sum += x[i]*x[i]; + //sum += exp(x[i])*log(y[i]); + } + sum = sqrt(sum); + return sum; +} + + +vector MatVec(vector const & a, vector const & x) // row wise access +{ + int const nelem = static_cast(a.size()); // #elements in matrix + int const mcols = static_cast(x.size()); // #elements in vector <==> #columns in matrix + + assert(nelem % mcols == 0); // nelem has to be a multiple of mcols (==> #rows) + int const nrows = nelem/mcols; // integer division! + + vector b(nrows); // allocate resulting vector + + for(size_t i = 0; i < nrows; ++i) + { + double tmp = 0.0; + for(size_t j = 0; j < mcols; ++j) + { + tmp = tmp + a[i*mcols+j] * x[j]; + } + b[i] = tmp; + } + + return b; +} + + +vector MatVec_cblas(vector const & a, vector const & x) // row wise access +{ + int const nelem = static_cast(a.size()); // #elements in matrix + int const mcols = static_cast(x.size()); // #elements in vector <==> #columns in matrix + + assert(nelem % mcols == 0); // nelem has to be a multiple of mcols (==> #rows) + int const nrows = nelem/mcols; // integer division! + + vector b(nrows); // allocate resulting vector + + double const alpha = 1.0; + double const beta = 0.0; + + cblas_dgemv(CblasRowMajor, CblasTrans, + nrows, mcols, + alpha, a.data(), nrows, + x.data(), 1, + beta, b.data(), 1); + + return b; +} + + +vector MatVec_column(vector const & a, vector const & x) // column wise access +{ + int const nelem = static_cast(a.size()); // #elements in matrix + int const mcols = static_cast(x.size()); // #elements in vector <==> #columns in matrix + + assert(nelem % mcols == 0); // nelem has to be a multiple of mcols (==> #rows) + int const nrows = nelem/mcols; // integer division! + + vector b(nrows); // allocate resulting vector + + // if we do it directly we have cache issues - not optimal + // to make the code more efficient we change the two loops and put the b[i] inside the inner loop + // b is not so large compared to a, so higher amount of writing operations to not matter that much + for(size_t j = 0; j < mcols; ++j) + { + double xj = x[j]; + for(size_t i = 0; i < nrows; ++i) + { + b[i] += a[j*nrows+i] * xj; + } + } + + return b; +} + + +vector MatMatProd(vector const & a, vector const & b, int const & L) +{ + size_t const a_nelem = a.size(); + size_t const b_nelem = b.size(); + + assert(static_cast(a_nelem) % L == 0 && static_cast(b_nelem) % L == 0); + + size_t M = a_nelem/L; + size_t N = b_nelem/L; + + vector c(N*M,0); + + for(size_t i = 0; i < M; ++i) + { + for(size_t k = 0; k < L; ++k) + { + for(size_t j = 0; j < N; ++j) + { + c[i*M+j] = c[i*M+j] + a[i*L+k]*b[k*N+j]; + } + } + } + + return c; +} + + +vector MatMatProd_cblas(vector const & a, vector const & b, int const & L) +{ + size_t const a_nelem = a.size(); + size_t const b_nelem = b.size(); + + assert(static_cast(a_nelem) % L == 0 && static_cast(b_nelem) % L == 0); + + size_t M = a_nelem/L; + size_t N = b_nelem/L; + + vector c(N*M,0); + + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + M, N, L, + 1.0, a.data(), L, b.data(), N, + 0.0, c.data(), N); + + return c; +} + + +vector PolynomEval(vector const & a, vector const & x) +{ + // we want to use the Horner-scheme + vector sol(x.size(),0); + + for(size_t i = 0; i < x.size(); ++i) + { + double tmp = a[a.size()-1]; + for(int k = static_cast(a.size())-2; k >= 0; --k) + { + tmp = tmp*x[i] + a[k]; + } + sol[i] = tmp; + } + + return sol; +} + + +void benchmark_A(int const & N, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking A: scalar product\n"; + + vector x(N), y(N); + for(size_t k = 0; k < x.size(); ++k) + { + x[k] = (k % 219) + 1; + y[k] = 1.0/x[k]; + } + + auto t1 = system_clock::now(); // start timer +// Do calculation + double sk(0.0), ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + sk = scalar(x, y); + ss += sk; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n = " << sk << endl; + if (static_cast(sk) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_A_cblas(int const & N, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking A: scalar product with cblas\n"; + + vector x(N), y(N); + for(size_t k = 0; k < x.size(); ++k) + { + x[k] = (k % 219) + 1; + y[k] = 1.0/x[k]; + } + + auto t1 = system_clock::now(); // start timer +// Do calculation + double sk(0.0), ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + sk = scalar_cblas(x, y); + ss += sk; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n = " << sk << endl; + if (static_cast(sk) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_A_kahan(int const & N, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking A: scalar product with Kahan summation\n"; + + vector x(N), y(N); + for(size_t k = 0; k < x.size(); ++k) + { + x[k] = (k % 219) + 1; + y[k] = 1.0/x[k]; + } + + auto t1 = system_clock::now(); // start timer +// Do calculation + double sk(0.0), ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + sk = scalar(x, y); + ss += sk; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n = " << sk << endl; + if (static_cast(sk) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + //cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl; + //cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_A_norm(int const & N, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking A_norm: euclidean norm\n"; + + vector x(N,1.0); + + auto t1 = system_clock::now(); // start timer +// Do calculation + double sk(0.0), ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + sk = norm_eucl(x); + ss += sk; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n ||x|| = " << sk << endl; + if (sk - sqrt(N) > 1e-7) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_B(int const & N, int const & M, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking B: Matrix-Vector Product (row wise access)\n"; + + vector x(N), b(M), a(N*M); + // initialize data + for(size_t i = 0; i < M; ++i) + { + for(size_t j = 0; j < N; ++j) + { + a[i*N+j] = (i+j) % 219 + 1; + } + } + for(size_t i = 0; i < N; ++i) + { + x[i] = 1.0/a[17*N+i]; + } + + auto t1 = system_clock::now(); // start timer +// Do calculation + double ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + b = MatVec(a,x); + ss += b[0]; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n = " << b[17] << endl; + if (static_cast(b[17]) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << "\t M = " << M << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_B_cblas(int const & N, int const & M, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking B: Matrix-Vector Product with cblas (row wise access)\n"; + + vector x(N), b(M), a(N*M); + // initialize data + for(size_t i = 0; i < M; ++i) + { + for(size_t j = 0; j < N; ++j) + { + a[i*N+j] = (i+j) % 219 + 1; + } + } + for(size_t i = 0; i < N; ++i) + { + x[i] = 1.0/a[17*N+i]; + } + + auto t1 = system_clock::now(); // start timer +// Do calculation + double ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + b = MatVec_cblas(a,x); + ss += b[0]; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n = " << b[17] << endl; + if (static_cast(b[17]) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << "\t M = " << M << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_B_column(int const & N, int const & M, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking B: Matrix-Vector Product (column wise access)\n"; + + vector x(N), b(M), a(N*M); + // initialize data + for(size_t i = 0; i < M; ++i) + { + for(size_t j = 0; j < N; ++j) + { + a[i*N+j] = (i+j) % 219 + 1; + } + } + for(size_t i = 0; i < N; ++i) + { + x[i] = 1.0/a[17*N+i]; + } + + auto t1 = system_clock::now(); // start timer +// Do calculation + double ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + b = MatVec_column(a,x); + ss += b[0]; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n = " << b[17] << endl; + if (static_cast(b[17]) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << "\t M = " << M << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking C: Matrix-Matrix Product\n"; + + vector a(M*L,1.0), b(L*N,1.0), c(N*M); + // with this data we get C[i,j] = L for all i and j + + auto t1 = system_clock::now(); // start timer +// Do calculation + double ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + c = MatMatProd(a,b,L); + ss += c[0]; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n C[10,15] = " << c[10*N+15] << endl; + if (static_cast(c[10*N+15]) != L) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_C_cblas(int const & N, int const & M, int const & L, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking C: Matrix-Matrix Product mit cblas\n"; + + vector a(M*L,1.0), b(L*N,1.0), c(N*M); + // with this data we get C[i,j] = L for all i and j + + auto t1 = system_clock::now(); // start timer +// Do calculation + double ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + c = MatMatProd_cblas(a,b,L); + ss += c[0]; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n C[10,15] = " << c[10*N+15] << endl; + if (static_cast(c[10*N+15]) != L) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl; + cout << endl << endl; + + return; +} + + +void benchmark_D(int const & p, int const & N, int const & Nloops) +{ + //########################################################################## + cout << "\nStart Benchmarking D: polynomial evaluation\n"; + + vector x(N,1), sol(N), a(p+1); + for(size_t i = 0; i < a.size(); ++i) + { + a[i] = pow(-1.0,i); // 1-x+x^2-x^3+x^4... + } + a[0] = 1; + + auto t1 = system_clock::now(); // start timer +// Do calculation + double ss(0.0); + for (int i = 0; i < Nloops; ++i) + { + sol = PolynomEval(a,x); + ss += sol[0]; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/Nloops; // duration per loop seconds + + //assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "==" + +//########################################################################## +// Check the correct result + cout << "\n p(x[0]) = " << sol[0] << endl; + if (static_cast(sol[0]) != (static_cast(a.size()) % 2)) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "p = " << p << "\t N = " << N << endl; + cout << "Time for Nloops: " << t_diff*Nloops << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << 2.0*(p+1)*N / t_diff / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << N*(3+2*p) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl; + cout << endl << endl; + + return; +} + + +void solver(int const & N, int const & nrhs) +{ + // generating the system matrix a + vector a(N*N); + for(int i=0; i a_old = a; + + // generating rhs + vector rhs(N*nrhs); + for(int i=0; i b_old = rhs; + + vector ipiv(N); + // factorization + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, a.data(), N, ipiv.data()); + + auto t1 = system_clock::now(); // start timer + // Calculation of rhs -> b gets overwritten and contains the solution + LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, nrhs, a.data(), N, ipiv.data(), rhs.data(), nrhs); + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + cout.precision(2); + cout << "Timing in sec. : " << t_diff << " for nrhs= " << nrhs << endl; + + // Check for correct result + vector ax = MatMatProd_cblas(a_old, rhs, N); + vector diff(N*nrhs); + for(int i=0; i 1e-4) + { + cout << " !! W R O N G result !!\n"; + } + + return; +} diff --git a/BSP_3_2to7/bsp_3_lib_bench.h b/BSP_3_2to7/bsp_3_lib_bench.h new file mode 100644 index 0000000..598ddf9 --- /dev/null +++ b/BSP_3_2to7/bsp_3_lib_bench.h @@ -0,0 +1,205 @@ +#ifndef BSP_3_LIB_BENCH_H_INCLUDED +#define BSP_3_LIB_BENCH_H_INCLUDED + +#include + +/** Inner product + @param[in] x vector + @param[in] y vector + @return resulting Euclidean inner product +*/ +double scalar(std::vector const &x, std::vector const &y); + + +/** Inner product with cblas + @param[in] x vector + @param[in] y vector + @return resulting Euclidean inner product +*/ +double scalar_cblas(std::vector const &x, std::vector const &y); + + +/** Inner product with Kahan summation + @param[in] x vector + @param[in] y vector + @return resulting Euclidean inner product +*/ +double scalar_kahan(std::vector const &x, std::vector const &y); + + +/** euclidean norm + @param[in] x vector + @return resulting Euclidean norm +*/ +double norm_eucl(std::vector const &x); + + +/** \brief Matrix-Vektor-Multiplikation (row-wise access) + * + * \param[in] a Matrix with row wise access + * \param[in] x vector which gets multiplied + * \return resulting product a*x (vector) + * + */ +std::vector MatVec(std::vector const & a, std::vector const & x); + + +/** \brief Matrix-Vektor-Multiplikation mit cblas (row-wise access) + * + * \param[in] a Matrix with row wise access + * \param[in] x vector which gets multiplied + * \return resulting product a*x (vector) + * + */ +std::vector MatVec_cblas(std::vector const & a, std::vector const & x); + + +/** \brief Matrix-Vektor-Multiplikation (column-wise access) + * + * \param[in] a Matrix with row wise access + * \param[in] x vector which gets multiplied + * \return resulting product a*x (vector) + * + */ +std::vector MatVec_column(std::vector const & a, std::vector const & x); + + +/** \brief Matrix-Matrix-Multiplikation (row-wise access) + * + * \param[in] a matrix with row wise access (M*L) + * \param[in] b matrix with row wise access (L*N) + * \param[in] L inner dimension of the matrix product + * \return resulting product a*b + * + */ +std::vector MatMatProd(std::vector const & a, std::vector const & b, int const & L); + + +/** \brief Matrix-Matrix-Multiplikation mit cblas (row-wise access) + * + * \param[in] a matrix with row wise access (M*L) + * \param[in] b matrix with row wise access (L*N) + * \param[in] L inner dimension of the matrix product + * \return resulting product a*b + * + */ +std::vector MatMatProd_cblas(std::vector const & a, std::vector const & b, int const & L); + + +/** \brief Polynomauswertung an Stelle x + * + * \param[in] a Vekor mit den Koeffizienten des Polynoms a=[a0,a1,a2,...] + * \param[in] x Vektor, für welchen das Polynom ausgewertet werden soll + * \return resulting vector p(x) + * + */ +std::vector PolynomEval(std::vector const & a, std::vector const & x); + + +/** \brief Benchmarking A - the scalar product + * + * \param N size of the vector + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_A(int const & N, int const & Nloops); + + +/** \brief Benchmarking A - the scalar product with cblas + * + * \param N size of the vector + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_A_cblas(int const & N, int const & Nloops); + + + +/** \brief Benchmarking A - the scalar product with Kahan summation + * + * \param N size of the vector + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_A_kahan(int const & N, int const & Nloops); + + +/** \brief Benchmarking A - norm + * + * \param N size of the vector + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_A_norm(int const & N, int const & Nloops); + + +/** \brief Benchmarking B - matrix-vector product Ax=b (row wise access) + * + * \param N size of vector x + * \param M size of vector b (=> A: M*N) + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_B(int const & N, int const & M, int const & Nloops); + + +/** \brief Benchmarking B - matrix-vector product Ax=b with cblas (row wise access) + * + * \param N size of vector x + * \param M size of vector b (=> A: M*N) + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_B_cblas(int const & N, int const & M, int const & Nloops); + + +/** \brief Benchmarking B - matrix-vector product Ax=b (column wise access) + * + * \param N size of vector x + * \param M size of vector b (=> A: M*N) + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_B_column(int const & N, int const & M, int const & Nloops); + + +/** \brief Benchmarking C - Matrix-Matrix product C=A*B A_M*L, B_L*N + * + * \param N + * \param M + * \param L + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops); + + +/** \brief Benchmarking C - Matrix-Matrix product with cblas; C=A*B A_M*L, B_L*N + * + * \param N + * \param M + * \param L + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_C_cblas(int const & N, int const & M, int const & L, int const & Nloops); + + +/** \brief Benchmarking D - polynomial evaluation + * + * \param p the degree of the polynomial + * \param N size of the input vector x where p(x) + * \param Nloops number of iterations we want to do for the measuring + * + */ +void benchmark_D(int const & p, int const & N, int const & Nloops); + + +/** \brief solving system of linear equations with time measurement + * + * \param N size of the system Ax=b with A(nxn) + * \param nrhs number of right hand sides b(nxnrhs) + * + */ +void solver(int const & N, int const & nrhs); +#endif // BSP_3_LIB_BENCH_H_INCLUDED diff --git a/BSP_3_2to7/main.cpp b/BSP_3_2to7/main.cpp new file mode 100644 index 0000000..4cc4670 --- /dev/null +++ b/BSP_3_2to7/main.cpp @@ -0,0 +1,29 @@ +#include "bsp_3_lib_bench.h" +#include + +using namespace std; + +int main() +{ + benchmark_A(25*1e7,50); + benchmark_A_cblas(25*1e7,50); + benchmark_A_kahan(25*1e7,50); + benchmark_A_norm(25*1e7,50); + benchmark_B(8000,8000,400); + benchmark_B_column(8000,8000,400); + benchmark_B_cblas(8000,8000,400); + benchmark_C(4000,4000,4000,1); + benchmark_C_cblas(4000,4000,4000,10); + benchmark_D(1e4,1e5,15); + + cout << "Solving system of linear equations\n"; + vector nrhsvec{1,2,4,8,16,32}; + for(size_t i=0; i = 2.5e+08 + + +N = 250000000 +Time for Nloops: 11 +Timing in sec. : 0.21 +GFLOPS : 2.2 +GiByte/s : 17 + + + +Start Benchmarking A: scalar product with cblas + + = 2.5e+08 + + +N = 250000000 +Time for Nloops: 5.7 +Timing in sec. : 0.11 +GFLOPS : 4.1 +GiByte/s : 33 + + + +Start Benchmarking A: scalar product with Kahan summation + + = 2.5e+08 + + +N = 250000000 +Time for Nloops: 11 +Timing in sec. : 0.22 + + + +Start Benchmarking A_norm: euclidean norm + + ||x|| = 1.6e+04 + + +N = 250000000 +Time for Nloops: 6.1 +Timing in sec. : 0.12 +GFLOPS : 3.8 +GiByte/s : 15 + + + +Start Benchmarking B: Matrix-Vector Product (row wise access) + + = 8e+03 + + +N = 8000 M = 8000 +Time for Nloops: 10 +Timing in sec. : 0.026 +GFLOPS : 4.7 +GiByte/s : 37 + + + +Start Benchmarking B: Matrix-Vector Product (column wise access) + + = 8e+03 + + +N = 8000 M = 8000 +Time for Nloops: 13 +Timing in sec. : 0.032 +GFLOPS : 3.7 +GiByte/s : 30 + + + +Start Benchmarking B: Matrix-Vector Product with cblas (row wise access) + + = 8e+03 + + +N = 8000 M = 8000 +Time for Nloops: 13 +Timing in sec. : 0.032 +GFLOPS : 3.8 +GiByte/s : 30 + + + +Start Benchmarking C: Matrix-Matrix Product + + C[10,15] = 4e+03 + + +N = 4000 M = 4000 L = 4000 +Time for Nloops: 29 +Timing in sec. : 29 +GFLOPS : 4 +GiByte/s : 0.012 + + + +Start Benchmarking C: Matrix-Matrix Product mit cblas + + C[10,15] = 4e+03 + + +N = 4000 M = 4000 L = 4000 +Time for Nloops: 6.2 +Timing in sec. : 0.62 +GFLOPS : 1.9e+02 +GiByte/s : 0.58 + + + +Start Benchmarking D: polynomial evaluation + + p(x[0]) = 1 + + +p = 10000 N = 100000 +Time for Nloops: 17 +Timing in sec. : 1.2 +GFLOPS : 1.6 +GiByte/s : 13 + + +Solving system of linear equations +Timing in sec. : 0.4 for nrhs= 1 +Timing in sec. : 0.4 for nrhs= 2 +Timing in sec. : 0.43 for nrhs= 4 +Timing in sec. : 0.47 for nrhs= 8 +Timing in sec. : 0.54 for nrhs= 16 +Timing in sec. : 0.48 for nrhs= 32 \ No newline at end of file