now with 6 and 7

2025-12-04 19:04:13 +01:00 · 2025-12-04 19:04:13 +01:00 · 7836d85c80
commit 7836d85c80
parent 8790b0f1b2
5 changed files with 1194 additions and 0 deletions
--- a/BSP_3_2to7/Makefile
+++ b/BSP_3_2to7/Makefile
@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp bsp_3_lib_bench.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
--- a/BSP_3_2to7/bsp_3_lib_bench.cpp
+++ b/BSP_3_2to7/bsp_3_lib_bench.cpp
@ -0,0 +1,796 @@
+#include "bsp_3_lib_bench.h"
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <ctime>
+#include <cblas.h>
+#include <lapacke.h>
+
+using namespace std;
+using namespace std::chrono;  // timing
+
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+        //sum += exp(x[i])*log(y[i]);
+    }
+    return sum;
+}
+
+
+double scalar_cblas(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+
+    double sum = cblas_ddot(N, x.data(), 1, y.data(), 1);
+
+    return sum;
+}
+
+
+double scalar_kahan(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+    double c = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        double yk = x[i] * y[i] - c;
+        double t = sum + yk;
+        c = t - sum - yk;
+        sum = t;
+        //sum += exp(x[i])*log(y[i]);
+    }
+    return sum;
+}
+
+
+double norm_eucl(std::vector<double> const &x)
+{
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i]*x[i];
+        //sum += exp(x[i])*log(y[i]);
+    }
+    sum = sqrt(sum);
+    return sum;
+}
+
+
+vector<double> MatVec(vector<double> const & a, vector<double> const & x)   // row wise access
+{
+    int const nelem = static_cast<int>(a.size());      // #elements in matrix
+    int const mcols = static_cast<int>(x.size());      // #elements in vector <==> #columns in matrix
+
+    assert(nelem % mcols == 0);                        // nelem has to be a multiple of mcols (==> #rows)
+    int const nrows = nelem/mcols;                     // integer division!
+
+    vector<double> b(nrows);                 // allocate resulting vector
+
+    for(size_t i = 0; i < nrows; ++i)
+    {
+        double tmp = 0.0;
+        for(size_t j = 0; j < mcols; ++j)
+        {
+            tmp = tmp + a[i*mcols+j] * x[j];
+        }
+        b[i] = tmp;
+    }
+
+    return b;
+}
+
+
+vector<double> MatVec_cblas(vector<double> const & a, vector<double> const & x)   // row wise access
+{
+    int const nelem = static_cast<int>(a.size());      // #elements in matrix
+    int const mcols = static_cast<int>(x.size());      // #elements in vector <==> #columns in matrix
+
+    assert(nelem % mcols == 0);                        // nelem has to be a multiple of mcols (==> #rows)
+    int const nrows = nelem/mcols;                     // integer division!
+
+    vector<double> b(nrows);                 // allocate resulting vector
+
+	double const alpha = 1.0;
+	double const beta = 0.0;
+
+    cblas_dgemv(CblasRowMajor, CblasTrans,
+                nrows, mcols,
+                alpha, a.data(), nrows,
+                x.data(), 1,
+                beta, b.data(), 1);
+
+    return b;
+}
+
+
+vector<double> MatVec_column(vector<double> const & a, vector<double> const & x)   // column wise access
+{
+    int const nelem = static_cast<int>(a.size());      // #elements in matrix
+    int const mcols = static_cast<int>(x.size());      // #elements in vector <==> #columns in matrix
+
+    assert(nelem % mcols == 0);                        // nelem has to be a multiple of mcols (==> #rows)
+    int const nrows = nelem/mcols;                     // integer division!
+
+    vector<double> b(nrows);                 // allocate resulting vector
+
+    // if we do it directly we have cache issues - not optimal
+    // to make the code more efficient we change the two loops and put the b[i] inside the inner loop
+    // b is not so large compared to a, so higher amount of writing operations to not matter that much
+    for(size_t j = 0; j < mcols; ++j)
+    {
+        double xj = x[j];
+        for(size_t i = 0; i < nrows; ++i)
+        {
+            b[i] += a[j*nrows+i] * xj;
+        }
+    }
+
+    return b;
+}
+
+
+vector<double> MatMatProd(vector<double> const & a, vector<double> const & b, int const & L)
+{
+    size_t const a_nelem = a.size();
+    size_t const b_nelem = b.size();
+
+    assert(static_cast<int>(a_nelem) % L == 0 && static_cast<int>(b_nelem) % L == 0);
+
+    size_t M = a_nelem/L;
+    size_t N = b_nelem/L;
+
+    vector<double> c(N*M,0);
+
+    for(size_t i = 0; i < M; ++i)
+    {
+        for(size_t k = 0; k < L; ++k)
+        {
+            for(size_t j = 0; j < N; ++j)
+            {
+                c[i*M+j] = c[i*M+j] + a[i*L+k]*b[k*N+j];
+            }
+        }
+    }
+
+    return c;
+}
+
+
+vector<double> MatMatProd_cblas(vector<double> const & a, vector<double> const & b, int const & L)
+{
+    size_t const a_nelem = a.size();
+    size_t const b_nelem = b.size();
+
+    assert(static_cast<int>(a_nelem) % L == 0 && static_cast<int>(b_nelem) % L == 0);
+
+    size_t M = a_nelem/L;
+    size_t N = b_nelem/L;
+
+    vector<double> c(N*M,0);
+
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+				M, N, L,
+				1.0, a.data(), L, b.data(), N,
+				0.0, c.data(), N);
+
+    return c;
+}
+
+
+vector<double> PolynomEval(vector<double> const & a, vector<double> const & x)
+{
+    // we want to use the Horner-scheme
+    vector<double> sol(x.size(),0);
+
+    for(size_t i = 0; i < x.size(); ++i)
+    {
+        double tmp = a[a.size()-1];
+        for(int k = static_cast<int>(a.size())-2; k >= 0; --k)
+        {
+            tmp = tmp*x[i] + a[k];
+        }
+        sol[i] = tmp;
+    }
+
+    return sol;
+}
+
+
+void benchmark_A(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A: scalar product\n";
+
+    vector<double> x(N), y(N);
+    for(size_t k = 0; k < x.size(); ++k)
+    {
+        x[k] = (k % 219) + 1;
+        y[k] = 1.0/x[k];
+    }
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = scalar(x, y);
+        ss += sk;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_A_cblas(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A: scalar product with cblas\n";
+
+    vector<double> x(N), y(N);
+    for(size_t k = 0; k < x.size(); ++k)
+    {
+        x[k] = (k % 219) + 1;
+        y[k] = 1.0/x[k];
+    }
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = scalar_cblas(x, y);
+        ss += sk;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_A_kahan(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A: scalar product with Kahan summation\n";
+
+    vector<double> x(N), y(N);
+    for(size_t k = 0; k < x.size(); ++k)
+    {
+        x[k] = (k % 219) + 1;
+        y[k] = 1.0/x[k];
+    }
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = scalar(x, y);
+        ss += sk;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    //cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    //cout << "GiByte/s       : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_A_norm(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A_norm: euclidean norm\n";
+
+    vector<double> x(N,1.0);
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = norm_eucl(x);
+        ss += sk;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n ||x|| = " << sk << endl;
+    if (sk - sqrt(N) > 1e-7)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_B(int const & N, int const & M, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking B: Matrix-Vector Product (row wise access)\n";
+
+    vector<double> x(N), b(M), a(N*M);
+    // initialize data
+    for(size_t i = 0; i < M; ++i)
+    {
+        for(size_t j = 0; j < N; ++j)
+        {
+            a[i*N+j] = (i+j) % 219 + 1;
+        }
+    }
+    for(size_t i = 0; i < N; ++i)
+    {
+        x[i] = 1.0/a[17*N+i];
+    }
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        b = MatVec(a,x);
+        ss += b[0];                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <A[17,.],x> = " << b[17] << endl;
+    if (static_cast<unsigned int>(b[17]) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_B_cblas(int const & N, int const & M, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking B: Matrix-Vector Product with cblas (row wise access)\n";
+
+    vector<double> x(N), b(M), a(N*M);
+    // initialize data
+    for(size_t i = 0; i < M; ++i)
+    {
+        for(size_t j = 0; j < N; ++j)
+        {
+            a[i*N+j] = (i+j) % 219 + 1;
+        }
+    }
+    for(size_t i = 0; i < N; ++i)
+    {
+        x[i] = 1.0/a[17*N+i];
+    }
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        b = MatVec_cblas(a,x);
+        ss += b[0];                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <A[17,.],x> = " << b[17] << endl;
+    if (static_cast<unsigned int>(b[17]) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_B_column(int const & N, int const & M, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking B: Matrix-Vector Product (column wise access)\n";
+
+    vector<double> x(N), b(M), a(N*M);
+    // initialize data
+    for(size_t i = 0; i < M; ++i)
+    {
+        for(size_t j = 0; j < N; ++j)
+        {
+            a[i*N+j] = (i+j) % 219 + 1;
+        }
+    }
+    for(size_t i = 0; i < N; ++i)
+    {
+        x[i] = 1.0/a[17*N+i];
+    }
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        b = MatVec_column(a,x);
+        ss += b[0];                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <A[17,.],x> = " << b[17] << endl;
+    if (static_cast<unsigned int>(b[17]) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking C: Matrix-Matrix Product\n";
+
+    vector<double> a(M*L,1.0), b(L*N,1.0), c(N*M);
+    // with this data we get C[i,j] = L for all i and j
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        c = MatMatProd(a,b,L);
+        ss += c[0];                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n C[10,15] = " << c[10*N+15] << endl;
+    if (static_cast<unsigned int>(c[10*N+15]) != L)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_C_cblas(int const & N, int const & M, int const & L, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking C: Matrix-Matrix Product mit cblas\n";
+
+    vector<double> a(M*L,1.0), b(L*N,1.0), c(N*M);
+    // with this data we get C[i,j] = L for all i and j
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        c = MatMatProd_cblas(a,b,L);
+        ss += c[0];                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n C[10,15] = " << c[10*N+15] << endl;
+    if (static_cast<unsigned int>(c[10*N+15]) != L)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void benchmark_D(int const & p, int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking D: polynomial evaluation\n";
+
+    vector<double> x(N,1), sol(N), a(p+1);
+    for(size_t i = 0; i < a.size(); ++i)
+    {
+        a[i] = pow(-1.0,i);     // 1-x+x^2-x^3+x^4...
+    }
+    a[0] = 1;
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sol = PolynomEval(a,x);
+        ss += sol[0];                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "=="
+
+//##########################################################################
+// Check the correct result
+    cout << "\n p(x[0]) = " << sol[0] << endl;
+    if (static_cast<unsigned int>(sol[0]) != (static_cast<int>(a.size()) % 2))
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "p = " << p << "\t N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0*(p+1)*N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << N*(3+2*p) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
+    cout << endl << endl;
+
+    return;
+}
+
+
+void solver(int const & N, int const & nrhs)
+{
+	// generating the system matrix a
+	vector<double> a(N*N);
+	for(int i=0; i<N; ++i)
+	{
+		for(int j=0; j<N; ++j)
+		{
+			if(i==j)
+			{
+				a[i*N+j] = 4;
+			}
+			else
+			{
+				a[i*N+j] = 1.0/(i-j)/(i-j);
+			}
+		}
+	}
+	vector<double> a_old = a;
+	
+	// generating rhs
+	vector<double> rhs(N*nrhs);
+	for(int i=0; i<nrhs; ++i)
+	{
+		for(int j=0; j<N; ++j)
+		{
+			rhs[i*nrhs + j] = i+j-1;
+		}
+	}
+	vector<double> b_old = rhs;
+	
+	vector<int> ipiv(N);
+	// factorization
+	LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, a.data(), N, ipiv.data());
+	
+	auto t1 = system_clock::now(); // start timer
+	// Calculation of rhs -> b gets overwritten and contains the solution
+	LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, nrhs, a.data(), N, ipiv.data(), rhs.data(), nrhs);
+	auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    cout.precision(2);
+	cout << "Timing in sec. : " << t_diff << "	for nrhs= " << nrhs << endl;
+	
+	// Check for correct result
+	vector<double> ax = MatMatProd_cblas(a_old, rhs, N);
+	vector<double> diff(N*nrhs);
+	for(int i=0; i<N*nrhs; ++i)
+	{
+		diff[i] = ax[i] - b_old[i];
+	}
+	double diffnorm = norm_eucl(diff);
+	if(diffnorm > 1e-4)
+	{
+		cout << "  !!   W R O N G  result   !!\n";
+	}
+	
+	return;
+}
--- a/BSP_3_2to7/bsp_3_lib_bench.h
+++ b/BSP_3_2to7/bsp_3_lib_bench.h
@ -0,0 +1,205 @@
+#ifndef BSP_3_LIB_BENCH_H_INCLUDED
+#define BSP_3_LIB_BENCH_H_INCLUDED
+
+#include <vector>
+
+/** 	Inner product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+
+
+/** 	Inner product with cblas
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
+
+
+/** 	Inner product with Kahan summation
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar_kahan(std::vector<double> const &x, std::vector<double> const &y);
+
+
+/** 	euclidean norm
+	@param[in] x	vector
+	@return 	    resulting Euclidean norm
+*/
+double norm_eucl(std::vector<double> const &x);
+
+
+/** \brief Matrix-Vektor-Multiplikation (row-wise access)
+ *
+ * \param[in]   a   Matrix with row wise access
+ * \param[in]   x   vector which gets multiplied
+ * \return          resulting product a*x (vector)
+ *
+ */
+std::vector<double> MatVec(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Matrix-Vektor-Multiplikation mit cblas (row-wise access)
+ *
+ * \param[in]   a   Matrix with row wise access
+ * \param[in]   x   vector which gets multiplied
+ * \return          resulting product a*x (vector)
+ *
+ */
+std::vector<double> MatVec_cblas(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Matrix-Vektor-Multiplikation (column-wise access)
+ *
+ * \param[in]   a   Matrix with row wise access
+ * \param[in]   x   vector which gets multiplied
+ * \return          resulting product a*x (vector)
+ *
+ */
+std::vector<double> MatVec_column(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Matrix-Matrix-Multiplikation (row-wise access)
+ *
+ * \param[in]   a   matrix with row wise access (M*L)
+ * \param[in]   b   matrix with row wise access (L*N)
+ * \param[in]   L   inner dimension of the matrix product
+ * \return          resulting product a*b
+ *
+ */
+std::vector<double> MatMatProd(std::vector<double> const & a, std::vector<double> const & b, int const & L);
+
+
+/** \brief Matrix-Matrix-Multiplikation mit cblas (row-wise access)
+ *
+ * \param[in]   a   matrix with row wise access (M*L)
+ * \param[in]   b   matrix with row wise access (L*N)
+ * \param[in]   L   inner dimension of the matrix product
+ * \return          resulting product a*b
+ *
+ */
+std::vector<double> MatMatProd_cblas(std::vector<double> const & a, std::vector<double> const & b, int const & L);
+
+
+/** \brief Polynomauswertung an Stelle x
+ *
+ * \param[in]   a   Vekor mit den Koeffizienten des Polynoms a=[a0,a1,a2,...]
+ * \param[in]   x   Vektor, für welchen das Polynom ausgewertet werden soll
+ * \return          resulting vector p(x)
+ *
+ */
+std::vector<double> PolynomEval(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Benchmarking A - the scalar product
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A(int const & N, int const & Nloops);
+
+
+/** \brief Benchmarking A - the scalar product with cblas
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A_cblas(int const & N, int const & Nloops);
+
+
+
+/** \brief Benchmarking A - the scalar product with Kahan summation
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A_kahan(int const & N, int const & Nloops);
+
+
+/** \brief Benchmarking A - norm
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A_norm(int const & N, int const & Nloops);
+
+
+/** \brief Benchmarking B - matrix-vector product Ax=b (row wise access)
+ *
+ * \param   N   size of vector x
+ * \param   M   size of vector b (=> A: M*N)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_B(int const & N, int const & M, int const & Nloops);
+
+
+/** \brief Benchmarking B - matrix-vector product Ax=b with cblas (row wise access)
+ *
+ * \param   N   size of vector x
+ * \param   M   size of vector b (=> A: M*N)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_B_cblas(int const & N, int const & M, int const & Nloops);
+
+
+/** \brief Benchmarking B - matrix-vector product Ax=b (column wise access)
+ *
+ * \param   N   size of vector x
+ * \param   M   size of vector b (=> A: M*N)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_B_column(int const & N, int const & M, int const & Nloops);
+
+
+/** \brief Benchmarking C - Matrix-Matrix product C=A*B A_M*L, B_L*N
+ *
+ * \param   N
+ * \param   M
+ * \param   L
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops);
+
+
+/** \brief Benchmarking C - Matrix-Matrix product with cblas; C=A*B A_M*L, B_L*N
+ *
+ * \param   N
+ * \param   M
+ * \param   L
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_C_cblas(int const & N, int const & M, int const & L, int const & Nloops);
+
+
+/** \brief Benchmarking D - polynomial evaluation
+ *
+ * \param   p   the degree of the polynomial
+ * \param   N   size of the input vector x where p(x)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_D(int const & p, int const & N, int const & Nloops);
+
+
+/** \brief solving system of linear equations with time measurement
+ *
+ * \param   N   size of the system Ax=b with A(nxn)
+ * \param	nrhs	number of right hand sides b(nxnrhs)
+ *
+ */
+void solver(int const & N, int const & nrhs);
+#endif // BSP_3_LIB_BENCH_H_INCLUDED
--- a/BSP_3_2to7/main.cpp
+++ b/BSP_3_2to7/main.cpp
@ -0,0 +1,29 @@
+#include "bsp_3_lib_bench.h"
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+    benchmark_A(25*1e7,50);
+    benchmark_A_cblas(25*1e7,50);
+    benchmark_A_kahan(25*1e7,50);
+    benchmark_A_norm(25*1e7,50);
+    benchmark_B(8000,8000,400);
+    benchmark_B_column(8000,8000,400);
+	benchmark_B_cblas(8000,8000,400);
+    benchmark_C(4000,4000,4000,1);
+    benchmark_C_cblas(4000,4000,4000,10);
+    benchmark_D(1e4,1e5,15);
+
+    cout << "Solving system of linear equations\n";
+    vector<int> nrhsvec{1,2,4,8,16,32};
+    for(size_t i=0; i<nrhsvec.size(); ++i)
+    {
+		int nrhs = nrhsvec[i];
+		solver(5000,nrhs);
+	}
+	// the time for solving the system does not really depend on the number of the right hand sides
+
+    return 0;
+}
--- a/BSP_3_2to7/results_benchmarking.txt
+++ b/BSP_3_2to7/results_benchmarking.txt
@ -0,0 +1,134 @@
+Start Benchmarking A: scalar product
+
+ <x,y> = 2.5e+08
+
+
+N = 250000000
+Time for Nloops: 11
+Timing in sec. : 0.21
+GFLOPS         : 2.2
+GiByte/s       : 17
+
+
+
+Start Benchmarking A: scalar product with cblas
+
+ <x,y> = 2.5e+08
+
+
+N = 250000000
+Time for Nloops: 5.7
+Timing in sec. : 0.11
+GFLOPS         : 4.1
+GiByte/s       : 33
+
+
+
+Start Benchmarking A: scalar product with Kahan summation
+
+ <x,y> = 2.5e+08
+
+
+N = 250000000
+Time for Nloops: 11
+Timing in sec. : 0.22
+
+
+
+Start Benchmarking A_norm: euclidean norm
+
+ ||x|| = 1.6e+04
+
+
+N = 250000000
+Time for Nloops: 6.1
+Timing in sec. : 0.12
+GFLOPS         : 3.8
+GiByte/s       : 15
+
+
+
+Start Benchmarking B: Matrix-Vector Product (row wise access)
+
+ <A[17,.],x> = 8e+03
+
+
+N = 8000         M = 8000
+Time for Nloops: 10
+Timing in sec. : 0.026
+GFLOPS         : 4.7
+GiByte/s       : 37
+
+
+
+Start Benchmarking B: Matrix-Vector Product (column wise access)
+
+ <A[17,.],x> = 8e+03
+
+
+N = 8000         M = 8000
+Time for Nloops: 13
+Timing in sec. : 0.032
+GFLOPS         : 3.7
+GiByte/s       : 30
+
+
+
+Start Benchmarking B: Matrix-Vector Product with cblas (row wise access)
+
+ <A[17,.],x> = 8e+03
+
+
+N = 8000         M = 8000
+Time for Nloops: 13
+Timing in sec. : 0.032
+GFLOPS         : 3.8
+GiByte/s       : 30
+
+
+
+Start Benchmarking C: Matrix-Matrix Product
+
+ C[10,15] = 4e+03
+
+
+N = 4000         M = 4000        L = 4000
+Time for Nloops: 29
+Timing in sec. : 29
+GFLOPS         : 4
+GiByte/s       : 0.012
+
+
+
+Start Benchmarking C: Matrix-Matrix Product mit cblas
+
+ C[10,15] = 4e+03
+
+
+N = 4000         M = 4000        L = 4000
+Time for Nloops: 6.2
+Timing in sec. : 0.62
+GFLOPS         : 1.9e+02
+GiByte/s       : 0.58
+
+
+
+Start Benchmarking D: polynomial evaluation
+
+ p(x[0]) = 1
+
+
+p = 10000        N = 100000
+Time for Nloops: 17
+Timing in sec. : 1.2
+GFLOPS         : 1.6
+GiByte/s       : 13
+
+
+Solving system of linear equations
+Timing in sec. : 0.4    for nrhs= 1
+Timing in sec. : 0.4    for nrhs= 2
+Timing in sec. : 0.43   for nrhs= 4
+Timing in sec. : 0.47   for nrhs= 8
+Timing in sec. : 0.54   for nrhs= 16
+Timing in sec. : 0.48   for nrhs= 32