exercises 2 to 5

2025-11-13 02:29:23 +01:00 · 2025-11-13 02:29:23 +01:00 · ce217b844f
commit ce217b844f
parent 14b51cb405
5 changed files with 820 additions and 0 deletions
--- a/BSP_3_2to5/bsp_3_lib_bench.cpp
+++ b/BSP_3_2to5/bsp_3_lib_bench.cpp
@ -0,0 +1,519 @@
+#include "bsp_3_lib_bench.h"
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <ctime>
+
+using namespace std;
+using namespace std::chrono;  // timing
+
+/** Euclidean inner product of two equally long vectors.
+ *
+ *  @param[in] x  first vector
+ *  @param[in] y  second vector, read at the same positions as x
+ *  @return       sum of x[i]*y[i], accumulated from the first to the last element
+ */
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    // Length check; it is compiled out with -DNDEBUG.
+    assert(x.size() == y.size());
+
+    double acc = 0.0;
+    size_t pos = 0;
+    for (double const xi : x)
+    {
+        acc += xi * y[pos];
+        ++pos;
+    }
+    return acc;
+}
+
+
+/** Euclidean inner product using compensated (Kahan) summation.
+ *
+ *  @param[in] x  first vector
+ *  @param[in] y  second vector of the same length
+ *  @return       compensated sum of x[i]*y[i]
+ */
+double scalar_kahan(vector<double> const &x, vector<double> const &y)
+{
+    // Length check; it is compiled out with -DNDEBUG.
+    assert(x.size() == y.size());
+
+    size_t const len = x.size();
+    double total = 0.0;   // running sum
+    double comp  = 0.0;   // low-order part lost by the previous addition
+    size_t i = 0;
+    while (i < len)
+    {
+        double const term = x[i] * y[i] - comp;   // re-inject what was lost before
+        double const next = total + term;
+        comp  = next - total - term;              // what this addition just lost
+        total = next;
+        ++i;
+    }
+    return total;
+}
+
+
+/** Euclidean norm of a vector.
+ *
+ *  @param[in] x  vector
+ *  @return       square root of the sum of the squared entries (0 for an empty vector)
+ */
+double norm_eucl(std::vector<double> const &x)
+{
+    double squares = 0.0;
+    for (auto it = x.begin(); it != x.end(); ++it)
+    {
+        squares += (*it) * (*it);
+    }
+    return sqrt(squares);
+}
+
+
+/** Matrix-vector product b = A*x for a matrix stored row by row.
+ *
+ *  The number of columns is taken from x.size(); the number of rows follows
+ *  from a.size()/x.size().
+ *
+ *  @param[in] a  matrix entries, element (i,j) at a[i*ncols + j]
+ *  @param[in] x  vector to multiply with (its length is the column count)
+ *  @return       vector with one entry per matrix row; empty if x is empty
+ */
+vector<double> MatVec(vector<double> const & a, vector<double> const & x)   // row wise access
+{
+    size_t const nelem = a.size();      // #elements in matrix
+    size_t const mcols = x.size();      // #elements in vector <==> #columns in matrix
+
+    if (mcols == 0)
+    {
+        // No columns: the row count is undefined and nelem % mcols would divide by zero.
+        assert(nelem == 0);
+        return vector<double>();
+    }
+
+    assert(nelem % mcols == 0);         // nelem has to be a multiple of mcols (==> #rows)
+    size_t const nrows = nelem/mcols;   // integer division!
+
+    vector<double> b(nrows);            // allocate resulting vector
+
+    for(size_t i = 0; i < nrows; ++i)
+    {
+        double tmp = 0.0;
+        for(size_t j = 0; j < mcols; ++j)
+        {
+            tmp = tmp + a[i*mcols+j] * x[j];
+        }
+        b[i] = tmp;
+    }
+
+    return b;
+}
+
+
+/** Matrix-vector product b = A*x for a matrix stored column by column.
+ *
+ *  The number of columns is taken from x.size(); the number of rows follows
+ *  from a.size()/x.size().
+ *
+ *  @param[in] a  matrix entries, element (i,j) at a[j*nrows + i]
+ *  @param[in] x  vector to multiply with (its length is the column count)
+ *  @return       vector with one entry per matrix row; empty if x is empty
+ */
+vector<double> MatVec_column(vector<double> const & a, vector<double> const & x)   // column wise access
+{
+    size_t const nelem = a.size();      // #elements in matrix
+    size_t const mcols = x.size();      // #elements in vector <==> #columns in matrix
+
+    if (mcols == 0)
+    {
+        // No columns: the row count is undefined and nelem % mcols would divide by zero.
+        assert(nelem == 0);
+        return vector<double>();
+    }
+
+    assert(nelem % mcols == 0);         // nelem has to be a multiple of mcols (==> #rows)
+    size_t const nrows = nelem/mcols;   // integer division!
+
+    vector<double> b(nrows, 0.0);       // result is accumulated in place, so it must start at zero
+
+    // The column loop is the outer one so that a is traversed contiguously;
+    // the price is that b is updated once per column instead of once in total.
+    for(size_t j = 0; j < mcols; ++j)
+    {
+        double const xj = x[j];
+        for(size_t i = 0; i < nrows; ++i)
+        {
+            b[i] += a[j*nrows+i] * xj;
+        }
+    }
+
+    return b;
+}
+
+
+/** Matrix-matrix product C = A*B for row-wise stored matrices.
+ *
+ *  A is M x L, B is L x N; M and N are derived from the vector sizes and L.
+ *
+ *  @param[in] a  matrix A, element (i,k) at a[i*L + k]
+ *  @param[in] b  matrix B, element (k,j) at b[k*N + j]
+ *  @param[in] L  inner dimension (columns of A, rows of B); must be positive
+ *  @return       matrix C (M x N), element (i,j) at c[i*N + j]; empty if L <= 0
+ */
+vector<double> MatMatProd(vector<double> const & a, vector<double> const & b, int const & L)
+{
+    assert(L > 0);
+    if (L <= 0)
+    {
+        return vector<double>();        // avoids a division by zero in release builds
+    }
+    size_t const inner = static_cast<size_t>(L);
+
+    assert(a.size() % inner == 0 && b.size() % inner == 0);
+
+    size_t const M = a.size()/inner;    // rows of A and C
+    size_t const N = b.size()/inner;    // columns of B and C
+
+    vector<double> c(M*N, 0.0);
+
+    // i-k-j loop order: the innermost loop walks b and c contiguously.
+    for(size_t i = 0; i < M; ++i)
+    {
+        for(size_t k = 0; k < inner; ++k)
+        {
+            double const aik = a[i*inner+k];
+            for(size_t j = 0; j < N; ++j)
+            {
+                c[i*N+j] = c[i*N+j] + aik*b[k*N+j];   // row stride of C is N
+            }
+        }
+    }
+
+    return c;
+}
+
+
+/** Evaluates a polynomial at every entry of x using the Horner scheme.
+ *
+ *  @param[in] a  coefficients in ascending order, a = [a0, a1, a2, ...]
+ *  @param[in] x  evaluation points
+ *  @return       vector with p(x[i]) for every i; all zeros if a is empty
+ */
+vector<double> PolynomEval(vector<double> const & a, vector<double> const & x)
+{
+    vector<double> sol(x.size(), 0.0);
+    if (a.empty())
+    {
+        return sol;     // no coefficients: treated as the zero polynomial
+    }
+
+    for(size_t i = 0; i < x.size(); ++i)
+    {
+        double const xi = x[i];
+        double tmp = a.back();                       // start with the leading coefficient
+        for(size_t k = a.size()-1; k > 0; --k)
+        {
+            tmp = tmp*xi + a[k-1];
+        }
+        sol[i] = tmp;
+    }
+
+    return sol;
+}
+
+
+/** Benchmark A: times the scalar product <x,y> of two vectors of length N.
+ *
+ *  x cycles through the values 1..219 and y holds the reciprocals, so the
+ *  expected inner product is N. Prints the result, a warning if it does not
+ *  round to N, and the timing/performance figures.
+ *
+ *  @param[in] N       length of both vectors
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_A(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A: scalar product\n";
+
+    vector<double> x(N), y(N);
+    for(size_t k = 0; k < x.size(); ++k)
+    {
+        x[k] = static_cast<double>(k % 219 + 1);
+        y[k] = 1.0/x[k];
+    }
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = scalar(x, y);
+        ss += sk;                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    // Round to nearest instead of truncating: a sum marginally below N is still accepted,
+    // and no signed/unsigned comparison is involved.
+    if (std::llround(sk) != static_cast<long long>(N))
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+}
+
+
+/** Benchmark A (Kahan): times the scalar product computed with Kahan summation.
+ *
+ *  Uses the same data as the plain scalar-product benchmark (x cycles through
+ *  1..219, y holds the reciprocals), so the expected inner product is N.
+ *
+ *  @param[in] N       length of both vectors
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_A_kahan(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A: scalar product with Kahan summation\n";
+
+    vector<double> x(N), y(N);
+    for(size_t k = 0; k < x.size(); ++k)
+    {
+        x[k] = static_cast<double>(k % 219 + 1);
+        y[k] = 1.0/x[k];
+    }
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = scalar_kahan(x, y);    // the Kahan variant is what this benchmark has to time
+        ss += sk;                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    // Round to nearest instead of truncating; avoids a signed/unsigned comparison as well.
+    if (std::llround(sk) != static_cast<long long>(N))
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    // GFLOPS and GiByte/s are not printed for this variant.
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << endl << endl;
+}
+
+
+/** Benchmark A (norm): times the Euclidean norm of a vector of N ones.
+ *
+ *  The expected result is sqrt(N).
+ *
+ *  @param[in] N       length of the vector
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_A_norm(int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking A_norm: euclidean norm\n";
+
+    vector<double> x(N,1.0);
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double sk(0.0), ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sk = norm_eucl(x);
+        ss += sk;                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n ||x|| = " << sk << endl;
+    // Absolute deviation: a norm that is too small must be reported as well.
+    if (std::abs(sk - std::sqrt(static_cast<double>(N))) > 1e-7)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+}
+
+
+/** Benchmark B: times the matrix-vector product b = A*x with row-wise stored A.
+ *
+ *  A is M x N with A(i,j) = (i+j) % 219 + 1. x holds the reciprocals of one
+ *  matrix row (row 17, or the last row if M <= 17), so the corresponding entry
+ *  of b is expected to be N.
+ *
+ *  @param[in] N       length of x (columns of A)
+ *  @param[in] M       length of b (rows of A)
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_B(int const & N, int const & M, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking B: Matrix-Vector Product (row wise access)\n";
+
+    if (N < 1 || M < 1)
+    {
+        cout << "  !!   invalid dimensions   !!\n";
+        return;
+    }
+    // size_t arithmetic: N*M computed in int overflows for large matrices
+    size_t const ncols = static_cast<size_t>(N);
+    size_t const nrows = static_cast<size_t>(M);
+    size_t const chk   = (nrows > 17) ? 17 : nrows - 1;   // row used for the result check
+
+    vector<double> x(ncols), b(nrows), a(ncols*nrows);
+    // initialize data
+    for(size_t i = 0; i < nrows; ++i)
+    {
+        for(size_t j = 0; j < ncols; ++j)
+        {
+            a[i*ncols+j] = static_cast<double>((i+j) % 219 + 1);
+        }
+    }
+    for(size_t j = 0; j < ncols; ++j)
+    {
+        x[j] = 1.0/a[chk*ncols+j];
+    }
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        b = MatVec(a,x);
+        ss += b[0];                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <A[" << chk << ",.],x> = " << b[chk] << endl;
+    // Round to nearest instead of truncating; avoids a signed/unsigned comparison as well.
+    if (std::llround(b[chk]) != static_cast<long long>(N))
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+}
+
+
+/** Benchmark B (column): times b = A*x with column-wise stored A.
+ *
+ *  A is M x N with A(i,j) = (i+j) % 219 + 1, stored column by column as
+ *  MatVec_column indexes it. x holds the reciprocals of one matrix row
+ *  (row 17, or the last row if M <= 17), so the corresponding entry of b is
+ *  expected to be N.
+ *
+ *  @param[in] N       length of x (columns of A)
+ *  @param[in] M       length of b (rows of A)
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_B_column(int const & N, int const & M, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking B: Matrix-Vector Product (column wise access)\n";
+
+    if (N < 1 || M < 1)
+    {
+        cout << "  !!   invalid dimensions   !!\n";
+        return;
+    }
+    // size_t arithmetic: N*M computed in int overflows for large matrices
+    size_t const ncols = static_cast<size_t>(N);
+    size_t const nrows = static_cast<size_t>(M);
+    size_t const chk   = (nrows > 17) ? 17 : nrows - 1;   // row used for the result check
+
+    vector<double> x(ncols), b(nrows), a(ncols*nrows);
+    // initialize data in column-major order, element (i,j) at a[j*nrows+i];
+    // a row-major fill only matches this layout when the matrix is square
+    for(size_t j = 0; j < ncols; ++j)
+    {
+        for(size_t i = 0; i < nrows; ++i)
+        {
+            a[j*nrows+i] = static_cast<double>((i+j) % 219 + 1);
+        }
+    }
+    for(size_t j = 0; j < ncols; ++j)
+    {
+        x[j] = 1.0/a[j*nrows+chk];
+    }
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        b = MatVec_column(a,x);
+        ss += b[0];                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <A[" << chk << ",.],x> = " << b[chk] << endl;
+    // Round to nearest instead of truncating; avoids a signed/unsigned comparison as well.
+    if (std::llround(b[chk]) != static_cast<long long>(N))
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+    cout << endl << endl;
+}
+
+
+/** Benchmark C: times the matrix-matrix product C = A*B.
+ *
+ *  A (M x L) and B (L x N) are filled with ones, so every entry of C is
+ *  expected to be L. One entry is checked: C[10,15], or the last row/column
+ *  if the matrix is smaller than that.
+ *
+ *  @param[in] N       columns of B and C
+ *  @param[in] M       rows of A and C
+ *  @param[in] L       inner dimension (columns of A, rows of B)
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking C: Matrix-Matrix Product\n";
+
+    if (N < 1 || M < 1 || L < 1)
+    {
+        cout << "  !!   invalid dimensions   !!\n";
+        return;
+    }
+    // size_t arithmetic: the element counts computed in int overflow for large matrices
+    size_t const n = static_cast<size_t>(N);
+    size_t const m = static_cast<size_t>(M);
+    size_t const l = static_cast<size_t>(L);
+    size_t const ci = (m > 10) ? 10 : m - 1;   // row of the checked entry
+    size_t const cj = (n > 15) ? 15 : n - 1;   // column of the checked entry
+
+    vector<double> a(m*l,1.0), b(l*n,1.0), c(n*m);
+    // with this data we get C[i,j] = L for all i and j
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        c = MatMatProd(a,b,L);
+        ss += c[0];                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n C[" << ci << "," << cj << "] = " << c[ci*n+cj] << endl;
+    // Round to nearest instead of truncating; avoids a signed/unsigned comparison as well.
+    if (std::llround(c[ci*n+cj]) != static_cast<long long>(L))
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    // number of matrix entries touched, computed in double to avoid int overflow
+    double const entries = static_cast<double>(L) * (static_cast<double>(N) + static_cast<double>(M))
+                         + static_cast<double>(M) * static_cast<double>(N);
+    cout << endl;
+    cout.precision(2);
+    cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << entries / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
+    cout << endl << endl;
+}
+
+
+/** Benchmark D: times the polynomial evaluation at N points.
+ *
+ *  The polynomial is 1 - x + x^2 - x^3 + ... of degree p and every evaluation
+ *  point is 1, so p(1) is 1 for an odd number of coefficients and 0 otherwise.
+ *
+ *  @param[in] p       degree of the polynomial (p+1 coefficients); must be >= 0
+ *  @param[in] N       number of evaluation points; must be >= 1
+ *  @param[in] Nloops  number of timed repetitions
+ */
+void benchmark_D(int const & p, int const & N, int const & Nloops)
+{
+    //##########################################################################
+    cout << "\nStart Benchmarking D: polynomial evaluation\n";
+
+    if (p < 0 || N < 1)
+    {
+        cout << "  !!   invalid dimensions   !!\n";
+        return;
+    }
+
+    vector<double> x(N,1.0), sol(N), a(static_cast<size_t>(p)+1);
+    for(size_t i = 0; i < a.size(); ++i)
+    {
+        a[i] = (i % 2 == 0) ? 1.0 : -1.0;     // 1-x+x^2-x^3+x^4...
+    }
+
+    // steady_clock is monotonic; a wall-clock adjustment cannot distort the interval
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double ss(0.0);
+    for (int i = 0; i < Nloops; ++i)
+    {
+        sol = PolynomEval(a,x);
+        ss += sol[0];                   // intended to keep the optimizer from dropping the loop
+    }
+
+    auto const t2 = steady_clock::now();  // stop timer
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/Nloops;                                      // duration per loop seconds
+
+//##########################################################################
+// Check the correct result
+    cout << "\n p(x[0]) = " << sol[0] << endl;
+    // The alternating sum of ones is 1 for an odd coefficient count and 0 for an even one.
+    long long const expected = static_cast<long long>(a.size() % 2);
+    if (std::llround(sol[0]) != expected)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "p = " << p << "\t N = " << N << endl;
+    cout << "Time for Nloops: " << t_diff*Nloops << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0*(p+1)*N / t_diff / 1024 / 1024 / 1024 << endl;
+    // computed in double: N*(3+2*p) in int is at the edge of overflow for the sizes used in main
+    cout << "GiByte/s       : " << static_cast<double>(N)*(3.0+2.0*p) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
+    cout << endl << endl;
+}
--- a/BSP_3_2to5/bsp_3_lib_bench.h
+++ b/BSP_3_2to5/bsp_3_lib_bench.h
@ -0,0 +1,136 @@
+#ifndef BSP_3_LIB_BENCH_H_INCLUDED
+#define BSP_3_LIB_BENCH_H_INCLUDED
+
+#include <vector>
+
+/** 	Inner product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+
+
+/** 	Inner product with Kahan summation
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar_kahan(std::vector<double> const &x, std::vector<double> const &y);
+
+
+/** 	euclidean norm
+	@param[in] x	vector
+	@return 	    resulting Euclidean norm
+*/
+double norm_eucl(std::vector<double> const &x);
+
+
+/** \brief Matrix-vector multiplication (row-wise access)
+ *
+ * \param[in]   a   Matrix with row wise access
+ * \param[in]   x   vector which gets multiplied
+ * \return          resulting product a*x (vector)
+ *
+ */
+std::vector<double> MatVec(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Matrix-vector multiplication (column-wise access)
+ *
+ * \param[in]   a   Matrix with column wise access (stored column by column)
+ * \param[in]   x   vector which gets multiplied
+ * \return          resulting product a*x (vector)
+ *
+ */
+std::vector<double> MatVec_column(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Matrix-matrix multiplication (row-wise access)
+ *
+ * \param[in]   a   matrix with row wise access (M*L)
+ * \param[in]   b   matrix with row wise access (L*N)
+ * \param[in]   L   inner dimension of the matrix product
+ * \return          resulting product a*b
+ *
+ */
+std::vector<double> MatMatProd(std::vector<double> const & a, std::vector<double> const & b, int const & L);
+
+
+/** \brief Polynomial evaluation at the points x
+ *
+ * \param[in]   a   vector with the coefficients of the polynomial a=[a0,a1,a2,...]
+ * \param[in]   x   vector of points at which the polynomial is evaluated
+ * \return          resulting vector p(x)
+ *
+ */
+std::vector<double> PolynomEval(std::vector<double> const & a, std::vector<double> const & x);
+
+
+/** \brief Benchmarking A - the scalar product
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A(int const & N, int const & Nloops);
+
+
+/** \brief Benchmarking A - the scalar product with Kahan summation
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A_kahan(int const & N, int const & Nloops);
+
+
+/** \brief Benchmarking A - norm
+ *
+ * \param   N   size of the vector
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_A_norm(int const & N, int const & Nloops);
+
+
+/** \brief Benchmarking B - matrix-vector product Ax=b (row wise access)
+ *
+ * \param   N   size of vector x
+ * \param   M   size of vector b (=> A: M*N)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_B(int const & N, int const & M, int const & Nloops);
+
+
+/** \brief Benchmarking B - matrix-vector product Ax=b (column wise access)
+ *
+ * \param   N   size of vector x
+ * \param   M   size of vector b (=> A: M*N)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_B_column(int const & N, int const & M, int const & Nloops);
+
+
+/** \brief Benchmarking C - Matrix-Matrix product C=A*B A_M*L, B_L*N
+ *
+ * \param   N   number of columns of B and C
+ * \param   M   number of rows of A and C
+ * \param   L   inner dimension (columns of A, rows of B)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops);
+
+
+/** \brief Benchmarking D - polynomial evaluation
+ *
+ * \param   p   the degree of the polynomial
+ * \param   N   size of the input vector x where p(x)
+ * \param   Nloops  number of iterations we want to do for the measuring
+ *
+ */
+void benchmark_D(int const & p, int const & N, int const & Nloops);
+#endif // BSP_3_LIB_BENCH_H_INCLUDED
--- a/BSP_3_2to5/bsp_3_results_2-5.txt
+++ b/BSP_3_2to5/bsp_3_results_2-5.txt
@ -0,0 +1,106 @@
+Aufgabe 2: Übersicht über memory, number of floating point operations and writing/reading operations
+
+			memory (*8, in Bytes)	FLOP		writing/reading
+A - scalar		2N			2N		2N
+B - MatVec		N*M+N			2*N*M		2*N*M+M
+C - MatMatProd		L*(M+N)			2*L*M*N		L*N+L*M+M*N
+D - PolyEval		p+1+N			2*(p+1)*N	2*(p+1)*N + N
+
+Aufgabe 3-5: Ergebnisse sämtlicher Benchmark-Tests
+
+Start Benchmarking A: scalar product
+
+ <x,y> = 2.5e+08
+
+
+N = 250000000
+Time for Nloops: 20
+Timing in sec. : 0.79
+GFLOPS         : 0.59
+GiByte/s       : 4.7
+
+
+
+Start Benchmarking A: scalar product with Kahan summation
+
+ <x,y> = 2.5e+08
+
+
+N = 250000000
+Time for Nloops: 23
+Timing in sec. : 0.92
+
+
+
+Start Benchmarking A_norm: euclidean norm
+
+ ||x|| = 1.6e+04
+
+
+N = 250000000
+Time for Nloops: 22
+Timing in sec. : 0.88
+GFLOPS         : 0.53
+GiByte/s       : 2.1
+
+
+
+Start Benchmarking B: Matrix-Vector Product (row wise access)
+
+ <A[17,.],x> = 8e+03
+
+
+N = 8000         M = 8000
+Time for Nloops: 17
+Timing in sec. : 0.23
+GFLOPS         : 0.52
+GiByte/s       : 4.1
+
+
+
+Start Benchmarking B: Matrix-Vector Product (column wise access)
+
+ <A[17,.],x> = 8e+03
+
+
+N = 8000         M = 8000
+Time for Nloops: 18
+Timing in sec. : 0.24
+GFLOPS         : 0.51
+GiByte/s       : 4
+
+
+
+Start Benchmarking C: Matrix-Matrix Product
+
+ C[10,15] = 1e+03
+
+
+N = 1000         M = 1000        L = 1000
+Time for Nloops: 19
+Timing in sec. : 6.5
+GFLOPS         : 0.29
+GiByte/s       : 0.0034
+
+
+
+Start Benchmarking D: polynomial evaluation
+
+ p(x[0]) = 1
+
+
+p = 10000        N = 100000
+Time for Nloops: 19
+Timing in sec. : 3.8
+GFLOPS         : 0.49
+GiByte/s       : 4
+
+
+Anmerkungen zu Bsp 5:
+A:	die Berechnung der Norm benötigt länger, obwohl nur ein Vektor beteiligt ist; sqrt ist rechenintensiv im Vergleich zu +,-,/,*
+B:	kahan summation benötigt etwas länger
+C:	keine gravierenden Unterschiede feststellbar (column wise aber bereits optimiert)
+
+Anmerkungen zu Bsp 4:
+bei C) Matrix*Matrix wurde nicht das 10-100 fache des L3 Caches verwendet, da die Berechnung dann irrsinnig lange gebraucht hätte;
+dadurch etwas kleinere Dimensionen
--- a/BSP_3_2to5/bsp_3_x.cbp
+++ b/BSP_3_2to5/bsp_3_x.cbp
@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+	<FileVersion major="1" minor="6" />
+	<Project>
+		<Option title="bsp_3_x" />
+		<Option pch_mode="2" />
+		<Option compiler="gcc" />
+		<Build>
+			<Target title="Debug">
+				<Option output="bin/Debug/bsp_3_x" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Debug/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-g" />
+				</Compiler>
+			</Target>
+			<Target title="Release">
+				<Option output="bin/Release/bsp_3_x" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Release/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-O2" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+				</Linker>
+			</Target>
+		</Build>
+		<Compiler>
+			<Add option="-Wall" />
+			<Add option="-fexceptions" />
+		</Compiler>
+		<Unit filename="bsp_3_lib_bench.cpp" />
+		<Unit filename="bsp_3_lib_bench.h" />
+		<Unit filename="main.cpp" />
+		<Extensions>
+			<lib_finder disable_auto="1" />
+		</Extensions>
+	</Project>
+</CodeBlocks_project_file>
--- a/BSP_3_2to5/main.cpp
+++ b/BSP_3_2to5/main.cpp
@ -0,0 +1,17 @@
+#include "bsp_3_lib_bench.h"
+#include <iostream>
+
+using namespace std;
+
+/** Runs all benchmarks once with fixed problem sizes. */
+int main()
+{
+    // Vector benchmarks (scalar product, Kahan variant, norm)
+    int const vec_len   = 250000000;   // 25*1e7 entries
+    int const vec_loops = 25;
+    benchmark_A(vec_len, vec_loops);
+    benchmark_A_kahan(vec_len, vec_loops);
+    benchmark_A_norm(vec_len, vec_loops);
+
+    // Matrix-vector benchmarks (row wise and column wise)
+    int const mv_dim   = 8000;
+    int const mv_loops = 75;
+    benchmark_B(mv_dim, mv_dim, mv_loops);
+    benchmark_B_column(mv_dim, mv_dim, mv_loops);
+
+    // Matrix-matrix benchmark
+    int const mm_dim = 1000;
+    benchmark_C(mm_dim, mm_dim, mm_dim, 3);
+
+    // Polynomial evaluation: degree 10^4 at 10^5 points
+    int const poly_degree = 10000;
+    int const poly_points = 100000;
+    benchmark_D(poly_degree, poly_points, 5);
+
+    return 0;
+}