Upload files to "ex3_benchmarks"

2025-11-11 16:16:28 +01:00 · 2025-11-11 16:16:28 +01:00 · 1e81786622
commit 1e81786622
parent 90e669c6de
5 changed files with 3246 additions and 0 deletions
--- a/ex3_benchmarks/Doxyfile
+++ b/ex3_benchmarks/Doxyfile
--- a/ex3_benchmarks/benchmark_tests.cpp
+++ b/ex3_benchmarks/benchmark_tests.cpp
@ -0,0 +1,333 @@
 #include "benchmark_tests.h"
 #include "benchmarks.h"
 #include <chrono>
 #include <iostream>
 #include <math.h>       
 using namespace std::chrono; 
 /** Benchmark (A): times an inner-product kernel on two vectors of length @p N.
  *
  *  Data: x_i = (i mod 219) + 1 and y_i = 1/x_i, hence <x,y> equals N up to rounding.
  *  @p scalar_function is called @p NLOOPS times; afterwards the norm sqrt(scalar(x, x))
  *  is timed in the same way (that second timing is only printed).
  *  All measurements are printed to cout.
  *
  *  @param[in] NLOOPS           number of timed repetitions; used as a divisor, so pass >= 1
  *  @param[in] N                vector length
  *  @param[in] scalar_function  inner-product implementation under test
  *  @return {seconds per call, GFLOPS, GiByte/s} of @p scalar_function
  *          (both rates are scaled by 1024^3)
  */
 vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function)
 {
     cout << "#################### (A) ####################" << endl;
     cout << "\nLOOPS = " << NLOOPS << endl;
     cout << "\nN = " << N << endl;
     // Memory allocation
     cout << "Memory allocation\n";
     vector<double> x(N), y(N);
     cout.precision(2);
     cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
     cout.precision(6);
     // Data initialization:  x_i = (i mod 219) + 1;  y_i = 1/x_i  ==> <x,y> == N (up to rounding)
     for (size_t i = 0; i < N; ++i)
     {
         x[i] = i % 219 + 1;
         y[i] = 1.0/x[i];
     }
     cout << "\nStart Benchmarking scalar\n";
     // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted.
     auto t1 = steady_clock::now(); // start timer
     // Do calculation
     double check(0.0), ss(0.0);
     for (size_t i = 0; i < NLOOPS; ++i)
     {
         check = scalar_function(x, y);
         ss += check;
     }
     auto t2 = steady_clock::now();  // stop timer
     volatile double sink = ss;      // observable store, so the accumulated results are really used
     (void)sink;
     auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
     double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
     t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
     // Check the correct result with a relative tolerance: truncating the double to an
     // integer would reject a result that is only one rounding error below N.
     cout << "\n <x,y> = " << check << endl;
     const double expected = static_cast<double>(N);
     if (fabs(check - expected) > 1e-6 * expected)
         cout << "  !!   W R O N G  result   !!\n";
     cout << endl;
     // Timings  and Performance
     cout << endl;
     cout.precision(2);
     double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
     double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
     cout << "Total duration : " << t_diff*NLOOPS << endl;
     cout << "Timing in sec. : " << t_diff << endl;
     cout << "GFLOPS         : " << Gflops << endl;
     cout << "GiByte/s       : " << MemBandwidth << endl;
     //##########################################################################
     cout << "\nStart Benchmarking norm\n";
     auto t3 = steady_clock::now(); // start timer
     // Do calculation
     double ss2(0.0);
     for (size_t i = 0; i < NLOOPS; ++i)
     {
         auto sk1 = sqrt(scalar(x, x));
         ss2 += sk1;                   // printed below, so the results are used
     }
     auto t4 = steady_clock::now();  // stop timer
     auto duration2 = duration_cast<microseconds>(t4 - t3);         // duration in microseconds
     double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
     t_diff2 = t_diff2/NLOOPS;                                      // duration per loop seconds
     cout << "ss(norm): " << ss2 << endl;
     cout << "Timing in sec. : " << t_diff2 << endl;
     return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (B): times a dense matrix-vector product b = A*x.
  *
  *  A is a row-major M x N matrix with A[i][j] = ((i + j) mod 219) + 1. x holds the
  *  reciprocals of one row of A (row 17, or the last row if M < 18), so the matching
  *  entry of b equals N up to rounding. All measurements are printed to cout.
  *
  *  @param[in] NLOOPS           number of timed repetitions; used as a divisor, so pass >= 1
  *  @param[in] N                number of columns of A (length of x)
  *  @param[in] M                number of rows of A (length of b)
  *  @param[in] MatVec_function  matrix-vector implementation under test; must return M entries
  *  @return {seconds per call, GFLOPS, GiByte/s} (both rates are scaled by 1024^3);
  *          {0, 0, 0} if M or N is zero
  */
 vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function)
 {
     cout << "#################### (B) ####################" << endl;
     cout << "\nLOOPS = " << NLOOPS << endl;
     cout << "\nN = " << N << endl;
     cout << "\nM = " << M << endl;
     if (M == 0 || N == 0)
     {
         // Nothing to multiply; indexing the empty containers below would be undefined.
         cout << "Empty problem (M == 0 or N == 0): nothing to benchmark\n";
         return vector<double>{0.0, 0.0, 0.0};
     }
     // Row used for the result check: 17 as before, clamped so that small M stays in bounds.
     const size_t row = (M > 17) ? 17 : M - 1;
     // Memory allocation
     cout << "Memory allocation\n";
     vector<double> A(M*N);
     vector<double> x(N);
     cout.precision(2);
     cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
     cout.precision(6);
     // Data initialization
     for (size_t i = 0; i < M; ++i)
         for (size_t j = 0; j < N; ++j)
             A[N*i + j] = (i + j) % 219 + 1;
     for (size_t j = 0; j < N; ++j)
     {
         x[j] = 1.0/A[N*row + j];      // ==> <A[row,*],x> == N
     }
     cout << "\nStart Benchmarking MatVec\n";
     // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted.
     auto t1 = steady_clock::now(); // start timer
     // Do calculation
     vector<double> b(M);
     for (size_t i = 0; i < NLOOPS; ++i)
     {
         b = MatVec_function(A, x);
     }
     auto t2 = steady_clock::now();  // stop timer
     auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
     double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
     t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
     // Check the correct result with a relative tolerance: truncating the double to an
     // integer would reject a result that is only one rounding error below N.
     cout << "\n <A[" << row << ",*],x> = " << b[row] << endl;
     const double expected = static_cast<double>(N);
     if (fabs(b[row] - expected) > 1e-6 * expected)
     {
         cout << "  !!   W R O N G  result   !!\n";
     }
     cout << endl;
     // Timings  and Performance
     cout << endl;
     cout.precision(2);
     double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
     double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
     cout << "Total duration : " << t_diff*NLOOPS << endl;
     cout << "Timing in sec. : " << t_diff << endl;
     cout << "GFLOPS         : " << Gflops << endl;
     cout << "GiByte/s       : " << MemBandwidth << endl;
     return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (C): times a dense matrix-matrix product C = A*B.
  *
  *  A is a row-major M x L matrix with A[i][k] = ((i + k) mod 219) + 1. Every column of the
  *  L x N matrix B holds the reciprocals of one row of A (row 17, or the last row if M < 18),
  *  so the first entry of that row of C equals L up to rounding.
  *  All measurements are printed to cout.
  *
  *  @param[in] NLOOPS           number of timed repetitions; used as a divisor, so pass >= 1
  *  @param[in] L                shared dimension (columns of A, rows of B)
  *  @param[in] M                number of rows of A and C
  *  @param[in] N                number of columns of B and C
  *  @param[in] MatMat_function  matrix-matrix implementation under test; must return M*N entries
  *  @return {seconds per call, GFLOPS, GiByte/s} (both rates are scaled by 1024^3);
  *          {0, 0, 0} if L, M or N is zero
  */
 vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function)
 {
     cout << "#################### (C) ####################" << endl;
     cout << "\nLOOPS = " << NLOOPS << endl;
     cout << "\nL = " << L << endl;
     cout << "\nM = " << M << endl;
     cout << "\nN = " << N << endl;
     if (L == 0 || M == 0 || N == 0)
     {
         // Nothing to multiply; indexing the empty containers below would be undefined.
         cout << "Empty problem (L, M or N is 0): nothing to benchmark\n";
         return vector<double>{0.0, 0.0, 0.0};
     }
     // Row used for the result check: 17 as before, clamped so that small M stays in bounds.
     const size_t row = (M > 17) ? 17 : M - 1;
     // Memory allocation
     cout << "Memory allocation\n";
     vector<double> A(M*L);
     vector<double> B(L*N);
     cout.precision(2);
     cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
     cout.precision(6);
     // Data initialization
     for (size_t i = 0; i < M; ++i)
         for (size_t k = 0; k < L; ++k)
             A[L*i + k] = (i + k) % 219 + 1;
     for (size_t k = 0; k < L; ++k)
         for (size_t j = 0; j < N; ++j)
             B[N*k + j] = 1.0/A[L*row + k];   // ==> C[row,j] == L
     cout << "\nStart Benchmarking MatMat\n";
     // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted.
     auto t1 = steady_clock::now(); // start timer
     // Do calculation
     vector<double> C(M*N);
     double check = 0.0;
     double check_sum = 0.0;        // was read uninitialized before
     for (size_t i = 0; i < NLOOPS; ++i)
     {
         C = MatMat_function(A, B, L);
         check = C[N*row];
         check_sum += check; // prevents the optimizer from removing unused calculation results.
     }
     auto t2 = steady_clock::now();  // stop timer
     cout << check_sum;              // printed after the timer stops, so I/O is not measured
     auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
     double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
     t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
     // Check the correct result with a relative tolerance: truncating the double to an
     // integer would reject a result that is only one rounding error below L.
     cout << "\n C[" << row << ",0] = " << check << endl;
     const double expected = static_cast<double>(L);
     if (fabs(check - expected) > 1e-6 * expected)
     {
         cout << "  !!   W R O N G  result   !!, should be " << L <<"\n";
     }
     cout << endl;
     // Timings  and Performance
     cout << endl;
     cout.precision(2);
     double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
     double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
     cout << "Total duration : " << t_diff*NLOOPS << endl;
     cout << "Timing in sec. : " << t_diff << endl;
     cout << "GFLOPS         : " << Gflops << endl;
     cout << "GiByte/s       : " << MemBandwidth << endl;
     return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (D): times the evaluation of a polynomial of degree @p p at @p N points.
  *
  *  Coefficients are a[k] = (-1)^k, i.e. poly(x) = 1 - x + x^2 - x^3 + ..., and the
  *  evaluation points are x_j = j, so poly(x_0) = poly(0) must equal 1.
  *  All measurements are printed to cout.
  *
  *  @param[in] NLOOPS  number of timed repetitions; used as a divisor, so pass >= 1
  *  @param[in] N       number of evaluation points
  *  @param[in] p       polynomial degree (p + 1 coefficients)
  *  @return {seconds per call, GFLOPS, GiByte/s} (both rates are scaled by 1024^3);
  *          {0, 0, 0} if N is zero
  */
 vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
 {
     cout << "#################### (D) ####################" << endl;
     cout << "\nLOOPS = " << NLOOPS << endl;
     cout << "\nN = " << N << endl;
     cout << "\np = " << p << endl;
     if (N == 0)
     {
         // The result check reads x[0] and y[0], which do not exist for N == 0.
         cout << "Empty problem (N == 0): nothing to benchmark\n";
         return vector<double>{0.0, 0.0, 0.0};
     }
     // Memory allocation
     cout << "Memory allocation\n";
     vector<double> a(p + 1, 0);
     vector<double> x(N);
     cout.precision(2);
     cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
     cout.precision(6);
     // Data initialization
     for (size_t j = 0; j < N; ++j)
         x[j] = 1.0*j;
     for (size_t k = 0; k < p + 1; ++k)
         a[k] = (k % 2 == 0) ? 1.0 : -1.0;   // (-1)^k:  poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
     cout << "\nStart Benchmarking poly\n";
     // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted.
     auto t1 = steady_clock::now(); // start timer
     // Do calculation
     vector<double> y(N);
     double check = 0.0;
     double check_sum = 0.0;        // was read uninitialized before
     for (size_t i = 0; i < NLOOPS; ++i)
     {
         y = poly(a, x);
         check = y[0];
         check_sum += check;
     }
     auto t2 = steady_clock::now();  // stop timer
     volatile double sink = check_sum;   // observable store, so the accumulated results are really used
     (void)sink;
     auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
     double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
     t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
     // Check the correct result
     cout << "\n poly(" << x[0] << ") = " << check << endl;
     if (fabs(check - 1.0) > 1.0/1e6)    // fabs: always the floating-point absolute value
     {
         cout << "  !!   W R O N G  result   !!\n";
     }
     cout << endl;
     // Timings  and Performance
     cout << endl;
     cout.precision(2);
     double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
     double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
     cout << "Total duration : " << t_diff*NLOOPS << endl;
     cout << "Timing in sec. : " << t_diff << endl;
     cout << "GFLOPS         : " << Gflops << endl;
     cout << "GiByte/s       : " << MemBandwidth << endl;
     return vector<double>{t_diff, Gflops, MemBandwidth};
 }
--- a/ex3_benchmarks/benchmark_tests.h
+++ b/ex3_benchmarks/benchmark_tests.h
@ -0,0 +1,15 @@
 #pragma once
 #include <vector>
 #include <functional>
 using namespace std;
 /** Benchmark (A): times an inner-product implementation and prints the measurements to cout.
  *  @param[in] NLOOPS           number of timed repetitions
  *  @param[in] N                vector length
  *  @param[in] scalar_function  inner-product implementation under test
  *  @return {seconds per call, GFLOPS, GiByte/s}
  */
 vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function);
 /** Benchmark (B): times a dense matrix-vector product and prints the measurements to cout.
  *  @param[in] NLOOPS           number of timed repetitions
  *  @param[in] N                number of matrix columns (length of the input vector)
  *  @param[in] M                number of matrix rows (length of the result vector)
  *  @param[in] MatVec_function  matrix-vector implementation under test
  *  @return {seconds per call, GFLOPS, GiByte/s}
  */
 vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function);
 /** Benchmark (C): times a dense matrix-matrix product and prints the measurements to cout.
  *  @param[in] NLOOPS           number of timed repetitions
  *  @param[in] L                shared dimension (columns of A, rows of B)
  *  @param[in] M                number of rows of A
  *  @param[in] N                number of columns of B
  *  @param[in] MatMat_function  matrix-matrix implementation under test; called as f(A, B, L)
  *  @return {seconds per call, GFLOPS, GiByte/s}
  */
 vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function);
 /** Benchmark (D): times poly() for a polynomial of degree @p p and prints the measurements to cout.
  *  @param[in] NLOOPS  number of timed repetitions
  *  @param[in] N       number of evaluation points
  *  @param[in] p       polynomial degree (p + 1 coefficients)
  *  @return {seconds per call, GFLOPS, GiByte/s}
  */
 vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/ex3_benchmarks/benchmarks.cpp
+++ b/ex3_benchmarks/benchmarks.cpp
@ -0,0 +1,246 @@
 #include "benchmarks.h"
 #include "vdop.h"
 #include <iostream>
 #include <vector>
 #include <cmath>
 #include <cassert>       // assert()
 #ifdef __INTEL_CLANG_COMPILER
 #pragma message(" ##########  Use of MKL  ###############")
 #include <mkl.h>
 #else
 #pragma message(" ##########  Use of CBLAS  ###############")
 #include <cblas.h>               // cBLAS Library
 #include <lapacke.h>             // Lapack
 #endif
 // (A) Inner product of two vectors (from skalar_stl)
 // Adds up the products x[i]*y[i] from front to back; equal lengths are asserted.
 double scalar(vector<double> const &x, vector<double> const &y)
 {
     assert(x.size() == y.size());
     double acc = 0.0;
     auto y_it = y.begin();           // walks y in lock-step with the range-for over x
     for (double const x_i : x)
     {
         acc += x_i * (*y_it);
         ++y_it;
     }
     return acc;
 }
 // (A) 5.(b) Kahan scalar product
 // Inner product <x,y> with compensated (Kahan) summation: the rounding error of each
 // addition is captured in `c` and fed back into the next term.
 // Both vectors must have the same length (asserted, consistent with scalar()).
 double Kahan_skalar(vector<double> const &x, vector<double> const &y)
 {
     assert(x.size() == y.size());    // otherwise y would be read out of bounds
     double sum = 0.0;
     double c = 0.0;                  // running compensation for lost low-order bits
     size_t const n = x.size();
     for (size_t i = 0; i < n; ++i)
     {
         double const z = x[i]*y[i] - c;    // c is the part that got lost in the last iteration
         double const t = sum + z;          // when adding sum + z, the lower digits are lost if sum is large
         c = (t - sum) - z;                 // now we recover the lower digits to add in the next iteration
         sum = t;
     }
     return sum;
 }
 // (A) 6. cBLAS scalar product
 // Delegates <x,y> to cblas_ddot with unit stride on both vectors.
 // Both vectors must have the same length (asserted, consistent with scalar()):
 // cblas_ddot reads x.size() entries from y as well.
 double scalar_cBLAS(vector<double> const &x, vector<double> const &y)
 {
     assert(x.size() == y.size());
     return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0]
 }
 // (B) Matrix-vector product (from intro_vector_densematrix)
 // A is stored row-major in one vector; the column count is x.size() and the row
 // count follows from A.size(). Returns b = A*x with one entry per row.
 vector<double> MatVec(vector<double> const &A, vector<double> const &x)
 {
     size_t const ncols = x.size();
     size_t const total = A.size();
     assert(total % ncols == 0); // make sure multiplication is possible
     size_t const nrows = total/ncols;
     vector<double> result(nrows);
     for (size_t row = 0; row < nrows; ++row)
     {
         double const *const a_row = A.data() + ncols*row;   // start of the current row
         double dot = 0.0;
         for (size_t col = 0; col < ncols; ++col)
         {
             dot += a_row[col] * x[col];
         }
         result[row] = dot;
     }
     return result;
 }
 // (B) cBLAS Matrix-vector product
 // A is stored row-major with x.size() columns; the product b = A*x is delegated
 // to cblas_dgemv (no transpose, unit strides).
 vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x)
 {
     size_t const ncols = x.size();
     size_t const total = A.size();
     assert(total % ncols == 0); // make sure multiplication is possible
     size_t const nrows = total/ncols;
     double const alpha = 1.0;   // dgemv computes  b := alpha*A*x + beta*b
     double const beta  = 0.0;
     vector<double> result(nrows);
     cblas_dgemv(CblasRowMajor, CblasNoTrans, nrows, ncols, alpha, A.data(), ncols, x.data(), 1, beta, result.data(), 1);
     return result;
 }
 // (C) Matrix-matrix product
 // A (M x L) and B (L x N) are stored row-major in one vector each; M and N follow
 // from the vector sizes and the shared dimension L. Returns C = A*B (M x N, row-major).
 vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &L)
 {
     size_t const size_A = A.size();
     size_t const size_B = B.size();
     assert(size_A % L == 0 && size_B % L == 0);
     size_t const nrows = size_A/L;
     size_t const ncols = size_B/L;
     vector<double> product(nrows*ncols);
     for (size_t row = 0; row < nrows; ++row)
     {
         double const *const a_row = A.data() + L*row;   // start of row `row` of A
         for (size_t col = 0; col < ncols; ++col)
         {
             double dot = 0;
             for (size_t k = 0; k < L; ++k)
             {
                 dot += a_row[k]*B[ncols*k + col];
             }
             product[ncols*row + col] = dot;
         }
     }
     return product;
 }
 // (C) cBLAS matrix-matrix product
 // A (M x L) and B (L x N) are stored row-major; the product C = A*B is delegated
 // to cblas_dgemm (no transposes).
 vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &L)
 {
     size_t const size_A = A.size();
     size_t const size_B = B.size();
     assert(size_A % L == 0 && size_B % L == 0);
     size_t const nrows = size_A/L;
     size_t const ncols = size_B/L;
     double const alpha = 1.0;   // dgemm computes  C := alpha*A*B + beta*C
     double const beta  = 0.0;
     vector<double> product(nrows*ncols);
     cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nrows, ncols, L, alpha, A.data(), L, B.data(), ncols, beta, product.data(), ncols);
     return product;
 }
 // (D) Evaluation of a polynomial function (Horner's scheme)
 // Computes y[i] = a[0] + a[1]*x[i] + ... + a[p]*x[i]^p  with p = a.size() - 1
 // for every entry of x. An empty coefficient vector yields y[i] = 0.
 vector<double> poly(vector<double> const &a, vector<double> const &x)
 {
     size_t const N = x.size();
     vector<double> y(N, 0);
     for (size_t i = 0; i < N; ++i)
     {
         double const x_i = x[i];
         double acc = 0;
         // Horner step, highest coefficient first:  acc = acc*x + a[k].
         // (The former "+=" added the old value of acc a second time and therefore
         //  evaluated the polynomial at 1 + x instead of x.)
         for (auto coeff = a.rbegin(); coeff != a.rend(); ++coeff)
         {
             acc = x_i*acc + *coeff;
         }
         y[i] = acc;
     }
     return y;
 }
 // (E) Solves linear system of equations
 // Damped Jacobi iteration for K*u = f starting from u = 0. Iterates until
 // <w,r> has dropped below tol^2 times its initial value or maxiter sweeps are done,
 // then prints the average convergence rate and the final error to cout.
 //   SK  system matrix
 //   f   right hand side (size must equal SK.Nrows())
 //   u   solution vector (same size as f); overwritten
 void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
 {
     const double omega   = 1.0;
     const int    maxiter = 1000;
     const double tol  = 1e-5,                // tolerance
                  tol2 = tol * tol;           // tolerance^2
     int nrows = SK.Nrows();                  // number of rows == number of columns
     assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
     cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
     //  Choose initial guess
     for (int k = 0; k < nrows; ++k)
     {
         u[k] = 0.0;                          //  u := 0
     }
     vector<double> dd(nrows);                // matrix diagonal
     vector<double>  r(nrows);                // residual
     vector<double>  w(nrows);                // correction
     SK.GetDiag(dd);                          //  dd := diag(K)
     //  Initial sweep
     SK.Defect(r, f, u);                      //  r := f - K*u
     vddiv(w, r, dd);                         //  w := D^{-1}*r
     double sigma0 = dscapr(w, r);            // s0 := <w,r>
     // Iteration sweeps
     int iter  = 0;
     double sigma = sigma0;
     while ( sigma > tol2 * sigma0 && maxiter > iter)
     {
         ++iter;
         vdaxpy(u, u, omega, w );             //  u := u + om*w
         SK.Defect(r, f, u);                  //  r := f - K*u
         vddiv(w, r, dd);                     //  w := D^{-1}*r
         sigma = dscapr(w, r);                //  s := <w,r>
     }
     // Reporting. If the initial defect is already zero (e.g. f == 0) no sweep is done;
     // dividing by sigma0 == 0 or by iter == 0 would print NaN, so report 0 instead.
     const double rel_err = (sigma0 != 0.0) ? sqrt(sigma / sigma0) : 0.0;
     const double rate    = (iter > 0) ? exp(log(rel_err) / iter) : 0.0;
     cout << "aver. Jacobi rate :  " << rate << "  (" << iter << " iter)" << endl;
     cout << "final error: " << rel_err << " (rel)   " << sqrt(sigma) << " (abs)\n";
     return;
 }
--- a/ex3_benchmarks/benchmarks.h
+++ b/ex3_benchmarks/benchmarks.h
@ -0,0 +1,89 @@
 #pragma once
 #include "getmatrix.h"
 #include <vector>
 using namespace std;
 /**     (A) Inner product of two vectors (from skalar_stl)
  *  @param[in] x	vector
  *  @param[in] y	vector of the same length as @p x (checked with assert)
  *  @return 	    resulting Euclidean inner product <x,y>
  */
 double scalar(vector<double> const &x, vector<double> const &y);
 /**     (A) 5.(b) Inner product of two vectors using Kahan (compensated) summation
  *  @param[in] x	vector
  *  @param[in] y	vector with at least as many entries as @p x
  *  @return 	    resulting Euclidean inner product <x,y>
  */
 double Kahan_skalar(vector<double> const &x,  vector<double> const &y);
 /** 	(A) 6. cBLAS scalar product of two vectors (wrapper around cblas_ddot)
  *  @param[in] x	vector
  *  @param[in] y	vector with at least as many entries as @p x
  *  @return 	    resulting Euclidean inner product <x,y>
  */
 double scalar_cBLAS(vector<double> const &x, vector<double> const &y);
 /** 	(B) Matrix-vector product (from intro_vector_densematrix)
  *  @param[in] A	dense M x N matrix, stored row-major in one vector (1D access);
  *              A.size() must be a multiple of x.size() (checked with assert)
  *  @param[in] x	vector of length N
  *
  *  @return    resulting vector A*x of length M
  */
 vector<double> MatVec(vector<double> const &A, vector<double> const &x);
 /** 	(B) 6. cBLAS Matrix-vector product (wrapper around cblas_dgemv)
  *  @param[in] A	dense M x N matrix, stored row-major in one vector (1D access);
  *              A.size() must be a multiple of x.size() (checked with assert)
  *  @param[in] x	vector of length N
  *
  *  @return    resulting vector A*x of length M
  */
 vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x);
 /** 	(C) Matrix-matrix product
  *  @param[in] A			MxL dense matrix, row-major (1D access)
  *  @param[in] B			LxN dense matrix, row-major (1D access)
  *  @param[in] shared_dim 	shared dimension L; A.size() and B.size() must be
  *                          multiples of it (checked with assert)
  *
  *  @return    resulting MxN matrix A*B, row-major
  */
 vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
 /** 	(C) 6. cBLAS Matrix-matrix product (wrapper around cblas_dgemm)
  *  @param[in] A			MxL dense matrix, row-major (1D access)
  *  @param[in] B			LxN dense matrix, row-major (1D access)
  *  @param[in] shared_dim 	shared dimension L; A.size() and B.size() must be
  *                          multiples of it (checked with assert)
  *
  *  @return    resulting MxN matrix A*B, row-major
  */
 vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
 /** 	(D) Evaluation of a polynomial function using Horner's scheme
  *  @param[in] a	coefficient vector; a[k] is the coefficient of x^k,
  *              so the degree is a.size() - 1
  *  @param[in] x	vector with input values
  *
  *  @return    vector with output values, one per entry of @p x
  */
 vector<double> poly(vector<double> const &a, vector<double> const &x);
 /**     (E) Solves linear system of equations  K @p u = @p f  via the Jacobi iteration (from jacobi_oo_stl)
  * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
  * solution is set to 0.
  * The iteration uses damping omega = 1, a relative tolerance of 1e-5 and at most
  * 1000 sweeps; the convergence rate and the final error are printed to cout.
  * @param[in] SK	CSR matrix
  * @param[in] f		distributed local vector storing the right hand side;
  *                  its size must equal SK.Nrows() (checked with assert)
  * @param[out] u	accumulated local vector storing the solution; must have the
  *                  same size as @p f (checked with assert)
  */
 void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u);